/**
 * Sanitizes personally identifiable information (PII) from member data for AI embeddings.
 *
 * SIMPLE API: data in, options in, sanitized data out
 *
 * Strategy:
 * - Names, title/greeting, metadata, cities, postcodes, countries → plain text (for AI search)
 * - Emails, phones, IBANs, street addresses → replaced by `[TYPE_<8 hex chars>]` placeholders
 * - Full HMAC-SHA256 hashes of emails, phones and the first IBAN are returned separately
 *   so the caller can use them for exact matching
 *
 * Missing or non-object `data`, empty entries inside the email/phone/address/accounts
 * arrays, and accounts for which `decryptAccount` yields nothing are skipped instead of
 * throwing.
 *
 * @param {Object} data - Member/person data object
 * @param {Object} [options] - Sanitization options
 * @param {boolean} [options.keepNames=true] - Keep firstName, lastName and nickname in plain text
 * @param {boolean} [options.includeMetadata=true] - Include title, greeting, gender, hierarchie, stage
 * @param {string} [options.secret] - HMAC secret (default: process.env.ibanPassphrase,
 *   falling back to the literal 'default-secret' when that is unset)
 * @returns {{embeddingText: string, searchHashes: {emails: string[], phones: string[], iban: string|null}}} Sanitized embedding data with text and hashes
 *
 * @example
 * const result = sanitizePII(personData);
 * // {
 * //   embeddingText: "Wolfgang Lohwasser Wollo gender:M hierarchie:3 Gailingen 78262 de [EMAIL_abc12345] [PHONE_def45678]",
 * //   searchHashes: { emails: ["full64charhash..."], phones: [...], iban: "..." }
 * // }
 */
export default function sanitizePII(data, options = {}) {
  const {
    keepNames = true,
    includeMetadata = true,
    // NOTE(review): the 'default-secret' fallback means hashes are keyed with a publicly
    // known value whenever the environment variable is missing — confirm this is intended.
    secret = process.env.ibanPassphrase || 'default-secret'
  } = options || {};

  const result = {
    embeddingText: '',
    searchHashes: { emails: [], phones: [], iban: null }
  };
  if (!data || typeof data !== 'object') return result;

  // Keyed hash of the type-normalized value, hex encoded (64 chars).
  const hmacHex = (value, type) =>
    crypto.createHmac('sha256', secret)
      .update(normalizeValue(value, type))
      .digest('hex');
  // Short form used inside the embedding text.
  const shortHash = (value, type) => hmacHex(value, type).substring(0, 8);
  // Treat anything that is not an array as "no entries" and drop empty slots.
  const listOf = (value) => (Array.isArray(value) ? value.filter(Boolean) : []);

  const addresses = listOf(data.address);
  const parts = [];

  // 1. Names in plain text (for AI search)
  if (keepNames) {
    if (data.firstName) parts.push(data.firstName);
    if (data.lastName) parts.push(data.lastName);
    if (data.nickname) parts.push(data.nickname);
  }

  // 2. Metadata in plain text
  if (includeMetadata) {
    if (data.title) parts.push(data.title);
    if (data.greeting) parts.push(data.greeting);
    if (data.gender) parts.push(`gender:${data.gender}`);
    if (data.hierarchie) parts.push(`hierarchie:${data.hierarchie}`);
    if (data.stage) parts.push(`stage:${data.stage}`);
  }

  // 3. Cities, postcodes, country codes in plain text (for AI search)
  addresses.forEach((addr) => {
    if (addr.city) parts.push(addr.city);
    if (addr.postcode) parts.push(addr.postcode);
    if (addr.countryCode) parts.push(addr.countryCode);
  });

  // 4. Hash PII - emails (the plain addresses are never collected)
  listOf(data.email).forEach((entry) => {
    if (!entry.email) return;
    parts.push(`[EMAIL_${shortHash(entry.email, 'email')}]`);
    result.searchHashes.emails.push(hmacHex(entry.email, 'email'));
  });

  // 5. Hash PII - phones
  listOf(data.phone).forEach((entry) => {
    if (!entry.number) return;
    parts.push(`[PHONE_${shortHash(entry.number, 'phone')}]`);
    result.searchHashes.phones.push(hmacHex(entry.number, 'phone'));
  });

  // 6. Hash PII - IBANs (every IBAN gets a placeholder, only the first a full hash)
  listOf(data.accounts).forEach((account) => {
    const decrypted = decryptAccount(account);
    if (!decrypted || !decrypted.IBAN) return;
    parts.push(`[IBAN_${shortHash(decrypted.IBAN, 'iban')}]`);
    if (!result.searchHashes.iban) {
      result.searchHashes.iban = hmacHex(decrypted.IBAN, 'iban');
    }
  });

  // 7. Hash PII - street addresses (only when both road and house number are present)
  addresses.forEach((addr) => {
    if (addr.road && addr.houseNumber) {
      const fullAddress = `${addr.road} ${addr.houseNumber}`;
      parts.push(`[ADDRESS_${shortHash(fullAddress, 'address')}]`);
    }
  });

  result.embeddingText = parts.join(' ').trim();
  return result;
}
/**
 * Regex patterns for detecting sensitive information in strings.
 *
 * All patterns carry the global flag, so `.test()`/`.exec()` on them is stateful
 * (`lastIndex`); `String.prototype.replace`/`match` reset it on every call.
 */
export const SENSITIVE_PATTERNS = {
  // Email: simplified pattern. The TLD class is letters only — a `|` inside a
  // character class is a literal pipe, not an alternation.
  email: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/gi,
  // Phone: international formats (+XX, 00XX, etc.) and common formats
  // Matches: +49123456789, 0049123456789, (123) 456-7890, 123-456-7890, etc.
  // Deliberately broad: it also matches other digit groups, so callers must filter.
  phone: /(\+\d{1,3}[-\.\s]?)?(\(?\d{1,4}\)?[-\.\s]?)?(\d{1,4}[-\.\s]?){1,3}\d{1,4}\b/g,
  // IBAN: 2 letter country code + 2 check digits + 10 to 30 alphanumerics/whitespace
  // Matches: DE89370400440532013000, GB82 WEST 1234 5698 7654 32, etc.
  // NOTE(review): because whitespace and (case-insensitive) letters are allowed in the
  // tail, a match can run on into the words following an IBAN — confirm acceptable.
  iban: /\b[A-Z]{2}[0-9]{2}[A-Z0-9\s]{10,30}\b/gi,
  // Street address patterns (simplified - number first, then name, then street suffix)
  // Matches: "123 Main St", "45 Baker Street", "23 Hauptstraße", etc.
  // Does not match the number-last form ("Hauptstraße 23").
  address: /\b\d{1,5}\s+([A-Z][a-z]+\s*){1,4}(Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Lane|Ln|Boulevard|Blvd|Straße|straße|str|Weg|Platz)\b/gi,
  // Credit card numbers: four groups of four digits with optional spaces/dashes
  creditCard: /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/g,
  // Social security / tax IDs: 3-2-4 digit grouping with optional spaces/dashes
  ssn: /\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b/g
};
/**
 * Normalize sensitive values before hashing for consistency.
 * Same value in different formats produces same hash.
 *
 * Falsy input yields an empty string. Any other input is converted with `String()`
 * first, so numeric values (e.g. a phone number stored as a number) do not throw.
 *
 * NOTE: the phone branch drops every character that is not a digit or a leading `+`.
 * Phone hashes computed earlier from numbers containing other separators
 * (e.g. `/`) will differ and may need to be re-indexed.
 *
 * @param {string} value - Raw value to normalize
 * @param {string} type - Type of value (email, phone, iban, card, ssn, address, ...)
 * @returns {string} - Normalized value
 */
function normalizeValue(value, type) {
  if (!value) return '';
  const text = String(value);
  switch (type) {
    case 'email':
      // Email: lowercase, trim whitespace
      return text.toLowerCase().trim();
    case 'phone': {
      // Phone: remove all non-digit characters except a leading +
      // +49 123 456 789 → +49123456789
      // (123) 456-7890  → 1234567890
      // 0049 123 456    → 0049123456
      // 07731/12345     → 0773112345
      const digitsAndPlus = text.replace(/[^\d+]/g, '');
      // Drop any '+' that is not the very first character.
      return digitsAndPlus.replace(/(?!^)\+/g, '');
    }
    case 'iban':
      // IBAN: uppercase, remove all whitespace
      // DE89 3704 0044 0532 0130 00 → DE89370400440532013000
      return text.replace(/\s+/g, '').toUpperCase();
    case 'card':
      // Credit card: remove all spaces and dashes
      // 4532-1234-5678-9010 → 4532123456789010
      return text.replace(/[\s\-]/g, '');
    case 'ssn':
      // SSN/Tax ID: remove spaces and dashes
      // 123-45-6789 → 123456789
      return text.replace(/[\s\-]/g, '');
    case 'address':
      // Address: lowercase, collapse whitespace runs to one space, trim
      // "123  Main Street" → "123 main street"
      return text.toLowerCase().replace(/\s+/g, ' ').trim();
    default:
      // Default: lowercase, remove all whitespace
      return text.toLowerCase().replace(/\s+/g, '');
  }
}
/**
 * Create deterministic hashed placeholder for sensitive string data.
 * Same input always produces same placeholder (preserves uniqueness in embeddings).
 *
 * Without `secret` the digest is a plain (unkeyed) SHA-256, exactly as before.
 * When `secret` is given, HMAC-SHA256 keyed with it is used instead, which is the
 * same construction `sanitizePII` uses for its placeholders.
 *
 * NOTE(review): an unkeyed, truncated hash of low-entropy values (phone numbers,
 * SSNs) can be recovered by enumeration — prefer passing a secret.
 *
 * @param {string} value - Value to hash
 * @param {string} type - Type of value (email, phone, etc.); upper-cased into the placeholder
 * @param {string} [secret] - Optional HMAC key; omit for the unkeyed SHA-256 digest
 * @returns {string} - Placeholder of the form `[TYPE_<first 8 hex chars of digest>]`
 */
export function hashSensitiveValue(value, type, secret) {
  // Normalize value based on type so formatting differences do not change the hash
  const normalized = normalizeValue(value, type);
  // Keyed when a secret is supplied, plain SHA-256 otherwise (backward compatible)
  const hasher = secret
    ? crypto.createHmac('sha256', secret)
    : crypto.createHash('sha256');
  // Use first 8 hex chars for readability
  const digest = hasher.update(normalized).digest('hex').substring(0, 8);
  return `[${type.toUpperCase()}_${digest}]`;
}
/**
 * Sanitize text by replacing sensitive information with hashed placeholders.
 * Preserves semantic structure while protecting PII.
 *
 * Patterns are applied one after another (credit card, IBAN, SSN, email, phone,
 * address). Placeholders of the form `[TYPE_xxxxxxxx]` that are already present in
 * the text are left untouched by later passes, so a digit-only hash inside a
 * placeholder is never re-matched (e.g. by the phone pattern).
 *
 * Non-string or empty input is returned unchanged.
 *
 * @param {string} text - Text to sanitize
 * @param {Object} options - Sanitization options
 * @param {boolean} [options.sanitizeEmail=true] - Hash email addresses
 * @param {boolean} [options.sanitizePhone=true] - Hash phone numbers
 * @param {boolean} [options.sanitizeIban=true] - Hash IBAN numbers
 * @param {boolean} [options.sanitizeAddress=true] - Hash street addresses
 * @param {boolean} [options.sanitizeCreditCard=true] - Hash credit card numbers
 * @param {boolean} [options.sanitizeSSN=true] - Hash SSN/tax IDs
 * @returns {string} - Sanitized text with hashed placeholders
 */
export function sanitizeTextForEmbedding(text, options = {}) {
  if (!text || typeof text !== 'string') return text;
  const {
    sanitizeEmail = true,
    sanitizePhone = true,
    sanitizeIban = true,
    sanitizeAddress = true,
    sanitizeCreditCard = true,
    sanitizeSSN = true
  } = options || {};

  // Shape of the placeholders produced by hashSensitiveValue. The capture group makes
  // String.prototype.split keep the placeholders in the result (at odd indices).
  const placeholder = /(\[[A-Z]+_[0-9a-f]{8}\])/;
  // Apply `pattern` only to the text between placeholders.
  const replaceOutsidePlaceholders = (input, pattern, replacer) =>
    input
      .split(placeholder)
      .map((segment, index) => (index % 2 === 1 ? segment : segment.replace(pattern, replacer)))
      .join('');

  // Day-first/month-first dates (12.05.2023) and year-first dates (2023-05-12).
  const looksLikeDate = (candidate) =>
    /^\d{1,2}[-./]\d{1,2}[-./]\d{2,4}$/.test(candidate) ||
    /^\d{4}[-./]\d{1,2}[-./]\d{1,2}$/.test(candidate);

  let sanitized = text;
  // Order matters: start with more specific patterns first
  if (sanitizeCreditCard) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.creditCard, (match) =>
      hashSensitiveValue(match, 'card')
    );
  }
  if (sanitizeIban) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.iban, (match) =>
      hashSensitiveValue(match, 'iban')
    );
  }
  if (sanitizeSSN) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.ssn, (match) =>
      hashSensitiveValue(match, 'ssn')
    );
  }
  if (sanitizeEmail) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.email, (match) =>
      hashSensitiveValue(match, 'email')
    );
  }
  if (sanitizePhone) {
    // Phone detection is tricky - the pattern is broad, so only replace matches that
    // plausibly are phone numbers.
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.phone, (match) => {
      const candidate = match.trim();
      // Fewer than 7 digits: too short for a phone number (also covers bare years)
      if (candidate.replace(/\D/g, '').length < 7) return match;
      if (looksLikeDate(candidate)) return match;
      return hashSensitiveValue(match, 'phone');
    });
  }
  if (sanitizeAddress) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.address, (match) =>
      hashSensitiveValue(match, 'address')
    );
  }
  return sanitized;
}
/**
 * Debug helper: runs a few sample strings through sanitizeTextForEmbedding and
 * prints each original next to its sanitized form on the console.
 */
export function testSanitization() {
  const sampleTexts = [
    'Contact me at john.doe@example.com or call +49 123 456789',
    'My IBAN is DE89 3704 0044 0532 0130 00',
    'Send payment to 123 Main Street, or card 4532-1234-5678-9010',
    'SSN: 123-45-6789, Phone: (555) 123-4567'
  ];

  console.log('Sanitization test:');
  for (const original of sampleTexts) {
    const cleaned = sanitizeTextForEmbedding(original);
    console.log('\nOriginal:', original);
    console.log('Sanitized:', cleaned);
  }
}
import crypto from 'crypto';
import { decryptAccount } from './crypto.js';