Source: utils/santitizePII.js

/**
 * Builds a privacy-preserving embedding string plus lookup hashes from a member record.
 *
 * SIMPLE API: data in, options in, sanitized data out
 *
 * Strategy:
 * - Names, title/greeting, gender/hierarchie/stage, cities, postcodes and country
 *   codes are kept as plain text (for AI search).
 * - Emails, phones, IBANs and street addresses (road + house number) are replaced
 *   by placeholders such as `[EMAIL_ab12cd34]`, carrying the first 8 hex characters
 *   of an HMAC-SHA256 over the normalized value.
 * - The full 64-character HMACs of emails, phones and the first IBAN are returned
 *   in `searchHashes` for exact matching.
 *
 * Order of the embedding text: names, metadata, city/postcode/country per address,
 * email placeholders, phone placeholders, IBAN placeholders, address placeholders.
 *
 * @param {Object} data - Member/person data object; a missing value yields an empty result
 * @param {Object} [options] - Sanitization options
 * @param {boolean} [options.keepNames=true] - Keep firstName, lastName, nickname in plain text
 * @param {boolean} [options.includeMetadata=true] - Include title, greeting, gender, hierarchie, stage
 * @param {string} [options.secret] - HMAC key (default: process.env.ibanPassphrase)
 * @returns {{embeddingText: string, searchHashes: {emails: string[], phones: string[], iban: string|null}}} Sanitized embedding data with text and hashes
 *
 * @example
 * const result = sanitizePII(personData);
 * // {
 * //   embeddingText: "Wolfgang Lohwasser Wollo gender:M hierarchie:3 Gailingen 78262 de [EMAIL_abc12345] [PHONE_def45678]",
 * //   searchHashes: { emails: ["full64charhash..."], phones: [...], iban: "..." }
 * // }
 */
export default function sanitizePII(data, options = {}) {
    const {
        keepNames = true,
        includeMetadata = true,
        // NOTE(review): the literal fallback is a publicly known key, so hashes
        // produced with it offer no more protection than an unkeyed hash.
        // Make sure a real secret is configured wherever this runs.
        secret = process.env.ibanPassphrase || 'default-secret'
    } = options || {};

    // Tolerate a missing record instead of throwing on the first property access.
    const person = data || {};

    // Returns the usable entries of a list-valued field (non-arrays and empty slots are ignored).
    const entriesOf = (value) => (Array.isArray(value) ? value.filter(Boolean) : []);

    // Full keyed hash of a value after type-aware normalization.
    const fullHash = (value, type) => {
        if (!value) return null;
        return crypto.createHmac('sha256', secret)
            .update(normalizeValue(value, type))
            .digest('hex');
    };

    // Placeholder for the embedding text: type label plus the first 8 hex chars of the hash.
    const placeholder = (label, digest) => `[${label}_${digest.substring(0, 8)}]`;

    const parts = [];
    const addresses = entriesOf(person.address);

    // 1. Names in plain text (for AI search)
    if (keepNames) {
        if (person.firstName) parts.push(person.firstName);
        if (person.lastName) parts.push(person.lastName);
        if (person.nickname) parts.push(person.nickname);
    }

    // 2. Metadata in plain text
    if (includeMetadata) {
        if (person.title) parts.push(person.title);
        if (person.greeting) parts.push(person.greeting);
        if (person.gender) parts.push(`gender:${person.gender}`);
        if (person.hierarchie) parts.push(`hierarchie:${person.hierarchie}`);
        if (person.stage) parts.push(`stage:${person.stage}`);
    }

    // 3. Cities, postcodes, country codes in plain text (for AI search)
    addresses.forEach(addr => {
        if (addr.city) parts.push(addr.city);
        if (addr.postcode) parts.push(addr.postcode);
        if (addr.countryCode) parts.push(addr.countryCode);
    });

    // 4. Emails: hashed once, used for both the placeholder and the lookup hash.
    //    The raw address is never stored anywhere in the result.
    const emailHashes = [];
    entriesOf(person.email).forEach(entry => {
        if (!entry.email) return;
        const digest = fullHash(entry.email, 'email');
        parts.push(placeholder('EMAIL', digest));
        emailHashes.push(digest);
    });

    // 5. Phone numbers
    const phoneHashes = [];
    entriesOf(person.phone).forEach(entry => {
        if (!entry.number) return;
        const digest = fullHash(entry.number, 'phone');
        parts.push(placeholder('PHONE', digest));
        phoneHashes.push(digest);
    });

    // 6. IBANs: every account gets a placeholder, only the first IBAN gets a lookup hash.
    let ibanHash = null;
    entriesOf(person.accounts).forEach(account => {
        const decrypted = decryptAccount(account);
        // Skip accounts that could not be decrypted or carry no IBAN.
        if (!decrypted || !decrypted.IBAN) return;
        const digest = fullHash(decrypted.IBAN, 'iban');
        parts.push(placeholder('IBAN', digest));
        if (!ibanHash) {
            ibanHash = digest;
        }
    });

    // 7. Street addresses (road + house number); placeholder only, no lookup hash.
    addresses.forEach(addr => {
        if (addr.road && addr.houseNumber) {
            const digest = fullHash(`${addr.road} ${addr.houseNumber}`, 'address');
            parts.push(placeholder('ADDRESS', digest));
        }
    });

    return {
        embeddingText: parts.join(' ').trim(),
        searchHashes: {
            emails: emailHashes,
            phones: phoneHashes,
            iban: ibanHash
        }
    };
}
/**
 * Regex patterns for detecting sensitive information in strings.
 *
 * Every pattern carries the `g` flag. That is what `String.prototype.replace`
 * and `match` need to process all occurrences, but it also makes `.test()` and
 * `.exec()` stateful through `lastIndex` — reset it or build a fresh RegExp
 * before using a pattern that way.
 */
export const SENSITIVE_PATTERNS = {
  // Email: simplified RFC 5322 shape (local part, "@", domain, alphabetic TLD of 2+ letters).
  // The TLD class must not contain "|": inside a character class it is a literal pipe.
  email: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/gi,
  
  // Phone: international formats (+XX, 00XX, etc.) and common formats
  // Matches: +49123456789, 0049123456789, (123) 456-7890, 123-456-7890, etc.
  // Also matches short digit runs; callers are expected to filter by digit count.
  phone: /(\+\d{1,3}[-\.\s]?)?(\(?\d{1,4}\)?[-\.\s]?)?(\d{1,4}[-\.\s]?){1,3}\d{1,4}\b/g,
  
  // IBAN: 2 letter country code + 2 check digits + 10 to 30 alphanumeric or whitespace characters
  // Matches: DE89370400440532013000, GB82 WEST 1234 5698 7654 32, etc.
  // Because whitespace is allowed and matching is case-insensitive, a match may
  // extend over words that directly follow the IBAN.
  iban: /\b[A-Z]{2}[0-9]{2}[A-Z0-9\s]{10,30}\b/gi,
  
  // Street address patterns (simplified - number first, then 1-4 words, then a street suffix)
  // Matches: "123 Main St", "45 Baker Street", "12 Hauptstraße", etc.
  // Street-first order such as "Hauptstraße 23" is NOT matched.
  address: /\b\d{1,5}\s+([A-Z][a-z]+\s*){1,4}(Street|St|Avenue|Ave|Road|Rd|Drive|Dr|Lane|Ln|Boulevard|Blvd|Straße|straße|str|Weg|Platz)\b/gi,
  
  // Credit card numbers: four groups of four digits with optional spaces/dashes
  creditCard: /\b\d{4}[\s-]?\d{4}[\s-]?\d{4}[\s-]?\d{4}\b/g,
  
  // Social security / tax IDs: 3-2-4 digit groups with optional spaces/dashes
  ssn: /\b\d{3}[-\s]?\d{2}[-\s]?\d{4}\b/g
};

/**
 * Normalize sensitive values before hashing for consistency, so that the same
 * value written in different formats produces the same hash.
 *
 * Falsy input (`''`, `null`, `undefined`, `0`) normalizes to the empty string.
 * Numbers and bigints are converted to their decimal string form first, so a
 * phone number stored as a number can still be hashed.
 *
 * @param {string|number|bigint} value - Raw value to normalize
 * @param {string} type - Type of value (email, phone, iban, card, ssn, address); anything else uses the default rule
 * @returns {string} - Normalized value
 * @throws {TypeError} If a truthy value is neither a string, a number nor a bigint
 */
function normalizeValue(value, type) {
  if (!value) return '';

  let text;
  if (typeof value === 'string') {
    text = value;
  } else if (typeof value === 'number' || typeof value === 'bigint') {
    text = String(value);
  } else {
    // Objects, arrays, booleans etc. have no meaningful normal form; fail loudly
    // rather than hashing something like "[object Object]".
    throw new TypeError(`normalizeValue: expected a string or number, got ${typeof value}`);
  }

  switch (type) {
    case 'email':
      // Email: lowercase, trim surrounding whitespace
      return text.toLowerCase().trim();

    case 'phone':
      // Phone: strip whitespace, dashes, parentheses and dots; everything else
      // (digits, a leading +, any other character) is kept, then lowercased.
      // +49 123 456 789 → +49123456789
      // (123) 456-7890 → 1234567890
      // 0049 123 456 → 0049123456
      return text.replace(/[\s\-\(\)\.]/g, '').toLowerCase();

    case 'iban':
      // IBAN: uppercase, remove all whitespace
      // DE89 3704 0044 0532 0130 00 → DE89370400440532013000
      return text.replace(/\s+/g, '').toUpperCase();

    case 'card':
      // Credit card: remove all whitespace and dashes
      // 4532-1234-5678-9010 → 4532123456789010
      return text.replace(/[\s\-]/g, '');

    case 'ssn':
      // SSN/Tax ID: remove whitespace and dashes
      // 123-45-6789 → 123456789
      return text.replace(/[\s\-]/g, '');

    case 'address':
      // Address: lowercase, collapse whitespace runs to one space, trim
      // "123  Main   Street" → "123 main street"
      return text.toLowerCase().replace(/\s+/g, ' ').trim();

    default:
      // Default: lowercase and remove ALL whitespace
      return text.toLowerCase().replace(/\s+/g, '');
  }
}

/**
 * Turn a sensitive string into a deterministic placeholder token.
 *
 * The value is normalized according to its type, digested with plain
 * (unkeyed) SHA-256, and the first 8 hex characters of the digest are embedded
 * in a token of the form `[TYPE_xxxxxxxx]`. Equal normalized inputs therefore
 * always map to the same token, which preserves uniqueness in embeddings.
 *
 * @param {string} value - Value to hash
 * @param {string} type - Type of value (email, phone, etc.); upper-cased for the token label
 * @returns {string} - Hashed placeholder
 */
export function hashSensitiveValue(value, type) {
  const canonical = normalizeValue(value, type);

  const hasher = crypto.createHash('sha256');
  hasher.update(canonical);
  const hexDigest = hasher.digest('hex');

  // Only a short prefix is kept so the token stays readable.
  const shortDigest = hexDigest.slice(0, 8);
  const label = type.toUpperCase();

  return `[${label}_${shortDigest}]`;
}

/**
 * Sanitize text by replacing sensitive information with hashed placeholders.
 * Preserves semantic structure while protecting PII.
 *
 * Passes run in this order: IBAN, credit card, SSN, email, phone, address.
 * IBANs go first because a space-separated IBAN contains four consecutive
 * 4-digit groups, which the credit-card pattern would otherwise replace.
 * Each pass skips placeholders (`[TYPE_xxxxxxxx]`) that are already in the
 * text, so a later pattern can never match the digits of an earlier hash and
 * already-sanitized text passes through unchanged.
 *
 * Non-string or empty input is returned as is.
 *
 * @param {string} text - Text to sanitize
 * @param {Object} options - Sanitization options
 * @param {boolean} [options.sanitizeEmail=true] - Hash email addresses
 * @param {boolean} [options.sanitizePhone=true] - Hash phone numbers
 * @param {boolean} [options.sanitizeIban=true] - Hash IBAN numbers
 * @param {boolean} [options.sanitizeAddress=true] - Hash street addresses
 * @param {boolean} [options.sanitizeCreditCard=true] - Hash credit card numbers
 * @param {boolean} [options.sanitizeSSN=true] - Hash SSN/tax IDs
 * @returns {string} - Sanitized text with hashed placeholders
 */
export function sanitizeTextForEmbedding(text, options = {}) {
  if (!text || typeof text !== 'string') return text;
  
  const {
    sanitizeEmail = true,
    sanitizePhone = true,
    sanitizeIban = true,
    sanitizeAddress = true,
    sanitizeCreditCard = true,
    sanitizeSSN = true
  } = options || {};

  // Shape of the tokens produced by hashSensitiveValue, e.g. "[EMAIL_1a2b3c4d]".
  // The capture group makes split() keep the tokens, at the odd indices.
  const placeholderPattern = /(\[[A-Z]+_[0-9a-f]{8}\])/;

  // Applies one replacement pass to everything except existing placeholders.
  const replaceOutsidePlaceholders = (input, pattern, replacer) =>
    input
      .split(placeholderPattern)
      .map((segment, index) => (index % 2 === 1 ? segment : segment.replace(pattern, replacer)))
      .join('');

  let sanitized = text;

  // Order matters: more specific patterns first (see the note on IBANs above).
  if (sanitizeIban) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.iban, (match) =>
      hashSensitiveValue(match, 'iban')
    );
  }

  if (sanitizeCreditCard) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.creditCard, (match) =>
      hashSensitiveValue(match, 'card')
    );
  }
  
  if (sanitizeSSN) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.ssn, (match) =>
      hashSensitiveValue(match, 'ssn')
    );
  }
  
  if (sanitizeEmail) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.email, (match) =>
      hashSensitiveValue(match, 'email')
    );
  }
  
  if (sanitizePhone) {
    // The phone pattern is very permissive, so candidates are filtered here.
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.phone, (match) => {
      // Fewer than 7 digits: too short for a phone number (this also covers
      // bare years and other small numbers).
      if (match.replace(/\D/g, '').length < 7) return match;
      // Day/month/year style dates such as 12.05.2023 or 1-2-23.
      if (/^\d{1,2}[-./]\d{1,2}[-./]\d{2,4}$/.test(match.trim())) return match;
      
      return hashSensitiveValue(match, 'phone');
    });
  }
  
  if (sanitizeAddress) {
    sanitized = replaceOutsidePlaceholders(sanitized, SENSITIVE_PATTERNS.address, (match) =>
      hashSensitiveValue(match, 'address')
    );
  }
  
  return sanitized;
}

/**
 * Debug helper: runs sanitizeTextForEmbedding over a few hard-coded sample
 * sentences and prints each original next to its sanitized form via console.log.
 */
export function testSanitization() {
  const demoInputs = [
    'Contact me at john.doe@example.com or call +49 123 456789',
    'My IBAN is DE89 3704 0044 0532 0130 00',
    'Send payment to 123 Main Street, or card 4532-1234-5678-9010',
    'SSN: 123-45-6789, Phone: (555) 123-4567'
  ];

  console.log('Sanitization test:');

  for (const input of demoInputs) {
    console.log('\nOriginal:', input);
    const cleaned = sanitizeTextForEmbedding(input);
    console.log('Sanitized:', cleaned);
  }
}
import crypto from 'crypto';
import { decryptAccount } from './crypto.js';