/**
 * Unicode Validator - Script purity and normalization enforcement
 */

import type { UnicodeConfig, NormalizationForm } from './config.js';

/** Inclusive `[start, end]` code point range. */
type CodePointRange = readonly [number, number];

/**
 * Unicode script ranges, keyed by script name.
 *
 * Key order is significant: `detectCharacterScript` returns the FIRST script
 * whose ranges contain the code point. `inherited` therefore comes before
 * `arabic` (the Arabic combining marks live inside the Arabic block), and
 * `latin` comes before `common` (the letters live inside Basic Latin /
 * Latin-1, which `common` covers for punctuation and digits).
 */
const SCRIPT_RANGES: Record<string, readonly CodePointRange[]> = {
  inherited: [
    [0x0300, 0x036f], // Combining Diacritical Marks
    [0x064b, 0x065f], // Arabic combining marks
    [0x0670, 0x0670], // Superscript Alef
    [0x06d6, 0x06dc], // Arabic small high marks
    [0x06df, 0x06e4], // Arabic small marks
    [0x06e7, 0x06e8], // Arabic small marks
    [0x06ea, 0x06ed], // Arabic small marks
  ],
  latin: [
    [0x0041, 0x005a], // A-Z
    [0x0061, 0x007a], // a-z
    [0x00aa, 0x00aa], // Feminine ordinal indicator
    [0x00ba, 0x00ba], // Masculine ordinal indicator
    [0x00c0, 0x00d6], // Latin-1 letters (up to, not including, U+00D7 multiplication sign)
    [0x00d8, 0x00f6], // Latin-1 letters (up to, not including, U+00F7 division sign)
    [0x00f8, 0x024f], // Latin-1 letters, Latin Extended-A, Latin Extended-B
  ],
  arabic: [
    [0x0600, 0x06ff], // Arabic
    [0x0750, 0x077f], // Arabic Supplement
    [0x08a0, 0x08ff], // Arabic Extended-A
    [0xfb50, 0xfdff], // Arabic Presentation Forms-A
    [0xfe70, 0xfefc], // Arabic Presentation Forms-B (U+FEFF is the BOM, not Arabic)
  ],
  common: [
    [0x0000, 0x007f], // Basic Latin (punctuation, digits, space; letters match `latin` first)
    [0x00a0, 0x00ff], // Latin-1 Supplement (symbols; letters match `latin` first)
    [0x2000, 0x206f], // General Punctuation
    [0x3000, 0x303f], // CJK Symbols and Punctuation
    [0xfeff, 0xfeff], // Zero width no-break space / byte order mark
  ],
};

/**
 * Kashmiri-specific characters that must be preserved.
 *
 * Despite the name, the set holds both combining marks (U+0654-U+065F) and
 * base letters (U+0620, U+06C6-U+06D3).
 */
const KASHMIRI_DIACRITICS = new Set<string>([
  '\u0654', // Hamza above
  '\u0655', // Hamza below
  '\u0656', // Subscript alef
  '\u0657', // Inverted damma
  '\u0658', // Mark noon ghunna
  '\u0659', // Zwarakay
  '\u065A', // Vowel sign small v above
  '\u065B', // Vowel sign inverted small v above
  '\u065C', // Vowel sign dot below
  '\u065D', // Reversed damma
  '\u065E', // Fatha with two dots
  '\u065F', // Wavy hamza below
  '\u06C6', // Oe
  '\u06C7', // U
  '\u06C8', // Yu
  '\u06C9', // Kirghiz yu
  '\u06CB', // Ve
  '\u06CC', // Farsi yeh
  '\u06CD', // Yeh with tail
  '\u06CE', // Yeh with small v
  '\u06D0', // E
  '\u06D2', // Yeh barree
  '\u06D3', // Yeh barree with hamza
  '\u0620', // Kashmiri yeh
]);

/** Matches any character with Unicode general category Mark (Mn, Mc, Me). */
const COMBINING_MARK = /\p{M}/u;

/** Outcome of validating a single text with `validateText`. */
export interface ValidationResult {
  /** True when no errors were recorded. */
  valid: boolean;
  /** The input after the configured normalization was applied. */
  normalized: string;
  /** Comma-separated detected scripts, or `'unknown'` when none were detected. */
  originalScript: string;
  /** True when more than one script was detected. */
  mixedScripts: boolean;
  /** True when the normalized text contains at least one combining mark. */
  containsDiacritics: boolean;
  errors: string[];
  warnings: string[];
}

/**
 * Detect the script of a character.
 *
 * @param char - A string whose first code point is classified.
 * @returns The first key of `SCRIPT_RANGES` (in declaration order) containing
 *   the code point, or `'unknown'` for an empty string or an unlisted code point.
 */
function detectCharacterScript(char: string): string {
  const codePoint = char.codePointAt(0);
  if (codePoint === undefined) return 'unknown';

  for (const [script, ranges] of Object.entries(SCRIPT_RANGES)) {
    if (ranges.some(([start, end]) => codePoint >= start && codePoint <= end)) {
      return script;
    }
  }
  return 'unknown';
}

/**
 * Check if a character is listed in `KASHMIRI_DIACRITICS`.
 */
function isKashmiriDiacritic(char: string): boolean {
  return KASHMIRI_DIACRITICS.has(char);
}

/**
 * Check if text contains combining marks.
 *
 * Uses the Unicode `Mark` general category instead of hand-maintained ranges,
 * so non-marks inside the Arabic annotation area (e.g. U+06DD End of Ayah) are
 * not counted while marks such as U+0670 Superscript Alef are.
 */
function containsCombiningMarks(text: string): boolean {
  return COMBINING_MARK.test(text);
}

/**
 * Apply Unicode normalization.
 *
 * @returns `text` normalized to NFC or NFD; any other form (including
 *   `'none'`) returns the text unchanged.
 */
function normalizeText(text: string, form: NormalizationForm): string {
  switch (form) {
    case 'NFC':
      return text.normalize('NFC');
    case 'NFD':
      return text.normalize('NFD');
    case 'none':
    default:
      return text;
  }
}

/**
 * Detect all scripts present in text.
 *
 * `common`, `inherited` and `unknown` characters do not belong to a specific
 * script and are left out of the result.
 */
function detectScripts(text: string): Set<string> {
  const scripts = new Set<string>();

  // for..of iterates by code point, so astral characters are not split.
  for (const char of text) {
    const script = detectCharacterScript(char);
    if (script !== 'common' && script !== 'inherited' && script !== 'unknown') {
      scripts.add(script);
    }
  }
  return scripts;
}

/**
 * Check if text is pure, i.e. every detected script is in `allowedScripts`.
 */
function isScriptPure(text: string, allowedScripts: string[]): boolean {
  for (const script of detectScripts(text)) {
    if (!allowedScripts.includes(script)) {
      return false;
    }
  }
  return true;
}

/**
 * Validate text according to Unicode configuration.
 *
 * Normalizes the text, then records:
 * - a warning when `preserve_diacritics` is set but no combining mark is present;
 * - an error when `reject_mixed` is set and more than one script is detected;
 * - an error when `enforce_script` is set and a detected script is not allowed
 *   (`allowed_scripts`, defaulting to the enforced script plus `common`).
 *
 * @param text - Raw input text.
 * @param config - Normalization and script policy.
 * @returns The normalized text together with detection results and messages.
 */
export function validateText(text: string, config: UnicodeConfig): ValidationResult {
  const errors: string[] = [];
  const warnings: string[] = [];

  // Apply normalization
  const normalized = normalizeText(text, config.normalization);

  // Check for diacritics
  const containsDiacritics = containsCombiningMarks(normalized);
  if (config.preserve_diacritics && !containsDiacritics) {
    warnings.push('Text does not contain any diacritics');
  }

  // Detect scripts once; every check below reuses this result.
  const scripts = detectScripts(normalized);
  const detected = [...scripts];
  const mixedScripts = scripts.size > 1;

  // Check script purity
  if (config.reject_mixed && mixedScripts) {
    errors.push(`Mixed scripts detected: ${detected.join(', ')}`);
  }

  // Check allowed scripts
  const allowedScripts = config.allowed_scripts || [config.enforce_script || 'arabic', 'common'];
  if (config.enforce_script) {
    const disallowed = detected.filter((script) => !allowedScripts.includes(script));
    if (disallowed.length > 0) {
      errors.push(`Disallowed script detected: ${disallowed.join(', ')}`);
    }
  }

  return {
    valid: errors.length === 0,
    normalized,
    originalScript: detected.join(', ') || 'unknown',
    mixedScripts,
    containsDiacritics,
    errors,
    warnings,
  };
}

/**
 * Validate and filter a batch of texts.
 *
 * @returns `valid` holds the NORMALIZED form of every accepted text,
 *   `rejected` holds the ORIGINAL form of every refused text, and `stats`
 *   carries the corresponding counts.
 */
export function validateBatch(
  texts: string[],
  config: UnicodeConfig
): { valid: string[]; rejected: string[]; stats: { total: number; valid: number; rejected: number } } {
  const valid: string[] = [];
  const rejected: string[] = [];

  for (const text of texts) {
    const result = validateText(text, config);
    if (result.valid) {
      valid.push(result.normalized);
    } else {
      rejected.push(text);
    }
  }

  return {
    valid,
    rejected,
    stats: {
      total: texts.length,
      valid: valid.length,
      rejected: rejected.length,
    },
  };
}

/**
 * Quick check if text is valid for the given script.
 *
 * No normalization is applied; the text passes when it contains no script
 * other than `script` (common, inherited and unknown characters are ignored).
 */
export function isValidForScript(text: string, script: string): boolean {
  const allowedScripts = [script, 'common', 'inherited'];
  return isScriptPure(text, allowedScripts);
}

/**
 * Get detailed character information for debugging.
 *
 * @returns
 * - `length`: UTF-16 code unit count (`text.length`);
 * - `graphemes`: grapheme cluster count from `Intl.Segmenter`;
 * - `scripts`: every classification seen, including `common`, `inherited`
 *   and `unknown`, in order of first appearance;
 * - `diacritics`: characters in `KASHMIRI_DIACRITICS` or in U+064B-U+065F;
 * - `codePoints`: one entry per code point with its `U+XXXX` label and script.
 */
export function analyzeText(text: string): {
  length: number;
  graphemes: number;
  scripts: string[];
  diacritics: string[];
  codePoints: { char: string; code: string; script: string }[];
} {
  const graphemes = [...new Intl.Segmenter('ar', { granularity: 'grapheme' }).segment(text)];
  const scripts = new Set<string>();
  const diacritics: string[] = [];
  const codePoints: { char: string; code: string; script: string }[] = [];

  for (const char of text) {
    const cp = char.codePointAt(0);
    const script = detectCharacterScript(char);
    scripts.add(script);

    const inArabicMarkRange = cp !== undefined && cp >= 0x064b && cp <= 0x065f;
    if (isKashmiriDiacritic(char) || inArabicMarkRange) {
      diacritics.push(char);
    }

    codePoints.push({
      char,
      code: `U+${cp?.toString(16).toUpperCase().padStart(4, '0')}`,
      script,
    });
  }

  return {
    length: text.length,
    graphemes: graphemes.length,
    scripts: [...scripts],
    diacritics,
    codePoints,
  };
}