Spaces:
Sleeping
Sleeping
| /** | |
| * Unicode Validator - Script purity and normalization enforcement | |
| */ | |
| import type { UnicodeConfig, NormalizationForm } from './config.js'; | |
/**
 * Inclusive code-point ranges used to classify characters by script.
 *
 * detectCharacterScript returns the first key (in declaration order) that has
 * a matching range, so narrower categories are listed before broader ones
 * that overlap them: `inherited` precedes `arabic` because the Arabic
 * combining marks sit inside the main Arabic block.
 *
 * Latin letters have their own `latin` entry; `common` holds only the
 * punctuation, digits, controls and symbols shared between scripts, so that
 * Latin text is detectable as a script of its own.
 */
const SCRIPT_RANGES: Record<string, [number, number][]> = {
  inherited: [
    [0x0300, 0x036F], // Combining Diacritical Marks
    [0x064B, 0x065F], // Arabic combining marks
    [0x0670, 0x0670], // Superscript Alef
    [0x06D6, 0x06DC], // Arabic small high marks
    [0x06DF, 0x06E4], // Arabic small marks
    [0x06E7, 0x06E8], // Arabic small marks
    [0x06EA, 0x06ED], // Arabic small marks
  ],
  arabic: [
    [0x0600, 0x06FF], // Arabic
    [0x0750, 0x077F], // Arabic Supplement
    [0x08A0, 0x08FF], // Arabic Extended-A
    [0xFB50, 0xFDFF], // Arabic Presentation Forms-A
    [0xFE70, 0xFEFF], // Arabic Presentation Forms-B
  ],
  latin: [
    [0x0041, 0x005A], // A-Z
    [0x0061, 0x007A], // a-z
    [0x00C0, 0x00D6], // Latin-1 letters up to the multiplication sign
    [0x00D8, 0x00F6], // Latin-1 letters between the multiplication and division signs
    [0x00F8, 0x00FF], // Latin-1 letters after the division sign
    [0x0100, 0x024F], // Latin Extended-A and Extended-B
  ],
  common: [
    [0x0000, 0x0040], // Controls, space, ASCII punctuation and digits
    [0x005B, 0x0060], // ASCII punctuation between the letter runs
    [0x007B, 0x007F], // ASCII punctuation after the lowercase letters
    [0x00A0, 0x00BF], // Latin-1 punctuation and symbols
    [0x00D7, 0x00D7], // Multiplication sign
    [0x00F7, 0x00F7], // Division sign
    [0x2000, 0x206F], // General Punctuation
    [0x3000, 0x303F], // CJK Symbols and Punctuation
  ],
};
/**
 * Kashmiri-specific characters that must be preserved.
 *
 * Despite the name, the set is not limited to combining signs: alongside
 * U+0654–U+065F it also lists base letters (U+06C6–U+06D3 and U+0620).
 */
const KASHMIRI_DIACRITICS = new Set(
  [
    0x0654, // Hamza above
    0x0655, // Hamza below
    0x0656, // Subscript alef
    0x0657, // Inverted damma
    0x0658, // Mark noon ghunna
    0x0659, // Zwarakay
    0x065A, // Vowel sign small v above
    0x065B, // Vowel sign inverted small v above
    0x065C, // Vowel sign dot below
    0x065D, // Reversed damma
    0x065E, // Fatha with two dots
    0x065F, // Wavy hamza below
    0x06C6, // Oe
    0x06C7, // U
    0x06C8, // Yu
    0x06C9, // Kirghiz yu
    0x06CB, // Ve
    0x06CC, // Farsi yeh
    0x06CD, // Yeh with tail
    0x06CE, // Yeh with small v
    0x06D0, // E
    0x06D2, // Yeh barree
    0x06D3, // Yeh barree with hamza
    0x0620, // Kashmiri yeh
  ].map((codePoint) => String.fromCodePoint(codePoint)),
);
/**
 * Outcome of validating one piece of text with validateText.
 */
export interface ValidationResult {
  /** True when no errors were recorded. */
  valid: boolean;
  /** The input after the configured normalization form was applied. */
  normalized: string;
  /** Detected scripts joined with ', ', or 'unknown' when none were detected. */
  originalScript: string;
  /** True when more than one script was detected. */
  mixedScripts: boolean;
  /** True when the normalized text contains at least one combining mark. */
  containsDiacritics: boolean;
  /** Messages for problems that make the text invalid. */
  errors: string[];
  /** Messages for findings that do not affect validity. */
  warnings: string[];
}
/**
 * Classify a character by the first SCRIPT_RANGES entry that contains it.
 *
 * Only the first code point of `char` is examined.
 *
 * @param char - The character to classify.
 * @returns The key of the first matching table entry, or 'unknown' when
 *   `char` is empty or falls in none of the listed ranges.
 */
function detectCharacterScript(char: string): string {
  const value = char.codePointAt(0);
  if (value === undefined) {
    return 'unknown';
  }

  const match = Object.entries(SCRIPT_RANGES).find(([, spans]) =>
    spans.some(([low, high]) => low <= value && value <= high),
  );
  return match === undefined ? 'unknown' : match[0];
}
/**
 * Report whether `char` is a member of KASHMIRI_DIACRITICS.
 *
 * The comparison is against the whole string, so a multi-character argument
 * never matches.
 *
 * @param char - A single character.
 * @returns True when the set lists exactly this string.
 */
function isKashmiriDiacritic(char: string): boolean {
  const listed = KASHMIRI_DIACRITICS.has(char);
  return listed;
}
/**
 * Inclusive code-point ranges treated as combining marks.
 *
 * The Arabic annotation area U+06D6–U+06ED is split into sub-ranges because
 * it also contains characters that are not combining marks: U+06DD (end of
 * ayah), U+06DE (start of rub el hizb), U+06E5/U+06E6 (small waw / small yeh
 * modifier letters) and U+06E9 (place of sajdah).
 */
const COMBINING_MARK_RANGES: ReadonlyArray<readonly [number, number]> = [
  [0x0300, 0x036F], // Combining Diacritical Marks
  [0x064B, 0x065F], // Arabic combining marks
  [0x0670, 0x0670], // Superscript alef
  [0x06D6, 0x06DC], // Arabic small high marks
  [0x06DF, 0x06E4], // Arabic small marks
  [0x06E7, 0x06E8], // Arabic small marks
  [0x06EA, 0x06ED], // Arabic small marks
];

/**
 * Check whether text contains at least one combining mark.
 *
 * @param text - Text to scan, code point by code point.
 * @returns True as soon as a code point inside COMBINING_MARK_RANGES is
 *   found; false for empty text or text without such a code point.
 */
function containsCombiningMarks(text: string): boolean {
  for (const char of text) {
    const cp = char.codePointAt(0);
    if (cp === undefined) continue;
    for (const [start, end] of COMBINING_MARK_RANGES) {
      if (cp >= start && cp <= end) return true;
    }
  }
  return false;
}
/**
 * Apply the requested Unicode normalization form to `text`.
 *
 * @param text - Text to normalize.
 * @param form - 'NFC' or 'NFD' to normalize; any other value (including
 *   'none') leaves the text as it is.
 * @returns The normalized text, or `text` itself when no normalization applies.
 */
function normalizeText(text: string, form: NormalizationForm): string {
  if (form === 'NFC' || form === 'NFD') {
    return text.normalize(form);
  }
  return text;
}
/**
 * Collect the distinct scripts that occur in `text`.
 *
 * Every code point is classified with detectCharacterScript; the labels
 * 'common', 'inherited' and 'unknown' are dropped.
 *
 * @param text - Text to inspect.
 * @returns The remaining script labels, in order of first appearance.
 */
function detectScripts(text: string): Set<string> {
  const ignored = ['common', 'inherited', 'unknown'];
  const labels = Array.from(text, (symbol) => detectCharacterScript(symbol));
  return new Set<string>(labels.filter((label) => !ignored.includes(label)));
}
/**
 * Check that every script detected in `text` is on the allow-list.
 *
 * @param text - Text to inspect.
 * @param allowedScripts - Script labels that are acceptable.
 * @returns True when detectScripts finds no script outside `allowedScripts`
 *   (which includes the case where it finds no script at all).
 */
function isScriptPure(text: string, allowedScripts: string[]): boolean {
  const present = [...detectScripts(text)];
  return present.every((label) => allowedScripts.includes(label));
}
/**
 * Validate text according to Unicode configuration.
 *
 * Steps, all performed on the normalized text:
 * 1. normalize with `config.normalization`;
 * 2. warn when `config.preserve_diacritics` is set but no combining mark is present;
 * 3. record an error when `config.reject_mixed` is set and more than one script is detected;
 * 4. record an error when `config.enforce_script` is set and a detected script
 *    is outside the allow-list (`config.allowed_scripts`, or the enforced
 *    script plus 'common' when that is not provided).
 *
 * @param text - Raw input text.
 * @param config - Validation settings.
 * @returns The normalized text together with the findings; `valid` is true
 *   when no error was recorded.
 */
export function validateText(text: string, config: UnicodeConfig): ValidationResult {
  const errors: string[] = [];
  const warnings: string[] = [];

  // Apply normalization
  const normalized = normalizeText(text, config.normalization);

  // Check for diacritics
  const containsDiacritics = containsCombiningMarks(normalized);
  if (config.preserve_diacritics && !containsDiacritics) {
    warnings.push('Text does not contain any diacritics');
  }

  // Detect scripts once; the same result feeds both checks and the report
  const scripts = [...detectScripts(normalized)];
  const mixedScripts = scripts.length > 1;

  // Check script purity
  if (config.reject_mixed && mixedScripts) {
    errors.push(`Mixed scripts detected: ${scripts.join(', ')}`);
  }

  // Check allowed scripts
  const allowedScripts = config.allowed_scripts || [config.enforce_script || 'arabic', 'common'];
  if (config.enforce_script) {
    const disallowed = scripts.filter((s) => !allowedScripts.includes(s));
    if (disallowed.length > 0) {
      errors.push(`Disallowed script detected: ${disallowed.join(', ')}`);
    }
  }

  return {
    valid: errors.length === 0,
    normalized,
    originalScript: scripts.join(', ') || 'unknown',
    mixedScripts,
    containsDiacritics,
    errors,
    warnings,
  };
}
/**
 * Validate a list of texts and split it into accepted and rejected entries.
 *
 * Accepted entries are returned in their normalized form; rejected entries
 * are returned exactly as they were passed in. Input order is kept in both
 * lists.
 *
 * @param texts - Texts to validate.
 * @param config - Validation settings handed to validateText for every entry.
 * @returns The two lists plus counts of total, valid and rejected entries.
 */
export function validateBatch(
  texts: string[],
  config: UnicodeConfig
): { valid: string[]; rejected: string[]; stats: { total: number; valid: number; rejected: number } } {
  const accepted: string[] = [];
  const refused: string[] = [];

  for (const candidate of texts) {
    const outcome = validateText(candidate, config);
    if (!outcome.valid) {
      refused.push(candidate);
      continue;
    }
    accepted.push(outcome.normalized);
  }

  const stats = {
    total: texts.length,
    valid: accepted.length,
    rejected: refused.length,
  };
  return { valid: accepted, rejected: refused, stats };
}
/**
 * Quick check that `text` uses no script other than `script`.
 *
 * The labels 'common' and 'inherited' are always accepted in addition to
 * `script`. No normalization is applied.
 *
 * @param text - Text to inspect.
 * @param script - The script label the text is expected to be written in.
 * @returns True when no other script is detected.
 */
export function isValidForScript(text: string, script: string): boolean {
  return isScriptPure(text, [script, 'common', 'inherited']);
}
/**
 * Get detailed character information for debugging.
 *
 * @param text - Text to analyze; it is used as given, without normalization.
 * @returns
 *   - `length`: length of `text` in UTF-16 code units;
 *   - `graphemes`: number of grapheme clusters reported by Intl.Segmenter
 *     (locale 'ar');
 *   - `scripts`: every distinct label returned by detectCharacterScript,
 *     including 'common', 'inherited' and 'unknown';
 *   - `diacritics`: each code point that is listed in KASHMIRI_DIACRITICS or
 *     that containsCombiningMarks recognizes, in order of appearance;
 *   - `codePoints`: one entry per code point with its `U+XXXX` notation and
 *     script label.
 */
export function analyzeText(text: string): {
  length: number;
  graphemes: number;
  scripts: string[];
  diacritics: string[];
  codePoints: { char: string; code: string; script: string }[];
} {
  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  const graphemes = [...segmenter.segment(text)];
  const scripts = new Set<string>();
  const diacritics: string[] = [];
  const codePoints: { char: string; code: string; script: string }[] = [];

  for (const char of text) {
    const cp = char.codePointAt(0);
    // String iteration never yields an empty string; the guard only narrows the type
    if (cp === undefined) continue;

    const script = detectCharacterScript(char);
    scripts.add(script);

    // Use the validator's own mark test so this report agrees with
    // the containsDiacritics flag produced by validateText
    if (isKashmiriDiacritic(char) || containsCombiningMarks(char)) {
      diacritics.push(char);
    }

    codePoints.push({
      char,
      code: `U+${cp.toString(16).toUpperCase().padStart(4, '0')}`,
      script,
    });
  }

  return {
    length: text.length,
    graphemes: graphemes.length,
    scripts: [...scripts],
    diacritics,
    codePoints,
  };
}