File size: 7,906 Bytes
24a732c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
/**
 * Unicode Validator - Script purity and normalization enforcement
 */

import type { UnicodeConfig, NormalizationForm } from './config.js';

// Unicode script ranges (inclusive [start, end] code point pairs).
//
// detectCharacterScript walks these tables in key insertion order and returns
// the first table containing the code point, so a table whose ranges sit
// inside a broader table must be listed first. `inherited` therefore precedes
// `arabic`: its Arabic mark ranges lie inside 0x0600-0x06FF and would never
// match if `arabic` were consulted before it.
const SCRIPT_RANGES: Record<string, [number, number][]> = {
    inherited: [
        [0x0300, 0x036F],   // Combining Diacritical Marks
        [0x064B, 0x065F],   // Arabic combining marks
        [0x0670, 0x0670],   // Superscript Alef
        [0x06D6, 0x06DC],   // Arabic small high marks
        [0x06DF, 0x06E4],   // Arabic small marks
        [0x06E7, 0x06E8],   // Arabic small marks
        [0x06EA, 0x06ED],   // Arabic small marks
    ],
    arabic: [
        [0x0600, 0x06FF],   // Arabic
        [0x0750, 0x077F],   // Arabic Supplement
        [0x08A0, 0x08FF],   // Arabic Extended-A
        [0xFB50, 0xFDFF],   // Arabic Presentation Forms-A
        [0xFE70, 0xFEFF],   // Arabic Presentation Forms-B
    ],
    common: [
        [0x0000, 0x007F],   // Basic Latin (space, punctuation, digits, letters)
        [0x00A0, 0x00FF],   // Latin-1 Supplement
        [0x2000, 0x206F],   // General Punctuation
        [0x3000, 0x303F],   // CJK Symbols and Punctuation
    ],
};

// Kashmiri-specific characters that must be preserved.
// Despite the name, the set holds both combining marks (U+0654-U+065F) and
// base letters (the U+06Cx/U+06Dx entries and U+0620).
const KASHMIRI_DIACRITICS = new Set<string>(
    [
        0x0654, // Hamza above
        0x0655, // Hamza below
        0x0656, // Subscript alef
        0x0657, // Inverted damma
        0x0658, // Mark noon ghunna
        0x0659, // Zwarakay
        0x065A, // Vowel sign small v above
        0x065B, // Vowel sign inverted small v above
        0x065C, // Vowel sign dot below
        0x065D, // Reversed damma
        0x065E, // Fatha with two dots
        0x065F, // Wavy hamza below
        0x06C6, // Oe
        0x06C7, // U
        0x06C8, // Yu
        0x06C9, // Kirghiz yu
        0x06CB, // Ve
        0x06CC, // Farsi yeh
        0x06CD, // Yeh with tail
        0x06CE, // Yeh with small v
        0x06D0, // E
        0x06D2, // Yeh barree
        0x06D3, // Yeh barree with hamza
        0x0620, // Kashmiri yeh
    ].map((codePoint) => String.fromCodePoint(codePoint)),
);

/**
 * Outcome of validating a single piece of text, as populated by validateText.
 */
export interface ValidationResult {
    /** True when `errors` is empty. */
    valid: boolean;
    /** The input text after the configured normalization form was applied. */
    normalized: string;
    /** Comma-separated names of the detected scripts, or 'unknown' when none was detected. */
    originalScript: string;
    /** True when more than one script was detected in the normalized text. */
    mixedScripts: boolean;
    /** True when the normalized text contains at least one combining mark. */
    containsDiacritics: boolean;
    /** Messages for failed checks; any entry makes `valid` false. */
    errors: string[];
    /** Advisory messages that do not affect `valid`. */
    warnings: string[];
}

/**
 * Classify a character by its first code point.
 *
 * The SCRIPT_RANGES tables are consulted in insertion order and the name of
 * the first table containing the code point is returned; 'unknown' is
 * returned for an empty string or when no table matches.
 */
function detectCharacterScript(char: string): string {
    const cp = char.codePointAt(0);
    if (cp === undefined) {
        return 'unknown';
    }

    const hit = Object.entries(SCRIPT_RANGES).find(([, ranges]) =>
        ranges.some(([lo, hi]) => lo <= cp && cp <= hi),
    );
    return hit === undefined ? 'unknown' : hit[0];
}

/**
 * Report whether `char` is one of the entries of KASHMIRI_DIACRITICS.
 */
function isKashmiriDiacritic(char: string): boolean {
    const listed = KASHMIRI_DIACRITICS.has(char);
    return listed;
}

// Combining marks recognised by this module: the general Combining
// Diacritical Marks block plus the Arabic marks. These are the same code
// points the `inherited` entry of SCRIPT_RANGES lists. U+06DD, U+06DE,
// U+06E5, U+06E6 and U+06E9 are deliberately excluded because they are
// format/symbol/modifier-letter characters, not combining marks.
const COMBINING_MARK_PATTERN =
    /[\u0300-\u036F\u064B-\u065F\u0670\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]/;

/**
 * Check if text contains combining marks
 *
 * @param text - Text to scan; may be empty.
 * @returns True when at least one code point falls in the combining-mark
 *          ranges above (including U+0670, superscript alef); false for
 *          empty text or text without such marks.
 */
function containsCombiningMarks(text: string): boolean {
    // All ranges are in the BMP, so a plain (non-`u`) character class suffices.
    return COMBINING_MARK_PATTERN.test(text);
}

/**
 * Return `text` in the requested Unicode normalization form.
 *
 * Only 'NFC' and 'NFD' trigger normalization; 'none' and any other value
 * return the text unchanged.
 */
function normalizeText(text: string, form: NormalizationForm): string {
    if (form === 'NFC' || form === 'NFD') {
        return text.normalize(form);
    }
    return text;
}

/**
 * Collect the distinct scripts present in `text`.
 *
 * Characters classified as 'common', 'inherited' or 'unknown' do not
 * contribute to the result.
 */
function detectScripts(text: string): Set<string> {
    const ignored = ['common', 'inherited', 'unknown'];
    const found = [...text]
        .map((ch) => detectCharacterScript(ch))
        .filter((name) => !ignored.includes(name));
    return new Set(found);
}

/**
 * True when every script detected in `text` is listed in `allowedScripts`.
 * Text in which no script is detected counts as pure.
 */
function isScriptPure(text: string, allowedScripts: string[]): boolean {
    return [...detectScripts(text)].every((name) => allowedScripts.includes(name));
}

/**
 * Normalize `text` and check it against the Unicode configuration.
 *
 * Produces a warning when diacritics are expected but absent, an error when
 * mixed scripts are rejected and more than one script is found, and an error
 * when a script is enforced and a detected script is not in the allowed list.
 */
export function validateText(text: string, config: UnicodeConfig): ValidationResult {
    const normalized = normalizeText(text, config.normalization);
    const containsDiacritics = containsCombiningMarks(normalized);

    // Scripts are detected once and reused by every check below.
    const scripts = detectScripts(normalized);
    const scriptNames = [...scripts];
    const mixedScripts = scripts.size > 1;

    const warnings: string[] = [];
    if (config.preserve_diacritics && !containsDiacritics) {
        warnings.push('Text does not contain any diacritics');
    }

    const errors: string[] = [];
    if (config.reject_mixed && mixedScripts) {
        errors.push(`Mixed scripts detected: ${scriptNames.join(', ')}`);
    }

    // Falls back to the enforced script (or 'arabic') plus 'common' when no explicit list is configured.
    const allowedScripts = config.allowed_scripts || [config.enforce_script || 'arabic', 'common'];
    if (config.enforce_script) {
        const disallowed = scriptNames.filter((name) => !allowedScripts.includes(name));
        if (disallowed.length > 0) {
            errors.push(`Disallowed script detected: ${disallowed.join(', ')}`);
        }
    }

    return {
        valid: errors.length === 0,
        normalized,
        originalScript: scriptNames.join(', ') || 'unknown',
        mixedScripts,
        containsDiacritics,
        errors,
        warnings,
    };
}

/**
 * Validate each text and split the batch into accepted and rejected items.
 *
 * Accepted entries are returned in their normalized form; rejected entries
 * are returned as originally supplied. Input order is kept in both lists.
 */
export function validateBatch(
    texts: string[],
    config: UnicodeConfig
): { valid: string[]; rejected: string[]; stats: { total: number; valid: number; rejected: number } } {
    const outcomes = texts.map((text) => ({ text, result: validateText(text, config) }));

    const valid = outcomes
        .filter((outcome) => outcome.result.valid)
        .map((outcome) => outcome.result.normalized);
    const rejected = outcomes
        .filter((outcome) => !outcome.result.valid)
        .map((outcome) => outcome.text);

    const stats = {
        total: texts.length,
        valid: valid.length,
        rejected: rejected.length,
    };

    return { valid, rejected, stats };
}

/**
 * Shortcut purity check: true when every detected script in `text` is
 * `script`, 'common' or 'inherited'.
 */
export function isValidForScript(text: string, script: string): boolean {
    return isScriptPure(text, [script, 'common', 'inherited']);
}

/**
 * Produce a per-character breakdown of `text` for debugging.
 *
 * Reports the UTF-16 length, the grapheme count (Intl.Segmenter, 'ar'
 * locale), the distinct script labels in order of first appearance, the
 * characters treated as diacritics, and one entry per code point.
 */
export function analyzeText(text: string): {
    length: number;
    graphemes: number;
    scripts: string[];
    diacritics: string[];
    codePoints: { char: string; code: string; script: string }[];
} {
    const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
    const graphemeCount = [...segmenter.segment(text)].length;

    // Spreading a string iterates by code point, not by UTF-16 unit.
    const chars = [...text];

    const codePoints = chars.map((char) => {
        const cp = char.codePointAt(0);
        return {
            char,
            code: `U+${cp?.toString(16).toUpperCase().padStart(4, '0')}`,
            script: detectCharacterScript(char),
        };
    });

    // A character counts as a diacritic when it is in the Kashmiri set or in U+064B-U+065F.
    const diacritics = chars.filter((char) => {
        const cp = char.codePointAt(0);
        return isKashmiriDiacritic(char) || (cp !== undefined && cp >= 0x064B && cp <= 0x065F);
    });

    const scripts = [...new Set(codePoints.map((entry) => entry.script))];

    return {
        length: text.length,
        graphemes: graphemeCount,
        scripts,
        diacritics,
        codePoints,
    };
}