OCR_DATASET_MAKER / src /core /unicode-validator.ts
Omarrran's picture
OCR Dataset Generator for HF Spaces
24a732c
/**
* Unicode Validator - Script purity and normalization enforcement
*/
import type { UnicodeConfig, NormalizationForm } from './config.js';
/**
 * Code-point ranges used to classify characters by script.
 *
 * Each range is an inclusive `[start, end]` pair. `detectCharacterScript`
 * walks the entries in declaration order and returns the first script with a
 * range containing the code point, so where ranges overlap the more specific
 * script must be declared first:
 *  - `inherited` precedes `arabic`, because the Arabic combining marks lie
 *    inside the Arabic block (U+0600–U+06FF) and would otherwise never be
 *    reported as inherited;
 *  - `latin` precedes `common`, because Latin letters lie inside the Basic
 *    Latin and Latin-1 Supplement blocks. Only punctuation, digits, symbols
 *    and controls from those blocks are script-neutral; treating the letters
 *    as common would let Latin text pass every script-purity check.
 */
const SCRIPT_RANGES: Record<string, [number, number][]> = {
  inherited: [
    [0x0300, 0x036F], // Combining Diacritical Marks
    [0x064B, 0x065F], // Arabic combining marks
    [0x0670, 0x0670], // Superscript Alef
    [0x06D6, 0x06DC], // Arabic small high marks
    [0x06DF, 0x06E4], // Arabic small marks
    [0x06E7, 0x06E8], // Arabic small marks
    [0x06EA, 0x06ED], // Arabic small marks
  ],
  arabic: [
    [0x0600, 0x06FF], // Arabic
    [0x0750, 0x077F], // Arabic Supplement
    [0x08A0, 0x08FF], // Arabic Extended-A
    [0xFB50, 0xFDFF], // Arabic Presentation Forms-A
    [0xFE70, 0xFEFF], // Arabic Presentation Forms-B
  ],
  latin: [
    [0x0041, 0x005A], // A–Z
    [0x0061, 0x007A], // a–z
    [0x00AA, 0x00AA], // Feminine ordinal indicator
    [0x00BA, 0x00BA], // Masculine ordinal indicator
    [0x00C0, 0x00D6], // Latin-1 letters À–Ö (skips U+00D7 multiplication sign)
    [0x00D8, 0x00F6], // Latin-1 letters Ø–ö (skips U+00F7 division sign)
    [0x00F8, 0x00FF], // Latin-1 letters ø–ÿ
  ],
  common: [
    [0x0000, 0x007F], // Basic Latin: controls, space, punctuation, digits (letters matched by `latin` above)
    [0x00A0, 0x00FF], // Latin-1 Supplement: punctuation and symbols (letters matched by `latin` above)
    [0x2000, 0x206F], // General Punctuation
    [0x3000, 0x303F], // CJK Symbols and Punctuation
  ],
};
/**
 * Kashmiri-specific characters that must be preserved.
 *
 * Despite the name, the set holds both combining marks (U+0654–U+065F) and
 * base letters. Members are stored as single-character strings, listed here
 * by code point.
 */
const KASHMIRI_DIACRITICS = new Set(
  [
    // Combining marks
    0x0654, // Hamza above
    0x0655, // Hamza below
    0x0656, // Subscript alef
    0x0657, // Inverted damma
    0x0658, // Mark noon ghunna
    0x0659, // Zwarakay
    0x065A, // Vowel sign small v above
    0x065B, // Vowel sign inverted small v above
    0x065C, // Vowel sign dot below
    0x065D, // Reversed damma
    0x065E, // Fatha with two dots
    0x065F, // Wavy hamza below
    // Letters
    0x06C6, // Oe
    0x06C7, // U
    0x06C8, // Yu
    0x06C9, // Kirghiz yu
    0x06CB, // Ve
    0x06CC, // Farsi yeh
    0x06CD, // Yeh with tail
    0x06CE, // Yeh with small v
    0x06D0, // E
    0x06D2, // Yeh barree
    0x06D3, // Yeh barree with hamza
    0x0620, // Kashmiri yeh
  ].map((codePoint) => String.fromCodePoint(codePoint)),
);
/**
* Outcome of validating a single text with validateText.
*/
export interface ValidationResult {
/** True when `errors` is empty. */
valid: boolean;
/** The input text after the configured Unicode normalization was applied. */
normalized: string;
/** Detected script names joined with ', ', or 'unknown' when none was detected. */
originalScript: string;
/** True when more than one script was detected in the normalized text. */
mixedScripts: boolean;
/** True when the normalized text contains at least one combining mark. */
containsDiacritics: boolean;
/** Problems that make the text invalid (mixed or disallowed scripts). */
errors: string[];
/** Non-fatal observations; these do not affect `valid`. */
warnings: string[];
}
/**
 * Classify a character by looking up its first code point in SCRIPT_RANGES.
 *
 * Scripts are tried in the table's declaration order; the first script with
 * an inclusive range containing the code point wins.
 *
 * @param char - String whose first code point is classified.
 * @returns The matching script key, or `'unknown'` when the string is empty
 * or no range contains the code point.
 */
function detectCharacterScript(char: string): string {
  const cp = char.codePointAt(0);
  if (cp === undefined) {
    return 'unknown';
  }
  const match = Object.entries(SCRIPT_RANGES).find(([, ranges]) =>
    ranges.some(([lo, hi]) => lo <= cp && cp <= hi),
  );
  return match === undefined ? 'unknown' : match[0];
}
/**
 * Tell whether a string is one of the entries of KASHMIRI_DIACRITICS.
 *
 * @param char - Candidate string; it must equal a set entry exactly.
 * @returns `true` when the set contains `char`.
 */
function isKashmiriDiacritic(char: string): boolean {
  const isListed = KASHMIRI_DIACRITICS.has(char);
  return isListed;
}
/**
 * Check whether text contains at least one combining mark.
 *
 * A combining mark is any code point whose Unicode General_Category is Mark
 * (Mn, Mc or Me). The Unicode property is used instead of hand-written
 * ranges for two reasons: a blanket U+06D6–U+06ED range also matches
 * non-marks such as U+06DD (end of ayah), U+06E5/U+06E6 (small waw/yeh) and
 * U+06E9 (place of sajdah), and fixed ranges miss marks such as U+0670
 * (superscript alef).
 *
 * @param text - Text to inspect; may be empty.
 * @returns `true` when any code point in `text` is a combining mark.
 */
function containsCombiningMarks(text: string): boolean {
  // No `g` flag, so the regex keeps no state between calls.
  return /\p{M}/u.test(text);
}
/**
 * Apply the requested Unicode normalization form to a string.
 *
 * @param text - Text to normalize.
 * @param form - `'NFC'` or `'NFD'` to normalize; `'none'` or any other value
 * leaves the text untouched.
 * @returns The normalized text, or `text` itself when no normalization applies.
 */
function normalizeText(text: string, form: NormalizationForm): string {
  if (form === 'NFC' || form === 'NFD') {
    return text.normalize(form);
  }
  // 'none' and any unrecognised form pass the input through unchanged.
  return text;
}
/**
 * Collect the distinct scripts that occur in a text.
 *
 * Every code point is classified with detectCharacterScript. The
 * script-neutral classes `'common'` and `'inherited'` are left out, and so
 * are characters classified as `'unknown'`.
 *
 * @param text - Text to scan, one code point at a time.
 * @returns The set of script names found, in order of first appearance.
 */
function detectScripts(text: string): Set<string> {
  const ignored = ['common', 'inherited', 'unknown'];
  const classified = Array.from(text, (ch) => detectCharacterScript(ch));
  return new Set<string>(classified.filter((name) => !ignored.includes(name)));
}
/**
 * Check that every script detected in the text is on the allow-list.
 *
 * @param text - Text to scan with detectScripts.
 * @param allowedScripts - Script names that are acceptable.
 * @returns `true` when no detected script falls outside `allowedScripts`.
 * This includes the case where no script is detected at all.
 */
function isScriptPure(text: string, allowedScripts: string[]): boolean {
  return [...detectScripts(text)].every((name) => allowedScripts.includes(name));
}
/**
 * Validate a text against a Unicode configuration.
 *
 * Steps, all performed on the normalized text:
 *  1. normalize with `config.normalization`;
 *  2. warn when `config.preserve_diacritics` is set but no combining mark is
 *     present;
 *  3. report an error when `config.reject_mixed` is set and more than one
 *     script is detected;
 *  4. when `config.enforce_script` is set, report an error for every
 *     detected script that is not in `config.allowed_scripts` (default: the
 *     enforced script plus `'common'`).
 *
 * @param text - Raw input text.
 * @param config - Validation settings.
 * @returns The normalized text with the detected properties, errors and
 * warnings; `valid` is `true` only when no error was recorded.
 */
export function validateText(text: string, config: UnicodeConfig): ValidationResult {
  const errors: string[] = [];
  const warnings: string[] = [];

  const normalized = normalizeText(text, config.normalization);

  const containsDiacritics = containsCombiningMarks(normalized);
  if (config.preserve_diacritics && !containsDiacritics) {
    warnings.push('Text does not contain any diacritics');
  }

  // Scan the text once; every check below reuses this result.
  const scripts = [...detectScripts(normalized)];
  const mixedScripts = scripts.length > 1;
  if (config.reject_mixed && mixedScripts) {
    errors.push(`Mixed scripts detected: ${scripts.join(', ')}`);
  }

  if (config.enforce_script) {
    const allowedScripts = config.allowed_scripts || [config.enforce_script, 'common'];
    const disallowed = scripts.filter((script) => !allowedScripts.includes(script));
    if (disallowed.length > 0) {
      errors.push(`Disallowed script detected: ${disallowed.join(', ')}`);
    }
  }

  return {
    valid: errors.length === 0,
    normalized,
    originalScript: scripts.join(', ') || 'unknown',
    mixedScripts,
    containsDiacritics,
    errors,
    warnings,
  };
}
/**
 * Validate many texts and split them into accepted and rejected lists.
 *
 * @param texts - Texts to validate, each one with validateText.
 * @param config - Validation settings applied to every text.
 * @returns `valid` holds the normalized form of each accepted text,
 * `rejected` holds the original form of each refused text, and `stats`
 * carries the corresponding counts.
 */
export function validateBatch(
  texts: string[],
  config: UnicodeConfig
): { valid: string[]; rejected: string[]; stats: { total: number; valid: number; rejected: number } } {
  const accepted: string[] = [];
  const refused: string[] = [];

  for (const original of texts) {
    const outcome = validateText(original, config);
    if (!outcome.valid) {
      // Keep the caller's original text so it can be traced back.
      refused.push(original);
      continue;
    }
    accepted.push(outcome.normalized);
  }

  const stats = {
    total: texts.length,
    valid: accepted.length,
    rejected: refused.length,
  };
  return { valid: accepted, rejected: refused, stats };
}
/**
 * Quick script check without normalization or configuration.
 *
 * @param text - Text to check.
 * @param script - The single script the text is expected to be written in.
 * @returns `true` when every script detected in `text` is `script`,
 * `'common'` or `'inherited'`.
 */
export function isValidForScript(text: string, script: string): boolean {
  return isScriptPure(text, [script, 'common', 'inherited']);
}
/**
 * Produce a per-character breakdown of a text for debugging.
 *
 * @param text - Text to analyze.
 * @returns An object with:
 *  - `length`: `text.length`, i.e. the number of UTF-16 code units;
 *  - `graphemes`: number of grapheme segments reported by `Intl.Segmenter`
 *    for locale `'ar'`;
 *  - `scripts`: every classification returned by detectCharacterScript,
 *    unfiltered, so `'common'`, `'inherited'` and `'unknown'` can appear;
 *  - `diacritics`: each character that is in KASHMIRI_DIACRITICS or in
 *    U+064B–U+065F, in order of appearance (the set also holds letters);
 *  - `codePoints`: one entry per code point with its `U+XXXX` label and
 *    script.
 */
export function analyzeText(text: string): {
  length: number;
  graphemes: number;
  scripts: string[];
  diacritics: string[];
  codePoints: { char: string; code: string; script: string }[];
} {
  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  const graphemeCount = [...segmenter.segment(text)].length;

  const seenScripts = new Set<string>();
  const diacritics: string[] = [];
  const codePoints: { char: string; code: string; script: string }[] = [];

  for (const char of text) {
    const cp = char.codePointAt(0);
    const script = detectCharacterScript(char);
    seenScripts.add(script);

    const inArabicMarkRange = cp !== undefined && cp >= 0x064B && cp <= 0x065F;
    if (isKashmiriDiacritic(char) || inArabicMarkRange) {
      diacritics.push(char);
    }

    const hex = cp?.toString(16).toUpperCase().padStart(4, '0');
    codePoints.push({ char, code: `U+${hex}`, script });
  }

  return {
    length: text.length,
    graphemes: graphemeCount,
    scripts: [...seenScripts],
    diacritics,
    codePoints,
  };
}