Spaces:

Omarrran
/

OCR_DATASET_MAKER

Sleeping

App Files Files Community

OCR_DATASET_MAKER / src /core /unicode-validator.ts

Omarrran

OCR Dataset Generator for HF Spaces

24a732c about 2 months ago

raw

history blame contribute delete

7.91 kB

	/**
	* Unicode Validator - Script purity and normalization enforcement
	*/

	import type { UnicodeConfig, NormalizationForm } from './config.js';

	/**
	 * Inclusive code point ranges used to classify characters by script.
	 *
	 * Lookup (see detectCharacterScript) returns the first key, in declaration
	 * order, that has a range containing the code point. Consequences:
	 *  - The Arabic marks repeated under `inherited` (U+064B-U+065F, U+0670,
	 *    U+06D6-U+06ED) lie inside the main Arabic block, so they are reported
	 *    as `arabic`; only U+0300-U+036F actually resolves to `inherited`.
	 *  - Latin letters are listed under `latin`, and `common` deliberately holds
	 *    only controls, digits, punctuation and symbols. Treating letters as
	 *    `common` would make Latin text invisible to mixed-script detection.
	 */
	const SCRIPT_RANGES: Record<string, [number, number][]> = {
		arabic: [
			[0x0600, 0x06FF], // Arabic
			[0x0750, 0x077F], // Arabic Supplement
			[0x08A0, 0x08FF], // Arabic Extended-A
			[0xFB50, 0xFDFF], // Arabic Presentation Forms-A
			[0xFE70, 0xFEFC], // Arabic Presentation Forms-B (letters only; U+FEFF is the BOM)
		],
		latin: [
			[0x0041, 0x005A], // A-Z
			[0x0061, 0x007A], // a-z
			[0x00AA, 0x00AA], // Feminine ordinal indicator
			[0x00BA, 0x00BA], // Masculine ordinal indicator
			[0x00C0, 0x00D6], // Latin-1 letters (before the multiplication sign)
			[0x00D8, 0x00F6], // Latin-1 letters (between the multiplication and division signs)
			[0x00F8, 0x024F], // Latin-1 letters (after the division sign), Latin Extended-A/B
		],
		common: [
			[0x0000, 0x0040], // ASCII controls, space, punctuation, digits
			[0x005B, 0x0060], // ASCII punctuation between the upper- and lower-case letters
			[0x007B, 0x007F], // ASCII punctuation after the letters, DEL
			[0x00A0, 0x00A9], // Latin-1 punctuation and symbols
			[0x00AB, 0x00B9], // Latin-1 punctuation and symbols
			[0x00BB, 0x00BF], // Latin-1 punctuation and symbols
			[0x00D7, 0x00D7], // Multiplication sign
			[0x00F7, 0x00F7], // Division sign
			[0x2000, 0x206F], // General Punctuation
			[0x3000, 0x303F], // CJK Symbols and Punctuation
			[0xFEFF, 0xFEFF], // Byte order mark / zero width no-break space
		],
		inherited: [
			[0x0300, 0x036F], // Combining Diacritical Marks
			[0x064B, 0x065F], // Arabic combining marks
			[0x0670, 0x0670], // Superscript Alef
			[0x06D6, 0x06DC], // Arabic small high marks
			[0x06DF, 0x06E4], // Arabic small marks
			[0x06E7, 0x06E8], // Arabic small marks
			[0x06EA, 0x06ED], // Arabic small marks
		],
	};

	// Kashmiri-specific characters that must be preserved. Stored as
	// single-character strings so membership can be tested directly against the
	// characters yielded when iterating a string.
	const KASHMIRI_DIACRITICS = new Set<string>(
		[
			0x0654, // Hamza above
			0x0655, // Hamza below
			0x0656, // Subscript alef
			0x0657, // Inverted damma
			0x0658, // Mark noon ghunna
			0x0659, // Zwarakay
			0x065A, // Vowel sign small v above
			0x065B, // Vowel sign inverted small v above
			0x065C, // Vowel sign dot below
			0x065D, // Reversed damma
			0x065E, // Fatha with two dots
			0x065F, // Wavy hamza below
			0x06C6, // Oe
			0x06C7, // U
			0x06C8, // Yu
			0x06C9, // Kirghiz yu
			0x06CB, // Ve
			0x06CC, // Farsi yeh
			0x06CD, // Yeh with tail
			0x06CE, // Yeh with small v
			0x06D0, // E
			0x06D2, // Yeh barree
			0x06D3, // Yeh barree with hamza
			0x0620, // Kashmiri yeh
		].map((codePoint) => String.fromCodePoint(codePoint)),
	);

	/**
	* Outcome of validating one text, as returned by validateText.
	*/
	export interface ValidationResult {
	/** True when `errors` is empty. */
	valid: boolean;
	/** The input after the configured normalization form has been applied. */
	normalized: string;
	/**
	* Scripts detected in the normalized text, joined with ', ', or 'unknown'
	* when none were detected. Characters classified as common, inherited or
	* unknown do not contribute.
	*/
	originalScript: string;
	/** True when more than one script was detected. */
	mixedScripts: boolean;
	/** True when the normalized text contains at least one combining mark. */
	containsDiacritics: boolean;
	/** Reasons the text was rejected (mixed or disallowed scripts). */
	errors: string[];
	/** Non-fatal observations, e.g. diacritics were expected but none found. */
	warnings: string[];
	}

	/**
	 * Classify a character by looking up its first code point in SCRIPT_RANGES.
	 *
	 * @param char - Character to classify; only its first code point is examined.
	 * @returns The first SCRIPT_RANGES key (in declaration order) owning a range
	 *   that contains the code point, or 'unknown' when `char` is empty or no
	 *   range matches.
	 */
	function detectCharacterScript(char: string): string {
		const cp = char.codePointAt(0);
		if (cp === undefined) {
			return 'unknown';
		}

		const owner = Object.entries(SCRIPT_RANGES).find(([, ranges]) =>
			ranges.some(([lo, hi]) => lo <= cp && cp <= hi),
		);

		return owner === undefined ? 'unknown' : owner[0];
	}

	/**
	 * Report whether a character is listed in KASHMIRI_DIACRITICS.
	 *
	 * @param char - A single character (one code point) to look up.
	 * @returns True when the exact string is a member of the set.
	 */
	function isKashmiriDiacritic(char: string): boolean {
		const listed = KASHMIRI_DIACRITICS.has(char);
		return listed;
	}

	// Matches any code point whose Unicode general category is Mark (Mn, Mc, Me).
	const COMBINING_MARK_PATTERN = /\p{M}/u;

	/**
	 * Check whether text contains at least one combining mark.
	 *
	 * Uses the Unicode Mark general category rather than hand-written ranges.
	 * This covers U+0300-U+036F, U+064B-U+065F and the Arabic small marks in
	 * U+06D6-U+06ED, and also U+0670 (superscript alef). It does not match the
	 * non-combining characters that fall inside U+06D6-U+06ED: U+06DD, U+06DE,
	 * U+06E5, U+06E6 and U+06E9.
	 *
	 * @param text - Text to inspect; may be empty.
	 * @returns True when any code point in `text` is a combining mark.
	 */
	function containsCombiningMarks(text: string): boolean {
		return COMBINING_MARK_PATTERN.test(text);
	}

	/**
	 * Apply the requested Unicode normalization form to a string.
	 *
	 * @param text - Text to normalize.
	 * @param form - 'NFC' or 'NFD' to normalize; any other value (including
	 *   'none') leaves the text untouched.
	 * @returns The normalized text, or `text` itself when no form applies.
	 */
	function normalizeText(text: string, form: NormalizationForm): string {
		if (form === 'NFC' || form === 'NFD') {
			return text.normalize(form);
		}
		return text;
	}

	/**
	 * Collect the distinct scripts that occur in a text.
	 *
	 * Characters classified as 'common', 'inherited' or 'unknown' are skipped,
	 * so the result only names concrete scripts.
	 *
	 * @param text - Text to scan, one code point at a time.
	 * @returns The set of script names, in order of first appearance.
	 */
	function detectScripts(text: string): Set<string> {
		const neutral = ['common', 'inherited', 'unknown'];
		const perCharacter = Array.from(text, (ch) => detectCharacterScript(ch));
		return new Set<string>(perCharacter.filter((name) => !neutral.includes(name)));
	}

	/**
	 * Check that every script detected in a text is on the allow-list.
	 *
	 * @param text - Text to scan.
	 * @param allowedScripts - Script names that are acceptable.
	 * @returns True when no detected script falls outside `allowedScripts`
	 *   (which is also the case when no script is detected at all).
	 */
	function isScriptPure(text: string, allowedScripts: string[]): boolean {
		const present = [...detectScripts(text)];
		return present.every((name) => allowedScripts.includes(name));
	}

	/**
	 * Validate a text against the Unicode configuration.
	 *
	 * Steps, all performed on the normalized text:
	 *  1. Normalize using `config.normalization`.
	 *  2. Warn when `config.preserve_diacritics` is set but no combining mark
	 *     is present.
	 *  3. Record an error when `config.reject_mixed` is set and more than one
	 *     script was detected.
	 *  4. When `config.enforce_script` is set, record an error for any detected
	 *     script that is not in the allow-list.
	 *
	 * @param text - Raw input text.
	 * @param config - Unicode rules to enforce.
	 * @returns The normalized text together with the findings; `valid` is true
	 *   only when no error was recorded.
	 */
	export function validateText(text: string, config: UnicodeConfig): ValidationResult {
		const errors: string[] = [];
		const warnings: string[] = [];

		const normalized = normalizeText(text, config.normalization);

		const containsDiacritics = containsCombiningMarks(normalized);
		if (config.preserve_diacritics && !containsDiacritics) {
			warnings.push('Text does not contain any diacritics');
		}

		// Scan once and reuse the result for every script-related check below.
		const scripts = [...detectScripts(normalized)];
		const mixedScripts = scripts.length > 1;
		if (config.reject_mixed && mixedScripts) {
			errors.push(`Mixed scripts detected: ${scripts.join(', ')}`);
		}

		// `||` rather than `??`: an empty-string enforce_script also falls back
		// to 'arabic'.
		const allowedScripts = config.allowed_scripts || [config.enforce_script || 'arabic', 'common'];
		if (config.enforce_script) {
			const disallowed = scripts.filter((script) => !allowedScripts.includes(script));
			if (disallowed.length > 0) {
				errors.push(`Disallowed script detected: ${disallowed.join(', ')}`);
			}
		}

		return {
			valid: errors.length === 0,
			normalized,
			originalScript: scripts.join(', ') || 'unknown',
			mixedScripts,
			containsDiacritics,
			errors,
			warnings,
		};
	}

	/**
	 * Validate many texts and split them into accepted and rejected lists.
	 *
	 * Accepted entries are stored in their normalized form; rejected entries are
	 * stored exactly as they were passed in. Input order is kept in both lists.
	 *
	 * @param texts - Texts to validate.
	 * @param config - Unicode rules applied to every text.
	 * @returns The two lists plus counts of total, valid and rejected texts.
	 */
	export function validateBatch(
		texts: string[],
		config: UnicodeConfig
	): { valid: string[]; rejected: string[]; stats: { total: number; valid: number; rejected: number } } {
		const valid: string[] = [];
		const rejected: string[] = [];

		for (const original of texts) {
			const outcome = validateText(original, config);
			if (!outcome.valid) {
				rejected.push(original);
				continue;
			}
			valid.push(outcome.normalized);
		}

		const stats = {
			total: texts.length,
			valid: valid.length,
			rejected: rejected.length,
		};

		return { valid, rejected, stats };
	}

	/**
	 * Quick purity check for a single target script.
	 *
	 * @param text - Text to scan.
	 * @param script - The only concrete script that may appear.
	 * @returns True when every detected script is `script`, 'common' or
	 *   'inherited'.
	 */
	export function isValidForScript(text: string, script: string): boolean {
		return isScriptPure(text, [script, 'common', 'inherited']);
	}

	/**
	 * Produce a per-character breakdown of a text for debugging.
	 *
	 * @param text - Text to analyze.
	 * @returns
	 *  - `length`: length in UTF-16 code units (`text.length`).
	 *  - `graphemes`: number of grapheme clusters per Intl.Segmenter ('ar' locale).
	 *  - `scripts`: every classification seen, including 'common', 'inherited'
	 *    and 'unknown', in order of first appearance.
	 *  - `diacritics`: each character that is in KASHMIRI_DIACRITICS or in
	 *    U+064B-U+065F, in order, duplicates kept.
	 *  - `codePoints`: one entry per code point with its `U+XXXX` label and script.
	 */
	export function analyzeText(text: string): {
		length: number;
		graphemes: number;
		scripts: string[];
		diacritics: string[];
		codePoints: { char: string; code: string; script: string }[];
	} {
		const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
		const graphemes = [...segmenter.segment(text)];
		const scripts = new Set<string>();
		const diacritics: string[] = [];
		const codePoints: { char: string; code: string; script: string }[] = [];

		for (const char of text) {
			// Iterating a string yields whole code points, so this is never
			// undefined in practice; the guard keeps the arithmetic below typed.
			const cp = char.codePointAt(0);
			if (cp === undefined) continue;

			const script = detectCharacterScript(char);
			scripts.add(script);

			// NOTE(review): KASHMIRI_DIACRITICS also lists letters (e.g. U+06CC),
			// so those are reported as diacritics here too - confirm intended.
			if (isKashmiriDiacritic(char) || (cp >= 0x064B && cp <= 0x065F)) {
				diacritics.push(char);
			}

			codePoints.push({
				char,
				code: `U+${cp.toString(16).toUpperCase().padStart(4, '0')}`,
				script,
			});
		}

		return {
			length: text.length,
			graphemes: graphemes.length,
			scripts: [...scripts],
			diacritics,
			codePoints,
		};
	}