Spaces:

Omarrran
/

OCR_DATASET_MAKER

Running

File size: 7,906 Bytes

24a732c

/**
 * Unicode Validator - Script purity and normalization enforcement
 */

import type { UnicodeConfig, NormalizationForm } from './config.js';

/**
 * Code-point ranges used to classify characters by script.
 *
 * Each key is a script label and each value is a list of inclusive
 * `[start, end]` code-point ranges. `detectCharacterScript` walks the entries
 * in declaration order and returns the first label with a matching range, so
 * an earlier entry wins wherever ranges overlap. For example, the Arabic
 * combining marks listed under `inherited` also lie inside the `arabic`
 * U+0600–U+06FF range and are therefore reported as `arabic`.
 *
 * Latin letters have their own `latin` entry. `common` is restricted to
 * script-neutral characters (controls, whitespace, digits, punctuation and
 * symbols); if it covered the Latin letters as well, Latin text embedded in
 * Arabic text would be ignored by script detection instead of being reported
 * as a second script.
 *
 * Code points that fall in none of these ranges are classified as `unknown`.
 */
const SCRIPT_RANGES: Record<string, [number, number][]> = {
    arabic: [
        [0x0600, 0x06FF],   // Arabic
        [0x0750, 0x077F],   // Arabic Supplement
        [0x08A0, 0x08FF],   // Arabic Extended-A
        [0xFB50, 0xFDFF],   // Arabic Presentation Forms-A
        [0xFE70, 0xFEFF],   // Arabic Presentation Forms-B
    ],
    latin: [
        [0x0041, 0x005A],   // Basic Latin uppercase letters A-Z
        [0x0061, 0x007A],   // Basic Latin lowercase letters a-z
        [0x00C0, 0x00D6],   // Latin-1 Supplement letters (before the multiplication sign)
        [0x00D8, 0x00F6],   // Latin-1 Supplement letters (between the multiplication and division signs)
        [0x00F8, 0x00FF],   // Latin-1 Supplement letters (after the division sign)
        [0x0100, 0x024F],   // Latin Extended-A and Latin Extended-B
    ],
    common: [
        [0x0000, 0x0040],   // Controls, space, ASCII punctuation and digits
        [0x005B, 0x0060],   // ASCII punctuation between the upper- and lowercase letters
        [0x007B, 0x007F],   // ASCII punctuation after the lowercase letters, DEL
        [0x00A0, 0x00BF],   // Latin-1 Supplement punctuation and symbols
        [0x00D7, 0x00D7],   // Multiplication sign
        [0x00F7, 0x00F7],   // Division sign
        [0x2000, 0x206F],   // General Punctuation
        [0x3000, 0x303F],   // CJK Symbols and Punctuation
    ],
    inherited: [
        [0x0300, 0x036F],   // Combining Diacritical Marks
        [0x064B, 0x065F],   // Arabic combining marks
        [0x0670, 0x0670],   // Superscript Alef
        [0x06D6, 0x06DC],   // Arabic small high marks
        [0x06DF, 0x06E4],   // Arabic small marks
        [0x06E7, 0x06E8],   // Arabic small marks
        [0x06EA, 0x06ED],   // Arabic small marks
    ],
};

// Kashmiri-specific characters that must be preserved.
//
// Despite the name, the set holds two kinds of code points: the Arabic
// combining marks U+0654–U+065F, and a group of base letters (U+06C6–U+06D3
// with gaps, plus U+0620). `isKashmiriDiacritic` is a plain membership test
// against this set, so it answers `true` for those letters as well.
const KASHMIRI_DIACRITICS = new Set([
    // Combining marks
    '\u0654', // Hamza above
    '\u0655', // Hamza below
    '\u0656', // Subscript alef
    '\u0657', // Inverted damma
    '\u0658', // Mark noon ghunna
    '\u0659', // Zwarakay
    '\u065A', // Vowel sign small v above
    '\u065B', // Vowel sign inverted small v above
    '\u065C', // Vowel sign dot below
    '\u065D', // Reversed damma
    '\u065E', // Fatha with two dots
    '\u065F', // Wavy hamza below
    // Letters
    '\u06C6', // Oe
    '\u06C7', // U
    '\u06C8', // Yu
    '\u06C9', // Kirghiz yu
    '\u06CB', // Ve
    '\u06CC', // Farsi yeh
    '\u06CD', // Yeh with tail
    '\u06CE', // Yeh with small v
    '\u06D0', // E
    '\u06D2', // Yeh barree
    '\u06D3', // Yeh barree with hamza
    '\u0620', // Kashmiri yeh
]);

/**
 * Outcome of validating a single string with `validateText`.
 */
export interface ValidationResult {
    /** True when no errors were recorded, i.e. `errors` is empty. */
    valid: boolean;
    /** The input after the configured Unicode normalization was applied. */
    normalized: string;
    /**
     * Comma-separated list of the scripts detected in the normalized text,
     * or 'unknown' when no script was detected.
     */
    originalScript: string;
    /** True when more than one script was detected in the normalized text. */
    mixedScripts: boolean;
    /** True when the normalized text contains at least one combining mark. */
    containsDiacritics: boolean;
    /** Messages for the problems that make the text invalid. */
    errors: string[];
    /** Non-fatal observations; these do not affect `valid`. */
    warnings: string[];
}

/**
 * Classify a single character by script.
 *
 * Only the first code point of `char` is inspected. The result is the key of
 * the first `SCRIPT_RANGES` entry (in declaration order) that has a range
 * containing that code point.
 *
 * @param char - The character to classify.
 * @returns The matching script label, or 'unknown' when `char` is empty or no
 *          range contains its first code point.
 */
function detectCharacterScript(char: string): string {
    const cp = char.codePointAt(0);
    if (cp === undefined) {
        return 'unknown';
    }

    const hit = Object.entries(SCRIPT_RANGES).find(([, ranges]) =>
        ranges.some(([lo, hi]) => lo <= cp && cp <= hi)
    );

    return hit === undefined ? 'unknown' : hit[0];
}

/**
 * Report whether `char` is one of the entries of `KASHMIRI_DIACRITICS`.
 *
 * @param char - The string to look up; it must match a set entry exactly.
 * @returns True when the set contains `char`.
 */
function isKashmiriDiacritic(char: string): boolean {
    const listed = KASHMIRI_DIACRITICS.has(char);
    return listed;
}

/**
 * Inclusive code-point ranges that count as combining marks.
 *
 * These are the same ranges as the `inherited` entry of `SCRIPT_RANGES`.
 * The Arabic part is deliberately split into sub-ranges: U+06DD (end of
 * ayah), U+06DE (start of rub el hizb), U+06E5 / U+06E6 (small waw / small
 * yeh) and U+06E9 (place of sajdah) sit between the marks but are not
 * combining marks themselves.
 */
const COMBINING_MARK_RANGES: ReadonlyArray<readonly [number, number]> = [
    [0x0300, 0x036F],   // Combining Diacritical Marks
    [0x064B, 0x065F],   // Arabic combining marks
    [0x0670, 0x0670],   // Superscript Alef
    [0x06D6, 0x06DC],   // Arabic small high marks
    [0x06DF, 0x06E4],   // Arabic small marks
    [0x06E7, 0x06E8],   // Arabic small marks
    [0x06EA, 0x06ED],   // Arabic small marks
];

/**
 * Check whether a code point lies in one of `COMBINING_MARK_RANGES`.
 */
function isCombiningMarkCodePoint(cp: number): boolean {
    for (const [start, end] of COMBINING_MARK_RANGES) {
        if (cp >= start && cp <= end) {
            return true;
        }
    }
    return false;
}

/**
 * Check if text contains combining marks.
 *
 * The text is scanned code point by code point and the scan stops at the
 * first combining mark found.
 *
 * @param text - The text to scan; an empty string yields `false`.
 * @returns True when at least one code point is a combining mark.
 */
function containsCombiningMarks(text: string): boolean {
    for (const char of text) {
        const cp = char.codePointAt(0);
        if (cp === undefined) continue;

        if (isCombiningMarkCodePoint(cp)) return true;
    }
    return false;
}

/**
 * Apply the requested Unicode normalization form to `text`.
 *
 * @param text - The text to normalize.
 * @param form - 'NFC' or 'NFD' to normalize; any other value (including
 *               'none') leaves the text untouched.
 * @returns The normalized text, or `text` itself when no normalization applies.
 */
function normalizeText(text: string, form: NormalizationForm): string {
    if (form === 'NFC' || form === 'NFD') {
        return text.normalize(form);
    }
    return text;
}

/**
 * Collect the scripts that occur in `text`.
 *
 * Every code point is classified with `detectCharacterScript`. The neutral
 * labels 'common' and 'inherited' are dropped, and so is 'unknown', which
 * means characters that `SCRIPT_RANGES` does not cover never show up in the
 * result.
 *
 * @param text - The text to inspect.
 * @returns The set of script labels found, in order of first occurrence.
 */
function detectScripts(text: string): Set<string> {
    const ignored = ['common', 'inherited', 'unknown'];
    const labels = Array.from(text, (ch) => detectCharacterScript(ch));

    return new Set<string>(labels.filter((label) => !ignored.includes(label)));
}

/**
 * Check that every script detected in `text` is on the allow-list.
 *
 * Text in which `detectScripts` finds no script at all is considered pure.
 *
 * @param text - The text to inspect.
 * @param allowedScripts - Script labels that are acceptable.
 * @returns False as soon as one detected script is missing from
 *          `allowedScripts`, true otherwise.
 */
function isScriptPure(text: string, allowedScripts: string[]): boolean {
    const found = detectScripts(text);
    return [...found].every((name) => allowedScripts.includes(name));
}

/**
 * Validate text according to Unicode configuration.
 *
 * The text is normalized first; every later check runs on the normalized
 * form. Three checks follow:
 *  - a warning when `preserve_diacritics` is set and no combining mark is found;
 *  - an error when `reject_mixed` is set and more than one script is detected;
 *  - an error when `enforce_script` is set and a detected script is not in the
 *    allow-list (`allowed_scripts`, or the enforced script plus 'common').
 *
 * @param text - The raw text to validate.
 * @param config - Normalization and script-purity settings.
 * @returns The normalized text together with the findings; `valid` is true
 *          only when no error was recorded.
 */
export function validateText(text: string, config: UnicodeConfig): ValidationResult {
    const normalized = normalizeText(text, config.normalization);
    const containsDiacritics = containsCombiningMarks(normalized);

    // One scan of the normalized text feeds every script-related check below.
    const detected = [...detectScripts(normalized)];
    const mixedScripts = detected.length > 1;

    const warnings: string[] = [];
    const errors: string[] = [];

    if (config.preserve_diacritics && !containsDiacritics) {
        warnings.push('Text does not contain any diacritics');
    }

    if (config.reject_mixed && mixedScripts) {
        errors.push(`Mixed scripts detected: ${detected.join(', ')}`);
    }

    const allowedScripts = config.allowed_scripts || [config.enforce_script || 'arabic', 'common'];
    if (config.enforce_script) {
        const disallowed = detected.filter((name) => !allowedScripts.includes(name));
        if (disallowed.length > 0) {
            errors.push(`Disallowed script detected: ${disallowed.join(', ')}`);
        }
    }

    return {
        valid: errors.length === 0,
        normalized,
        originalScript: detected.join(', ') || 'unknown',
        mixedScripts,
        containsDiacritics,
        errors,
        warnings,
    };
}

/**
 * Validate a list of texts and split it into accepted and rejected entries.
 *
 * Accepted entries are returned in their normalized form; rejected entries
 * are returned exactly as they were passed in. Input order is kept in both
 * lists.
 *
 * @param texts - The texts to validate.
 * @param config - Settings forwarded to `validateText` for every entry.
 * @returns The two lists plus counts of total, valid and rejected texts.
 */
export function validateBatch(
    texts: string[],
    config: UnicodeConfig
): { valid: string[]; rejected: string[]; stats: { total: number; valid: number; rejected: number } } {
    const accepted: string[] = [];
    const refused: string[] = [];

    for (const candidate of texts) {
        const { valid: passed, normalized } = validateText(candidate, config);
        if (!passed) {
            refused.push(candidate);
            continue;
        }
        accepted.push(normalized);
    }

    const stats = {
        total: texts.length,
        valid: accepted.length,
        rejected: refused.length,
    };

    return { valid: accepted, rejected: refused, stats };
}

/**
 * Quick purity check of `text` against a single script.
 *
 * The allow-list passed to `isScriptPure` is `script` plus the neutral labels
 * 'common' and 'inherited'.
 *
 * @param text - The text to inspect.
 * @param script - The script label the text is expected to use.
 * @returns True when no script other than the allowed ones is detected.
 */
export function isValidForScript(text: string, script: string): boolean {
    return isScriptPure(text, [script, 'common', 'inherited']);
}

/**
 * Produce a per-character breakdown of `text` for debugging.
 *
 * @param text - The text to analyze.
 * @returns An object with:
 *  - `length`: the string length in UTF-16 code units;
 *  - `graphemes`: the number of grapheme clusters reported by `Intl.Segmenter`;
 *  - `scripts`: every label returned by `detectCharacterScript`, including
 *    'common', 'inherited' and 'unknown', in order of first occurrence;
 *  - `diacritics`: each character that is in `KASHMIRI_DIACRITICS` or in
 *    U+064B–U+065F, in text order (duplicates kept);
 *  - `codePoints`: one entry per code point with its `U+XXXX` notation and script.
 */
export function analyzeText(text: string): {
    length: number;
    graphemes: number;
    scripts: string[];
    diacritics: string[];
    codePoints: { char: string; code: string; script: string }[];
} {
    const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
    const graphemeCount = [...segmenter.segment(text)].length;

    // One record per code point; the remaining fields are derived from this list.
    const codePoints = Array.from(text, (char) => ({
        char,
        code: `U+${char.codePointAt(0)?.toString(16).toUpperCase().padStart(4, '0')}`,
        script: detectCharacterScript(char),
    }));

    const diacritics = codePoints
        .map((entry) => entry.char)
        .filter((char) => {
            const cp = char.codePointAt(0);
            const inArabicMarkRange = cp !== undefined && cp >= 0x064B && cp <= 0x065F;
            return isKashmiriDiacritic(char) || inArabicMarkRange;
        });

    const scripts = [...new Set(codePoints.map((entry) => entry.script))];

    return {
        length: text.length,
        graphemes: graphemeCount,
        scripts,
        diacritics,
        codePoints,
    };
}