Spaces:

Omarrran
/

OCR_DATASET_MAKER

Running

File size: 6,116 Bytes

24a732c

/**
 * Text Parser - UTF-8 parsing with multiple segmentation modes
 */

import { readFile } from 'fs/promises';
import type { SegmentationMode, InputConfig } from './config.js';

/**
 * Result of parsing and segmenting an input text file.
 */
export interface ParsedText {
    /** Segments that survived length filtering (shuffled when requested). */
    segments: string[];
    /** Sum of the grapheme-cluster counts of all entries in `segments`. */
    totalChars: number;
    /** Distinct segment strings found in `segments`. */
    uniqueTokens: Set<string>;
    /** File contents exactly as read, before any segmentation. */
    rawText: string;
}

/**
 * Break text into user-perceived characters (grapheme clusters) so that
 * combining marks stay attached to their base character.
 *
 * Every cluster is trimmed, and clusters that are empty after trimming
 * (i.e. pure whitespace) are dropped.
 *
 * @param text - Text to split.
 * @returns The non-blank grapheme clusters in their original order.
 */
function segmentByCharacter(text: string): string[] {
    const splitter = new Intl.Segmenter('ar', { granularity: 'grapheme' });

    return Array.from(splitter.segment(text), piece => piece.segment.trim())
        .filter(cluster => cluster.length > 0);
}

/**
 * Segment text into words.
 *
 * The text is split on runs of whitespace and common punctuation, both
 * Latin and Arabic. The ASCII question mark and typographic quotation
 * marks are treated as separators alongside their Arabic/straight
 * counterparts, so "you?" yields "you" just as "!" and "؟" already did.
 *
 * @param text - Text to split.
 * @returns The non-empty words in their original order.
 */
function segmentByWord(text: string): string[] {
    // Separators:
    //   \s                          any whitespace
    //   \u060C \u061B \u061F        Arabic comma, semicolon, question mark
    //   ? ! , . ; :                 Latin punctuation
    //   " ' \u201C \u201D \u2018 \u2019   straight and typographic quotes
    //   ( ) [ ] { }                 brackets
    const separators = /[\s\u060C\u061B\u061F?!,.;:"'\u201C\u201D\u2018\u2019()\[\]{}]+/;

    return text.split(separators).filter(word => word.trim().length > 0);
}

/**
 * Segment text into word n-grams: every run of `ngramSize` consecutive
 * words, joined with a single space.
 *
 * @param text - Text to split into words and then into n-grams.
 * @param ngramSize - Number of words per n-gram. Values below 1 (and NaN)
 *   cannot form a meaningful window and produce an empty result.
 * @returns The n-grams in order of their starting word; empty when the
 *   text has fewer than `ngramSize` words.
 */
function segmentByNgram(text: string, ngramSize: number): string[] {
    // A window of fewer than one word would make slice() return empty
    // arrays, which previously surfaced as a list of empty strings.
    if (!(ngramSize >= 1)) {
        return [];
    }

    const words = segmentByWord(text);
    const ngrams: string[] = [];

    for (let start = 0; start <= words.length - ngramSize; start++) {
        ngrams.push(words.slice(start, start + ngramSize).join(' '));
    }

    return ngrams;
}

/**
 * Segment text into sentences.
 *
 * Splits on runs of sentence-ending punctuation: Latin `.`, `?`, `!`,
 * the Arabic question mark (U+061F) and the Arabic full stop (U+06D4).
 * The terminators themselves are not kept; each sentence is trimmed and
 * blank results are dropped.
 *
 * @param text - Text to split.
 * @returns The non-empty, trimmed sentences in their original order.
 */
function segmentBySentence(text: string): string[] {
    const terminators = /[.?!\u061F\u06D4]+/;

    return text
        .split(terminators)
        .map(sentence => sentence.trim())
        .filter(sentence => sentence.length > 0);
}

/**
 * Segment text into lines.
 *
 * Lines are separated by LF or CRLF. Each line is trimmed and lines that
 * are empty after trimming are discarded.
 *
 * @param text - Text to split.
 * @returns The non-blank, trimmed lines in their original order.
 */
function segmentByLine(text: string): string[] {
    const kept: string[] = [];

    for (const rawLine of text.split(/\r?\n/)) {
        const stripped = rawLine.trim();
        if (stripped.length > 0) {
            kept.push(stripped);
        }
    }

    return kept;
}

/**
 * Dispatch to the segmenter that matches the requested mode.
 *
 * @param text - Text to segment.
 * @param mode - Which segmentation strategy to use.
 * @param ngramSize - Words per n-gram; only consulted in 'ngram' mode.
 * @returns The segments produced by the selected strategy. Word
 *   segmentation is used for 'word' and for any mode not listed here.
 */
function applySegmentation(
    text: string,
    mode: SegmentationMode,
    ngramSize: number = 3
): string[] {
    if (mode === 'character') {
        return segmentByCharacter(text);
    }
    if (mode === 'ngram') {
        return segmentByNgram(text, ngramSize);
    }
    if (mode === 'sentence') {
        return segmentBySentence(text);
    }
    if (mode === 'line') {
        return segmentByLine(text);
    }

    // 'word', plus the fallback for anything unrecognised.
    return segmentByWord(text);
}

/**
 * Keep only the segments whose length lies within `[minLength, maxLength]`.
 *
 * Length is measured in grapheme clusters rather than UTF-16 code units,
 * so a base character plus its combining marks counts as one.
 *
 * @param segments - Candidate segments.
 * @param minLength - Smallest accepted grapheme count (inclusive).
 * @param maxLength - Largest accepted grapheme count (inclusive).
 * @returns A new array with the segments that satisfy both bounds.
 */
function filterByLength(
    segments: string[],
    minLength: number,
    maxLength: number
): string[] {
    // One segmenter serves the whole pass; constructing it per segment
    // is needless work on large inputs.
    const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });

    return segments.filter(s => {
        const len = [...segmenter.segment(s)].length;
        return len >= minLength && len <= maxLength;
    });
}

/**
 * Return a shuffled copy of `array` using the Fisher-Yates algorithm.
 *
 * @param array - Items to shuffle; the input array is not modified.
 * @param seed - Optional seed, expected to be an integer. When given, a
 *   linear congruential generator replaces `Math.random`, so the same
 *   seed always yields the same order. Negative seeds are supported; a
 *   non-finite seed (NaN, ±Infinity) is treated as 0.
 * @returns A new array holding the same items in shuffled order.
 */
function shuffleArray<T>(array: T[], seed?: number): T[] {
    const result = [...array];
    const MODULUS = 2147483648; // 2^31

    let random: () => number = Math.random;
    if (seed !== undefined) {
        let state = Number.isFinite(seed) ? seed : 0;
        random = () => {
            // The product can exceed 2^53 and lose low bits. The formula is
            // kept as-is so existing non-negative seeds reproduce the same
            // order they always have.
            state = (state * 1103515245 + 12345) % MODULUS;
            // `%` keeps the sign of the dividend, so a negative seed would
            // give a negative value here and an out-of-range swap index
            // below. Fold it back into [0, MODULUS).
            if (state < 0) {
                state += MODULUS;
            }
            return state / MODULUS;
        };
    }

    for (let i = result.length - 1; i > 0; i--) {
        const j = Math.floor(random() * (i + 1));
        [result[i], result[j]] = [result[j], result[i]];
    }

    return result;
}

/**
 * Read a text file and segment it according to the given configuration.
 *
 * Steps, in order: read the file, segment it, drop segments outside the
 * configured length range, optionally shuffle, then compute summary data.
 *
 * @param config - Input settings. Fields read here: `file` (path),
 *   `encoding` (defaults to 'utf-8'), `segmentation` (mode),
 *   `ngram_size` (defaults to 3), `min_length` (defaults to 1),
 *   `max_length` (defaults to 50) and `shuffle`.
 * @param seed - Optional seed forwarded to the shuffle when
 *   `config.shuffle` is set.
 * @returns The segments, their total grapheme count, the set of distinct
 *   segments and the raw file contents.
 */
export async function parseTextFile(config: InputConfig, seed?: number): Promise<ParsedText> {
    const encoding = (config.encoding || 'utf-8') as BufferEncoding;
    const rawText = await readFile(config.file, { encoding });

    // Apply segmentation
    let segments = applySegmentation(
        rawText,
        config.segmentation,
        config.ngram_size || 3
    );

    // Filter by length (measured in grapheme clusters)
    const minLen = config.min_length ?? 1;
    const maxLen = config.max_length ?? 50;
    segments = filterByLength(segments, minLen, maxLen);

    // Shuffle if requested
    if (config.shuffle) {
        segments = shuffleArray(segments, seed);
    }

    // Count unique tokens
    const uniqueTokens = new Set(segments);

    // Total character count. A single segmenter is reused for every
    // segment instead of constructing one per segment.
    const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
    let totalChars = 0;
    for (const s of segments) {
        totalChars += [...segmenter.segment(s)].length;
    }

    return {
        segments,
        totalChars,
        uniqueTokens,
        rawText,
    };
}

/**
 * Draw `targetSize` samples from the parsed segments.
 *
 * Segments are served in shuffled order. When every segment has been
 * used and more samples are still needed, the segments are reshuffled
 * and served again, so samples repeat once `targetSize` exceeds the
 * number of segments.
 *
 * @param parsed - Parsed text whose `segments` are sampled.
 * @param targetSize - Number of samples to return.
 * @param seed - Optional seed. When given (including 0), every shuffle is
 *   seeded, so the output is reproducible.
 * @returns An array of `targetSize` segments.
 * @throws Error if `parsed.segments` is empty.
 */
export function getSamples(parsed: ParsedText, targetSize: number, seed?: number): string[] {
    if (parsed.segments.length === 0) {
        throw new Error('No segments found in parsed text');
    }

    const samples: string[] = [];
    let shuffled = shuffleArray(parsed.segments, seed);
    let index = 0;

    while (samples.length < targetSize) {
        if (index >= shuffled.length) {
            // Reshuffle when we've used all segments. The seed is offset by
            // the number of samples drawn so each pass gets a different but
            // reproducible order. Compare against undefined, not truthiness:
            // a seed of 0 is valid and must stay deterministic.
            shuffled = shuffleArray(
                parsed.segments,
                seed !== undefined ? seed + samples.length : undefined
            );
            index = 0;
        }
        samples.push(shuffled[index]);
        index++;
    }

    return samples;
}

/**
 * Calculate summary statistics for a list of segments.
 *
 * Lengths are measured in grapheme clusters. The statistics are
 * accumulated in one pass, so the function works on arbitrarily large
 * inputs (spreading a huge array into `Math.min`/`Math.max` can exceed
 * the engine's argument limit and throw a RangeError).
 *
 * @param segments - Segments to summarise.
 * @returns `count` (number of segments), `uniqueCount` (distinct
 *   segments), and the average, minimum and maximum grapheme length.
 *   All length fields are 0 when `segments` is empty.
 */
export function getSegmentStats(segments: string[]): {
    count: number;
    uniqueCount: number;
    avgLength: number;
    minLength: number;
    maxLength: number;
} {
    const count = segments.length;
    if (count === 0) {
        return { count: 0, uniqueCount: 0, avgLength: 0, minLength: 0, maxLength: 0 };
    }

    // One segmenter is reused for every segment.
    const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });

    let total = 0;
    let shortest = Number.POSITIVE_INFINITY;
    let longest = Number.NEGATIVE_INFINITY;

    for (const s of segments) {
        const len = [...segmenter.segment(s)].length;
        total += len;
        if (len < shortest) {
            shortest = len;
        }
        if (len > longest) {
            longest = len;
        }
    }

    return {
        count,
        uniqueCount: new Set(segments).size,
        avgLength: total / count,
        minLength: shortest,
        maxLength: longest,
    };
}