// OCR_DATASET_MAKER/src/core/text-parser.ts
// Omarrran - OCR Dataset Generator for HF Spaces (24a732c)
/**
* Text Parser - UTF-8 parsing with multiple segmentation modes
*/
import { readFile } from 'fs/promises';
import type { SegmentationMode, InputConfig } from './config.js';
/**
 * Result of reading an input text and cutting it into segments,
 * as populated by `parseTextFile`.
 */
export interface ParsedText {
  /** Segments left after segmentation, length filtering and optional shuffling. */
  segments: string[];
  /** Sum of the grapheme-cluster counts of every entry in `segments`. */
  totalChars: number;
  /** Distinct segment strings (a set built from `segments`). */
  uniqueTokens: Set<string>;
  /** File contents as read, before any segmentation. */
  rawText: string;
}
/**
 * Break text into user-perceived characters (grapheme clusters), so a base
 * letter and its combining marks stay together in a single segment.
 *
 * Every cluster is trimmed; clusters that are empty after trimming
 * (i.e. pure whitespace) are discarded.
 *
 * @param text - Text to split.
 * @returns The non-blank grapheme clusters in their original order.
 */
function segmentByCharacter(text: string): string[] {
  const splitter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  return [...splitter.segment(text)]
    .map((piece) => piece.segment.trim())
    .filter((cluster) => cluster.length > 0);
}
/**
 * Split text into words.
 *
 * A run of one or more delimiter characters separates two words. Delimiters
 * are whitespace plus the punctuation marks
 * `،` `؛` `؟` `?` `!` `,` `.` `۔` `;` `:` `"` `'` `(` `)` `[` `]` `{` `}`.
 * Both the ASCII and the Arabic question mark, and both the ASCII and the
 * Urdu full stop (U+06D4), are treated as delimiters, so a word never keeps
 * trailing sentence punctuation.
 *
 * @param text - Text to split.
 * @returns The non-empty words in their original order.
 */
function segmentByWord(text: string): string[] {
  // Splitting on a leading/trailing delimiter run yields empty strings; drop them.
  // The pieces cannot contain whitespace (it is a delimiter), so no trim is needed.
  return text
    .split(/[\s،؛؟?!,.۔;:"'()\[\]{}]+/)
    .filter((word) => word.length > 0);
}
/**
 * Build word n-grams: every run of `ngramSize` consecutive words (as produced
 * by `segmentByWord`), joined with single spaces.
 *
 * @param text - Text to split.
 * @param ngramSize - Number of words per n-gram. Fractional values are
 *   truncated towards zero; a size below 1 (or NaN) yields no n-grams.
 * @returns The n-grams in sliding-window order; empty when the text has fewer
 *   than `ngramSize` words.
 */
function segmentByNgram(text: string, ngramSize: number): string[] {
  const size = Math.trunc(ngramSize);
  // Written as a negated comparison so that NaN is rejected as well.
  // Without this guard a size of 0 or less would emit runs of empty strings.
  if (!(size >= 1)) {
    return [];
  }

  const words = segmentByWord(text);
  const ngrams: string[] = [];
  for (let start = 0; start + size <= words.length; start++) {
    ngrams.push(words.slice(start, start + size).join(' '));
  }
  return ngrams;
}
/**
 * Split text into sentences.
 *
 * A run of one or more sentence terminators ends a sentence. Terminators are
 * `.`, `?`, `!`, the Arabic question mark `؟` and the Urdu full stop `۔`
 * (U+06D4). The terminators themselves are not kept.
 *
 * @param text - Text to split.
 * @returns The trimmed, non-empty sentences in their original order.
 */
function segmentBySentence(text: string): string[] {
  return text
    .split(/[.?!؟۔]+/)
    .map((sentence) => sentence.trim())
    .filter((sentence) => sentence.length > 0);
}
/**
 * Split text into lines on `\n` or `\r\n`.
 *
 * Each line is trimmed, and lines that are empty after trimming are dropped.
 *
 * @param text - Text to split.
 * @returns The trimmed, non-empty lines in their original order.
 */
function segmentByLine(text: string): string[] {
  const kept: string[] = [];
  for (const rawLine of text.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length > 0) {
      kept.push(line);
    }
  }
  return kept;
}
/**
 * Dispatch to the segmenter that matches `mode`.
 *
 * @param text - Text to split.
 * @param mode - Which segmentation strategy to use.
 * @param ngramSize - Words per n-gram; only used when `mode` is `'ngram'`.
 *   Defaults to 3.
 * @returns The segments produced by the selected strategy. `'word'` and any
 *   unrecognised mode both use word segmentation.
 */
function applySegmentation(
  text: string,
  mode: SegmentationMode,
  ngramSize: number = 3
): string[] {
  if (mode === 'character') {
    return segmentByCharacter(text);
  }
  if (mode === 'ngram') {
    return segmentByNgram(text, ngramSize);
  }
  if (mode === 'sentence') {
    return segmentBySentence(text);
  }
  if (mode === 'line') {
    return segmentByLine(text);
  }
  // 'word', and the fallback for anything else.
  return segmentByWord(text);
}
/**
 * Keep only the segments whose length lies within `[minLength, maxLength]`.
 *
 * Length is counted in grapheme clusters, not UTF-16 code units, so a letter
 * with combining marks counts as one.
 *
 * @param segments - Candidate segments.
 * @param minLength - Smallest accepted grapheme count (inclusive).
 * @param maxLength - Largest accepted grapheme count (inclusive).
 * @returns A new array with the accepted segments in their original order.
 */
function filterByLength(
  segments: string[],
  minLength: number,
  maxLength: number
): string[] {
  // One segmenter serves every element; building one per element is wasted work.
  const splitter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  const graphemeCount = (value: string): number => [...splitter.segment(value)].length;

  return segments.filter((candidate) => {
    const size = graphemeCount(candidate);
    return size >= minLength && size <= maxLength;
  });
}
/**
 * Return a shuffled copy of `array` (Fisher-Yates); the input is not modified.
 *
 * @param array - Items to shuffle.
 * @param seed - Optional seed. When given, a small linear congruential
 *   generator replaces `Math.random`, so the same seed and input always give
 *   the same order. Negative seeds are supported. When omitted, `Math.random`
 *   is used.
 * @returns A new array holding the same items in shuffled order.
 * @throws RangeError if `seed` is NaN or infinite.
 */
function shuffleArray<T>(array: T[], seed?: number): T[] {
  if (seed !== undefined && !Number.isFinite(seed)) {
    // A non-finite state would produce NaN indices and write `undefined` into the result.
    throw new RangeError(`shuffleArray: seed must be a finite number, got ${seed}`);
  }

  const MODULUS = 2147483648; // 2^31
  let random: () => number = Math.random;
  if (seed !== undefined) {
    let state = seed;
    random = () => {
      state = (state * 1103515245 + 12345) % MODULUS;
      // `%` keeps the sign of the dividend, so a negative seed gives a negative
      // remainder. Fold it back into [0, MODULUS) so the generator stays in [0, 1).
      // Non-negative seeds never reach this branch, so their sequences are unchanged.
      if (state < 0) {
        state = (state + MODULUS) % MODULUS;
      }
      return state / MODULUS;
    };
  }

  const result = [...array];
  for (let i = result.length - 1; i > 0; i--) {
    const j = Math.floor(random() * (i + 1));
    [result[i], result[j]] = [result[j], result[i]];
  }
  return result;
}
/**
 * Read a text file and cut it into segments as described by `config`.
 *
 * Steps, in order: read `config.file` (encoding `config.encoding`, falling
 * back to `'utf-8'`), segment by `config.segmentation` (n-gram size
 * `config.ngram_size`, falling back to 3 when unset or 0), keep segments whose
 * grapheme count is within `config.min_length` (default 1) and
 * `config.max_length` (default 50), then shuffle if `config.shuffle` is truthy.
 *
 * @param config - Input settings.
 * @param seed - Optional seed forwarded to the shuffle for a reproducible order.
 * @returns The segments, their total grapheme count, the set of distinct
 *   segments and the raw file text.
 */
export async function parseTextFile(config: InputConfig, seed?: number): Promise<ParsedText> {
  const encoding = (config.encoding || 'utf-8') as BufferEncoding;
  const rawText = await readFile(config.file, { encoding });

  const candidates = applySegmentation(
    rawText,
    config.segmentation,
    config.ngram_size || 3
  );
  const inRange = filterByLength(
    candidates,
    config.min_length ?? 1,
    config.max_length ?? 50
  );
  const segments = config.shuffle ? shuffleArray(inRange, seed) : inRange;

  // Count graphemes with a single segmenter instead of building one per segment.
  const splitter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  let totalChars = 0;
  for (const segment of segments) {
    totalChars += [...splitter.segment(segment)].length;
  }

  return {
    segments,
    totalChars,
    uniqueTokens: new Set(segments),
    rawText,
  };
}
/**
 * Draw `targetSize` samples from the parsed segments.
 *
 * The segments are shuffled and consumed in order; once every segment has
 * been used they are reshuffled and consumed again, so segments repeat when
 * `targetSize` exceeds the number of segments.
 *
 * @param parsed - Parsed text whose `segments` are sampled.
 * @param targetSize - Number of samples to return; a value of 0 or less
 *   returns an empty array.
 * @param seed - Optional seed. When given (including `0`), the first shuffle
 *   uses `seed` and each reshuffle uses `seed` plus the number of samples
 *   drawn so far, so the whole result is reproducible.
 * @returns An array of `targetSize` segments.
 * @throws Error if `parsed.segments` is empty.
 */
export function getSamples(parsed: ParsedText, targetSize: number, seed?: number): string[] {
  if (parsed.segments.length === 0) {
    throw new Error('No segments found in parsed text');
  }

  const samples: string[] = [];
  let shuffled = shuffleArray(parsed.segments, seed);
  let index = 0;

  while (samples.length < targetSize) {
    if (index >= shuffled.length) {
      // Every segment has been used: reshuffle with a derived seed.
      // Compare against undefined, not truthiness, so that seed 0 stays seeded.
      const reseed = seed !== undefined ? seed + samples.length : undefined;
      shuffled = shuffleArray(parsed.segments, reseed);
      index = 0;
    }
    samples.push(shuffled[index]);
    index++;
  }

  return samples;
}
/**
 * Summarise a list of segments.
 *
 * Lengths are counted in grapheme clusters. The statistics are gathered in a
 * single pass rather than with `Math.min(...lengths)` / `Math.max(...lengths)`,
 * which can throw a `RangeError` on very large arrays because every element
 * becomes a function argument.
 *
 * @param segments - Segments to analyse.
 * @returns `count` (number of segments), `uniqueCount` (number of distinct
 *   segments), and the average, minimum and maximum grapheme length. All
 *   values are 0 for an empty list.
 */
export function getSegmentStats(segments: string[]): {
  count: number;
  uniqueCount: number;
  avgLength: number;
  minLength: number;
  maxLength: number;
} {
  const splitter = new Intl.Segmenter('ar', { granularity: 'grapheme' });

  let totalLength = 0;
  let shortest = Infinity;
  let longest = 0;
  for (const segment of segments) {
    const length = [...splitter.segment(segment)].length;
    totalLength += length;
    if (length < shortest) {
      shortest = length;
    }
    if (length > longest) {
      longest = length;
    }
  }

  const count = segments.length;
  return {
    count,
    uniqueCount: new Set(segments).size,
    avgLength: count > 0 ? totalLength / count : 0,
    // `shortest` is still Infinity when there were no segments; report 0 instead.
    minLength: count > 0 ? shortest : 0,
    maxLength: count > 0 ? longest : 0,
  };
}