Spaces:

Omarrran
/

OCR_DATASET_MAKER

Sleeping

App Files Files Community

OCR_DATASET_MAKER / src /core /text-parser.ts

Omarrran

OCR Dataset Generator for HF Spaces

24a732c about 2 months ago

raw

history blame contribute delete

6.12 kB

	/**
	* Text Parser - UTF-8 parsing with multiple segmentation modes
	*/

	import { readFile } from 'fs/promises';
	import type { SegmentationMode, InputConfig } from './config.js';

	/**
	 * Result of reading and segmenting an input text file, as returned by
	 * `parseTextFile`.
	 */
	export interface ParsedText {
	/** Segments that remain after segmentation, length filtering and optional shuffling. */
	segments: string[];
	/** Sum of the grapheme-cluster counts of all entries in `segments`. */
	totalChars: number;
	/** The distinct values found in `segments`. */
	uniqueTokens: Set<string>;
	/** The file contents exactly as decoded from disk, before segmentation. */
	rawText: string;
	}

	/**
	 * Break text into user-perceived characters (grapheme clusters) so that
	 * combining marks stay attached to their base character.
	 *
	 * Every cluster is trimmed, and clusters that are empty after trimming
	 * (whitespace) are left out of the result.
	 *
	 * @param text - Text to split.
	 * @returns The non-blank grapheme clusters in their original order.
	 */
	function segmentByCharacter(text: string): string[] {
	  const graphemeSegmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });

	  return [...graphemeSegmenter.segment(text)]
	    .map((piece) => piece.segment.trim())
	    .filter((cluster) => cluster.length > 0);
	}

	/**
	 * Split text into words.
	 *
	 * Words are delimited by runs of whitespace and/or punctuation. The
	 * punctuation set covers the ASCII marks `! ? , . ; : " ' ( ) [ ] { }`,
	 * the Arabic comma, semicolon and question mark, the Urdu full stop, and
	 * typographic single/double quotes. Delimiters are discarded and empty
	 * pieces are dropped.
	 *
	 * @param text - Text to split.
	 * @returns The words in their original order, without delimiters.
	 */
	function segmentByWord(text: string): string[] {
	  // U+060C Arabic comma, U+061B Arabic semicolon, U+061F Arabic question mark,
	  // U+06D4 Urdu full stop, U+2018/U+2019/U+201C/U+201D typographic quotes.
	  const delimiters =
	    /[\s\u060C\u061B\u061F\u06D4!?,.;:"'\u2018\u2019\u201C\u201D()\[\]{}]+/;

	  // Whitespace is itself a delimiter, so a piece is either a real word or
	  // the empty string produced by a leading/trailing delimiter run.
	  return text.split(delimiters).filter((word) => word.length > 0);
	}

	/**
	 * Build word n-grams: every run of `ngramSize` consecutive words, joined by
	 * a single space.
	 *
	 * @param text - Text to split into words (via `segmentByWord`).
	 * @param ngramSize - Number of words per n-gram. A fractional value is
	 *   truncated towards zero.
	 * @returns The n-grams in order of their first word. The result is empty
	 *   when the text has fewer than `ngramSize` words, or when `ngramSize` is
	 *   below 1 or NaN (a window of zero or fewer words would otherwise emit
	 *   empty strings).
	 */
	function segmentByNgram(text: string, ngramSize: number): string[] {
	  const size = Math.trunc(ngramSize);
	  // Written as a negated comparison so that NaN is rejected as well.
	  if (!(size >= 1)) {
	    return [];
	  }

	  const words = segmentByWord(text);
	  const ngrams: string[] = [];

	  for (let start = 0; start + size <= words.length; start++) {
	    ngrams.push(words.slice(start, start + size).join(' '));
	  }

	  return ngrams;
	}

	/**
	 * Split text into sentences.
	 *
	 * A sentence ends at a run of one or more terminators: `.`, `?`, `!`, the
	 * Arabic question mark (U+061F) or the Urdu full stop (U+06D4). Terminators
	 * are discarded, each sentence is trimmed, and blank sentences are dropped.
	 *
	 * @param text - Text to split.
	 * @returns The trimmed, non-empty sentences in their original order.
	 */
	function segmentBySentence(text: string): string[] {
	  const terminators = /[.?!\u061F\u06D4]+/;

	  return text
	    .split(terminators)
	    .map((sentence) => sentence.trim())
	    .filter((sentence) => sentence.length > 0);
	}

	/**
	 * Split text into lines on LF or CRLF line breaks.
	 *
	 * Each line is trimmed and blank lines are skipped.
	 *
	 * @param text - Text to split.
	 * @returns The trimmed, non-empty lines in their original order.
	 */
	function segmentByLine(text: string): string[] {
	  const kept: string[] = [];

	  for (const rawLine of text.split(/\r?\n/)) {
	    const line = rawLine.trim();
	    if (line.length > 0) {
	      kept.push(line);
	    }
	  }

	  return kept;
	}

	/**
	 * Dispatch to the segmenter that matches `mode`.
	 *
	 * @param text - Text to segment.
	 * @param mode - Segmentation strategy. Any value other than `'character'`,
	 *   `'ngram'`, `'sentence'` or `'line'` is handled as `'word'`.
	 * @param ngramSize - Words per n-gram; only used in `'ngram'` mode.
	 * @returns The segments produced by the selected strategy.
	 */
	function applySegmentation(
	  text: string,
	  mode: SegmentationMode,
	  ngramSize: number = 3
	): string[] {
	  if (mode === 'character') {
	    return segmentByCharacter(text);
	  }
	  if (mode === 'ngram') {
	    return segmentByNgram(text, ngramSize);
	  }
	  if (mode === 'sentence') {
	    return segmentBySentence(text);
	  }
	  if (mode === 'line') {
	    return segmentByLine(text);
	  }

	  // 'word', plus the fallback for any unrecognised mode.
	  return segmentByWord(text);
	}

	/**
	 * Keep only the segments whose length lies within `[minLength, maxLength]`.
	 *
	 * Length is measured in grapheme clusters, so a base character together
	 * with its combining marks counts as one.
	 *
	 * @param segments - Candidate segments; the array is not modified.
	 * @param minLength - Smallest accepted length (inclusive).
	 * @param maxLength - Largest accepted length (inclusive).
	 * @returns A new array with the accepted segments in their original order.
	 */
	function filterByLength(
	  segments: string[],
	  minLength: number,
	  maxLength: number
	): string[] {
	  // One segmenter serves every segment; building one per element is wasted work.
	  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });

	  return segments.filter((s) => {
	    const len = [...segmenter.segment(s)].length;
	    return len >= minLength && len <= maxLength;
	  });
	}

	/**
	 * Return a shuffled copy of `array` using the Fisher-Yates algorithm.
	 *
	 * @param array - Items to shuffle; the input array is not modified.
	 * @param seed - Optional seed. When given, the order is produced by a
	 *   simple linear-congruential style generator and is reproducible for the
	 *   same seed and input length. When omitted, `Math.random` is used.
	 *   Negative seeds are supported; a non-finite seed (NaN, ±Infinity) is
	 *   treated as 0.
	 * @returns A new array holding the same items in shuffled order.
	 */
	function shuffleArray<T>(array: T[], seed?: number): T[] {
	  const result = [...array];
	  const MODULUS = 2147483648; // 2^31

	  let random: () => number = Math.random;
	  if (seed !== undefined) {
	    let state = Number.isFinite(seed) ? seed : 0;
	    random = () => {
	      // The product can exceed 2^53, so this is evaluated in rounded
	      // double arithmetic. That is still deterministic, and the formula is
	      // kept as-is so existing non-negative seeds give the same order.
	      state = (state * 1103515245 + 12345) % MODULUS;
	      // `%` takes the sign of the dividend. A negative state would yield a
	      // negative index below and swap `undefined` into the result.
	      if (state < 0) {
	        state = (state + MODULUS) % MODULUS;
	      }
	      return state / MODULUS;
	    };
	  }

	  for (let i = result.length - 1; i > 0; i--) {
	    const j = Math.floor(random() * (i + 1));
	    [result[i], result[j]] = [result[j], result[i]];
	  }

	  return result;
	}

	/**
	 * Read a text file and turn it into segments according to `config`.
	 *
	 * Steps, in order: decode the file, segment it, drop segments whose
	 * grapheme length is outside `[min_length, max_length]`, optionally
	 * shuffle, then compute summary values.
	 *
	 * Defaults applied here: encoding `'utf-8'`, n-gram size 3 (also used when
	 * `ngram_size` is 0), `min_length` 1 and `max_length` 50.
	 *
	 * @param config - Input settings; this function reads `file`, `encoding`,
	 *   `segmentation`, `ngram_size`, `min_length`, `max_length` and `shuffle`.
	 * @param seed - Optional seed forwarded to the shuffle when
	 *   `config.shuffle` is set.
	 * @returns The segments, their total grapheme count, the set of distinct
	 *   segments and the raw file text. The promise rejects if the file cannot
	 *   be read.
	 */
	export async function parseTextFile(config: InputConfig, seed?: number): Promise<ParsedText> {
	  // `||` rather than `??`: an empty encoding string should also fall back.
	  const encoding = (config.encoding || 'utf-8') as BufferEncoding;
	  const rawText = await readFile(config.file, { encoding });

	  // Apply segmentation
	  let segments = applySegmentation(
	    rawText,
	    config.segmentation,
	    config.ngram_size || 3
	  );

	  // Filter by length
	  const minLen = config.min_length ?? 1;
	  const maxLen = config.max_length ?? 50;
	  segments = filterByLength(segments, minLen, maxLen);

	  // Shuffle if requested
	  if (config.shuffle) {
	    segments = shuffleArray(segments, seed);
	  }

	  // Count unique tokens
	  const uniqueTokens = new Set(segments);

	  // Total character count, in grapheme clusters; one segmenter is reused
	  // for every segment.
	  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
	  const totalChars = segments.reduce(
	    (sum, s) => sum + [...segmenter.segment(s)].length,
	    0
	  );

	  return {
	    segments,
	    totalChars,
	    uniqueTokens,
	    rawText,
	  };
	}

	/**
	 * Draw `targetSize` samples from the parsed segments.
	 *
	 * Segments are taken from a shuffled copy. Once every segment has been
	 * used, the segments are reshuffled and drawing continues, so segments
	 * repeat when `targetSize` exceeds the number of segments.
	 *
	 * @param parsed - Parsed text whose `segments` are sampled.
	 * @param targetSize - Number of samples to return.
	 * @param seed - Optional seed. When given (including `0`), every shuffle
	 *   is seeded, so the whole result is reproducible; each reshuffle uses
	 *   `seed` plus the number of samples drawn so far.
	 * @returns An array of `targetSize` samples (empty when `targetSize <= 0`).
	 * @throws Error if `parsed.segments` is empty.
	 */
	export function getSamples(parsed: ParsedText, targetSize: number, seed?: number): string[] {
	  if (parsed.segments.length === 0) {
	    throw new Error('No segments found in parsed text');
	  }

	  const samples: string[] = [];
	  let shuffled = shuffleArray(parsed.segments, seed);
	  let index = 0;

	  while (samples.length < targetSize) {
	    if (index >= shuffled.length) {
	      // Reshuffle when we've used all segments. Compare against undefined
	      // rather than testing truthiness so that seed 0 stays deterministic.
	      const nextSeed = seed !== undefined ? seed + samples.length : undefined;
	      shuffled = shuffleArray(parsed.segments, nextSeed);
	      index = 0;
	    }
	    samples.push(shuffled[index]);
	    index++;
	  }

	  return samples;
	}

	/**
	 * Compute summary statistics for a list of segments.
	 *
	 * Lengths are measured in grapheme clusters. The statistics are gathered
	 * in a single pass, so arbitrarily large inputs are supported (spreading a
	 * very large array into `Math.min`/`Math.max` can exceed the engine's
	 * argument limit).
	 *
	 * @param segments - Segments to analyse; the array is not modified.
	 * @returns `count` (number of segments), `uniqueCount` (number of distinct
	 *   segments), and the average, minimum and maximum segment length. All
	 *   values are 0 for an empty input.
	 */
	export function getSegmentStats(segments: string[]): {
	  count: number;
	  uniqueCount: number;
	  avgLength: number;
	  minLength: number;
	  maxLength: number;
	} {
	  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
	  const count = segments.length;

	  let totalLength = 0;
	  let shortest = Infinity;
	  let longest = 0;

	  for (const s of segments) {
	    const len = [...segmenter.segment(s)].length;
	    totalLength += len;
	    if (len < shortest) {
	      shortest = len;
	    }
	    if (len > longest) {
	      longest = len;
	    }
	  }

	  return {
	    count,
	    uniqueCount: new Set(segments).size,
	    avgLength: count > 0 ? totalLength / count : 0,
	    minLength: count > 0 ? shortest : 0,
	    maxLength: longest,
	  };
	}