/**
 * Text Parser - UTF-8 parsing with multiple segmentation modes.
 *
 * Reads a text file, splits it into segments (characters, words, n-grams,
 * sentences or lines), filters the segments by grapheme length, and offers
 * helpers for sampling and for basic statistics.
 */

import { readFile } from 'fs/promises';
import type { SegmentationMode, InputConfig } from './config.js';

/** Result of parsing and segmenting a text file. */
export interface ParsedText {
  /** Segments after segmentation, length filtering and optional shuffling. */
  segments: string[];
  /** Sum of the grapheme counts of all entries in `segments`. */
  totalChars: number;
  /** Distinct values found in `segments`. */
  uniqueTokens: Set<string>;
  /** File content exactly as read, before segmentation. */
  rawText: string;
}

/**
 * Shared grapheme segmenter. Built once at module level because constructing
 * an `Intl.Segmenter` per segment (inside filter/map/reduce callbacks) is
 * needless repeated work.
 */
const GRAPHEME_SEGMENTER = new Intl.Segmenter('ar', { granularity: 'grapheme' });

/** Characters that separate words: whitespace plus Arabic and Latin punctuation. */
const WORD_SEPARATORS = /[\s،؛؟!,.;:""''()\[\]{}]+/;

/** Characters treated as sentence terminators. */
const SENTENCE_TERMINATORS = /[.؟!۔]+/;

/** Line break: LF, optionally preceded by CR. */
const LINE_BREAK = /\r?\n/;

/** Parameters of the linear congruential generator used for seeded shuffles. */
const LCG_MULTIPLIER = 1103515245;
const LCG_INCREMENT = 12345;
const LCG_MODULUS = 2147483648;

/**
 * Count the grapheme clusters in a string, so that a base character together
 * with its combining marks counts as one.
 */
function countGraphemes(text: string): number {
  let count = 0;
  for (const _grapheme of GRAPHEME_SEGMENTER.segment(text)) {
    count++;
  }
  return count;
}

/**
 * Segment text into characters, preserving combining marks with base characters.
 *
 * Each grapheme cluster is trimmed; clusters that are empty after trimming
 * (i.e. whitespace) are dropped.
 */
function segmentByCharacter(text: string): string[] {
  const segments: string[] = [];
  for (const { segment } of GRAPHEME_SEGMENTER.segment(text)) {
    const trimmed = segment.trim();
    if (trimmed.length > 0) {
      segments.push(trimmed);
    }
  }
  return segments;
}

/**
 * Segment text into words by splitting on whitespace and common punctuation.
 * Empty and whitespace-only pieces are discarded.
 */
function segmentByWord(text: string): string[] {
  return text.split(WORD_SEPARATORS).filter(word => word.trim().length > 0);
}

/**
 * Segment text into n-grams: every run of `ngramSize` consecutive words,
 * joined with single spaces.
 *
 * @returns The n-grams in order of appearance; empty when the text has fewer
 *   than `ngramSize` words or when `ngramSize` is not at least 1.
 */
function segmentByNgram(text: string, ngramSize: number): string[] {
  // A size below 1 (or NaN) has no meaningful n-grams; without this guard the
  // loop below would emit empty strings.
  if (!(ngramSize >= 1)) {
    return [];
  }

  const words = segmentByWord(text);
  const ngrams: string[] = [];
  for (let i = 0; i <= words.length - ngramSize; i++) {
    ngrams.push(words.slice(i, i + ngramSize).join(' '));
  }
  return ngrams;
}

/**
 * Segment text into sentences by splitting on sentence-ending punctuation
 * (including Arabic). Sentences are trimmed and empty ones discarded.
 */
function segmentBySentence(text: string): string[] {
  return text
    .split(SENTENCE_TERMINATORS)
    .map(sentence => sentence.trim())
    .filter(sentence => sentence.length > 0);
}

/**
 * Segment text into lines. Lines are trimmed and empty ones discarded.
 */
function segmentByLine(text: string): string[] {
  return text
    .split(LINE_BREAK)
    .map(line => line.trim())
    .filter(line => line.length > 0);
}

/**
 * Apply the segmentation strategy selected by `mode`.
 *
 * @param text - Text to segment.
 * @param mode - Segmentation mode; unrecognised values fall back to word segmentation.
 * @param ngramSize - Words per n-gram; only used when `mode` is `'ngram'`.
 */
function applySegmentation(
  text: string,
  mode: SegmentationMode,
  ngramSize: number = 3
): string[] {
  switch (mode) {
    case 'character':
      return segmentByCharacter(text);
    case 'word':
      return segmentByWord(text);
    case 'ngram':
      return segmentByNgram(text, ngramSize);
    case 'sentence':
      return segmentBySentence(text);
    case 'line':
      return segmentByLine(text);
    default:
      return segmentByWord(text);
  }
}

/**
 * Keep only the segments whose grapheme count lies in
 * `[minLength, maxLength]` (both bounds inclusive).
 */
function filterByLength(
  segments: string[],
  minLength: number,
  maxLength: number
): string[] {
  return segments.filter(segment => {
    const length = countGraphemes(segment);
    return length >= minLength && length <= maxLength;
  });
}

/**
 * Build a deterministic pseudo-random generator returning values in [0, 1).
 *
 * The arithmetic is intentionally the plain floating-point form
 * `(state * a + c) % m`: the product can exceed 2^53 and is then rounded, but
 * keeping it preserves the sequence produced for every non-negative seed.
 * A non-finite seed is treated as 0.
 */
function createSeededRandom(seed: number): () => number {
  let state = Number.isFinite(seed) ? seed : 0;
  return () => {
    state = (state * LCG_MULTIPLIER + LCG_INCREMENT) % LCG_MODULUS;
    // `%` keeps the sign of the dividend, so a negative seed would otherwise
    // yield negative "random" values and out-of-range swap indices.
    if (state < 0) {
      state += LCG_MODULUS;
    }
    return state / LCG_MODULUS;
  };
}

/**
 * Shuffle array using Fisher-Yates algorithm with optional seed.
 *
 * @param array - Source array; it is copied, not mutated.
 * @param seed - When given, the shuffle is deterministic for that seed;
 *   otherwise `Math.random` is used.
 * @returns A new array containing the same elements in shuffled order.
 */
function shuffleArray<T>(array: T[], seed?: number): T[] {
  const result = [...array];
  const random = seed !== undefined ? createSeededRandom(seed) : Math.random;

  for (let i = result.length - 1; i > 0; i--) {
    const j = Math.floor(random() * (i + 1));
    [result[i], result[j]] = [result[j], result[i]];
  }

  return result;
}

/**
 * Parse text file and segment it according to configuration.
 *
 * Reads `config.file` using `config.encoding` (default `'utf-8'`), segments it
 * with `config.segmentation`, keeps segments whose grapheme count is within
 * `config.min_length` (default 1) and `config.max_length` (default 50), and
 * shuffles them when `config.shuffle` is set.
 *
 * @param config - Input configuration.
 * @param seed - Optional seed for the shuffle.
 * @returns The segments together with summary data and the raw text.
 */
export async function parseTextFile(config: InputConfig, seed?: number): Promise<ParsedText> {
  const encoding = (config.encoding || 'utf-8') as BufferEncoding;
  const rawText = await readFile(config.file, { encoding });

  // `||` rather than `??` on purpose: an n-gram size of 0 is meaningless and
  // falls back to the default as well.
  let segments = applySegmentation(
    rawText,
    config.segmentation,
    config.ngram_size || 3
  );

  const minLen = config.min_length ?? 1;
  const maxLen = config.max_length ?? 50;
  segments = filterByLength(segments, minLen, maxLen);

  if (config.shuffle) {
    segments = shuffleArray(segments, seed);
  }

  const uniqueTokens = new Set(segments);
  const totalChars = segments.reduce((sum, segment) => sum + countGraphemes(segment), 0);

  return {
    segments,
    totalChars,
    uniqueTokens,
    rawText,
  };
}

/**
 * Get samples from parsed text, repeating if necessary to reach target size.
 *
 * Segments are drawn from a shuffled copy; once every segment has been used
 * the segments are reshuffled. With a seed, each reshuffle uses
 * `seed + <number of samples collected so far>`, so the whole sequence is
 * reproducible.
 *
 * @param parsed - Parsed text to sample from.
 * @param targetSize - Number of samples to return.
 * @param seed - Optional seed for reproducible sampling.
 * @throws Error when `parsed.segments` is empty.
 */
export function getSamples(parsed: ParsedText, targetSize: number, seed?: number): string[] {
  if (parsed.segments.length === 0) {
    throw new Error('No segments found in parsed text');
  }

  const samples: string[] = [];
  let shuffled = shuffleArray(parsed.segments, seed);
  let index = 0;

  while (samples.length < targetSize) {
    if (index >= shuffled.length) {
      // Compare against undefined, not truthiness: a seed of 0 is a valid
      // seed and must keep the reshuffles deterministic.
      shuffled = shuffleArray(
        parsed.segments,
        seed !== undefined ? seed + samples.length : undefined
      );
      index = 0;
    }
    samples.push(shuffled[index]);
    index++;
  }

  return samples;
}

/**
 * Calculate statistics about the segments.
 *
 * Lengths are measured in grapheme clusters. All length fields are 0 for an
 * empty input.
 */
export function getSegmentStats(segments: string[]): {
  count: number;
  uniqueCount: number;
  avgLength: number;
  minLength: number;
  maxLength: number;
} {
  const count = segments.length;
  const uniqueCount = new Set(segments).size;

  if (count === 0) {
    return { count, uniqueCount, avgLength: 0, minLength: 0, maxLength: 0 };
  }

  // Single pass instead of Math.min(...lengths) / Math.max(...lengths):
  // spreading a very large array into call arguments can throw a RangeError.
  let total = 0;
  let minLength = Infinity;
  let maxLength = -Infinity;
  for (const segment of segments) {
    const length = countGraphemes(segment);
    total += length;
    if (length < minLength) {
      minLength = length;
    }
    if (length > maxLength) {
      maxLength = length;
    }
  }

  return {
    count,
    uniqueCount,
    avgLength: total / count,
    minLength,
    maxLength,
  };
}