Spaces:
Sleeping
Sleeping
/**
 * Text Parser - UTF-8 parsing with multiple segmentation modes
 */
| import { readFile } from 'fs/promises'; | |
| import type { SegmentationMode, InputConfig } from './config.js'; | |
/**
 * Result of parsing and segmenting a text file.
 */
export interface ParsedText {
  /** Segments that survived length filtering, in final (possibly shuffled) order. */
  segments: string[];
  /** Sum of the grapheme-cluster counts of all entries in `segments`. */
  totalChars: number;
  /** Distinct values that occur in `segments`. */
  uniqueTokens: Set<string>;
  /** Unmodified file contents as read from disk. */
  rawText: string;
}
/**
 * Segment text into user-perceived characters (grapheme clusters), so that
 * combining marks stay attached to their base character.
 *
 * Each cluster is trimmed, and clusters that are empty after trimming
 * (i.e. whitespace) are omitted from the result.
 *
 * @param text - Text to split.
 * @returns Non-whitespace grapheme clusters in document order.
 */
function segmentByCharacter(text: string): string[] {
  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  const segments: string[] = [];
  // Iterate the segmenter output directly; no need to copy it into an array first.
  for (const { segment } of segmenter.segment(text)) {
    const trimmed = segment.trim();
    if (trimmed.length > 0) {
      segments.push(trimmed);
    }
  }
  return segments;
}
/**
 * Segment text into words.
 *
 * Words are the non-empty runs left after splitting on whitespace and common
 * punctuation: Arabic comma/semicolon/question mark, Latin `! ? , . ; :`,
 * straight and typographic quotes, and round/square/curly brackets.
 *
 * @param text - Text to split.
 * @returns Words in document order, without delimiters.
 */
function segmentByWord(text: string): string[] {
  // Latin '?' and the typographic quotes (U+2018/2019/201C/201D) are listed
  // explicitly so they are stripped just like their Arabic/straight counterparts.
  const delimiters = /[\s،؛؟?!,.;:"'\u201C\u201D\u2018\u2019()\[\]{}]+/;
  // Whitespace is itself a delimiter, so a piece is either empty or a real word.
  return text.split(delimiters).filter(word => word.length > 0);
}
/**
 * Segment text into word n-grams: every run of `ngramSize` consecutive words,
 * joined with a single space.
 *
 * @param text - Text to split into words and then group.
 * @param ngramSize - Number of words per n-gram. Fractional values are
 *   truncated; values below 1 (or NaN) produce no n-grams.
 * @returns All overlapping n-grams in document order; empty when the text has
 *   fewer than `ngramSize` words.
 */
function segmentByNgram(text: string, ngramSize: number): string[] {
  const size = Math.trunc(ngramSize);
  // A window of zero or negative width would otherwise yield empty strings or
  // arbitrary slices, so reject it up front (the negated test also catches NaN).
  if (!(size >= 1)) {
    return [];
  }

  const words = segmentByWord(text);
  const ngrams: string[] = [];
  for (let start = 0; start + size <= words.length; start++) {
    ngrams.push(words.slice(start, start + size).join(' '));
  }
  return ngrams;
}
/**
 * Segment text into sentences.
 *
 * Splits on runs of sentence-ending punctuation: `.`, `!`, Latin `?`,
 * the Arabic question mark `؟` and the Urdu full stop `۔`. Each piece is
 * trimmed and empty pieces are dropped; the terminators themselves are not
 * included in the result.
 *
 * @param text - Text to split.
 * @returns Trimmed, non-empty sentences in document order.
 */
function segmentBySentence(text: string): string[] {
  const terminators = /[.?؟!۔]+/;
  const sentences: string[] = [];
  for (const piece of text.split(terminators)) {
    const sentence = piece.trim();
    if (sentence.length > 0) {
      sentences.push(sentence);
    }
  }
  return sentences;
}
/**
 * Segment text into lines.
 *
 * Recognises `\r\n`, `\n` and a lone `\r` as line breaks. Each line is
 * trimmed and blank lines are dropped.
 *
 * @param text - Text to split.
 * @returns Trimmed, non-empty lines in document order.
 */
function segmentByLine(text: string): string[] {
  // `\r\n` is listed first so a CRLF pair counts as one break, not two.
  const lineBreak = /\r\n|\r|\n/;
  const lines: string[] = [];
  for (const raw of text.split(lineBreak)) {
    const line = raw.trim();
    if (line.length > 0) {
      lines.push(line);
    }
  }
  return lines;
}
/**
 * Dispatch to the segmenter that matches `mode`.
 *
 * @param text - Text to segment.
 * @param mode - Segmentation strategy. Any value not handled explicitly
 *   falls back to word segmentation.
 * @param ngramSize - Words per n-gram; only used when `mode` is `'ngram'`.
 *   Defaults to 3.
 * @returns The segments produced by the selected strategy.
 */
function applySegmentation(
  text: string,
  mode: SegmentationMode,
  ngramSize: number = 3
): string[] {
  switch (mode) {
    case 'character':
      return segmentByCharacter(text);
    case 'ngram':
      return segmentByNgram(text, ngramSize);
    case 'sentence':
      return segmentBySentence(text);
    case 'line':
      return segmentByLine(text);
    case 'word':
    default:
      // 'word' and any unrecognised mode share the word segmenter.
      return segmentByWord(text);
  }
}
/**
 * Keep only the segments whose length lies within `[minLength, maxLength]`.
 *
 * Length is measured in grapheme clusters, so a base character together with
 * its combining marks counts as one.
 *
 * @param segments - Candidate segments.
 * @param minLength - Inclusive lower bound on the grapheme count.
 * @param maxLength - Inclusive upper bound on the grapheme count.
 * @returns A new array with the segments that satisfy both bounds, in order.
 */
function filterByLength(
  segments: string[],
  minLength: number,
  maxLength: number
): string[] {
  // One segmenter serves every segment; constructing it per element is wasteful.
  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  return segments.filter(segment => {
    const graphemeCount = [...segmenter.segment(segment)].length;
    return graphemeCount >= minLength && graphemeCount <= maxLength;
  });
}
/**
 * Return a shuffled copy of `array` using the Fisher-Yates algorithm.
 *
 * @param array - Items to shuffle; the input array is not modified.
 * @param seed - Optional seed. With a finite seed the order is produced by a
 *   linear congruential generator and is therefore reproducible. When the
 *   seed is omitted (or is not a finite number) `Math.random` is used.
 * @returns A new array containing the same items in shuffled order.
 */
function shuffleArray<T>(array: T[], seed?: number): T[] {
  const result = [...array];
  const modulus = 2147483648;

  let random: () => number = Math.random;
  if (seed !== undefined && Number.isFinite(seed)) {
    let state = seed;
    random = () => {
      const next = (state * 1103515245 + 12345) % modulus;
      // `%` keeps the sign of the dividend, so a negative seed would yield a
      // negative value here and, further down, a negative swap index. Fold it
      // back into [0, modulus); non-negative seeds are unaffected.
      state = next < 0 ? next + modulus : next;
      return state / modulus;
    };
  }

  for (let i = result.length - 1; i > 0; i--) {
    const j = Math.floor(random() * (i + 1));
    [result[i], result[j]] = [result[j], result[i]];
  }
  return result;
}
/**
 * Read a text file and segment it according to `config`.
 *
 * Steps: read `config.file` using `config.encoding` (default `'utf-8'`),
 * segment by `config.segmentation`, drop segments whose grapheme count is
 * outside `[min_length, max_length]` (defaults 1 and 50), and optionally
 * shuffle.
 *
 * @param config - Input settings (file path, encoding, segmentation mode,
 *   n-gram size, length bounds, shuffle flag).
 * @param seed - Optional seed forwarded to the shuffle for reproducible order.
 * @returns The final segments, their total grapheme count, the set of
 *   distinct segments, and the raw file contents.
 */
export async function parseTextFile(config: InputConfig, seed?: number): Promise<ParsedText> {
  const encoding = (config.encoding || 'utf-8') as BufferEncoding;
  const rawText = await readFile(config.file, { encoding });

  // `||` (not `??`) is intentional: an n-gram size of 0 also falls back to 3.
  const ngramSize = config.ngram_size || 3;
  const minLen = config.min_length ?? 1;
  const maxLen = config.max_length ?? 50;

  let segments = applySegmentation(rawText, config.segmentation, ngramSize);
  segments = filterByLength(segments, minLen, maxLen);
  if (config.shuffle) {
    segments = shuffleArray(segments, seed);
  }

  // Count graphemes with a single shared segmenter rather than one per segment.
  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  let totalChars = 0;
  for (const segment of segments) {
    totalChars += [...segmenter.segment(segment)].length;
  }

  return {
    segments,
    totalChars,
    uniqueTokens: new Set(segments),
    rawText,
  };
}
/**
 * Draw `targetSize` samples from the parsed segments, cycling through
 * freshly shuffled passes when more samples are requested than there are
 * segments.
 *
 * @param parsed - Parsed text whose `segments` are sampled.
 * @param targetSize - Number of samples to return; values of 0 or less give
 *   an empty array.
 * @param seed - Optional seed. When given (including `0`), every pass is
 *   shuffled deterministically: the first with `seed`, later ones with
 *   `seed` plus the number of samples collected so far.
 * @returns Exactly `targetSize` samples (for positive `targetSize`).
 * @throws Error if `parsed.segments` is empty.
 */
export function getSamples(parsed: ParsedText, targetSize: number, seed?: number): string[] {
  if (parsed.segments.length === 0) {
    throw new Error('No segments found in parsed text');
  }

  const samples: string[] = [];
  let shuffled = shuffleArray(parsed.segments, seed);
  let index = 0;

  while (samples.length < targetSize) {
    if (index >= shuffled.length) {
      // Pass exhausted: reshuffle. Compare against `undefined` rather than
      // relying on truthiness so that a seed of 0 stays deterministic.
      const passSeed = seed !== undefined ? seed + samples.length : undefined;
      shuffled = shuffleArray(parsed.segments, passSeed);
      index = 0;
    }
    samples.push(shuffled[index]);
    index++;
  }

  return samples;
}
/**
 * Calculate summary statistics for a list of segments.
 *
 * Lengths are measured in grapheme clusters. All numeric fields are 0 when
 * `segments` is empty.
 *
 * @param segments - Segments to summarise.
 * @returns `count` (number of segments), `uniqueCount` (distinct segments),
 *   and the average, minimum and maximum segment length.
 */
export function getSegmentStats(segments: string[]): {
  count: number;
  uniqueCount: number;
  avgLength: number;
  minLength: number;
  maxLength: number;
} {
  const count = segments.length;
  const uniqueCount = new Set(segments).size;
  if (count === 0) {
    return { count, uniqueCount, avgLength: 0, minLength: 0, maxLength: 0 };
  }

  const segmenter = new Intl.Segmenter('ar', { granularity: 'grapheme' });
  let totalLength = 0;
  let minLength = Infinity;
  let maxLength = 0;
  // Track min/max in a single pass: spreading a large array into
  // Math.min/Math.max can exceed the engine's argument-count limit.
  for (const segment of segments) {
    const length = [...segmenter.segment(segment)].length;
    totalLength += length;
    if (length < minLength) minLength = length;
    if (length > maxLength) maxLength = length;
  }

  return {
    count,
    uniqueCount,
    avgLength: totalLength / count,
    minLength,
    maxLength,
  };
}