'use client'; import React, { useState, useRef } from 'react'; import { Upload, FileText } from 'lucide-react'; import { toast } from 'sonner'; import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card'; interface TextInputProps { onSentencesLoaded: (sentences: string[]) => void; } /** * Smart sentence splitting that handles various text formats */ function splitIntoSentences(inputText: string): string[] { const text = inputText.trim(); if (!text) return []; // Step 1: Normalize line endings const normalized = text.replace(/\r\n/g, '\n').replace(/\r/g, '\n'); // Step 2: Check if text is already line-separated (common for TTS datasets) const lines = normalized.split('\n').map(s => s.trim()).filter(s => s.length > 0); // If we have multiple non-empty lines, assume each line is a sentence if (lines.length > 1) { return lines.filter(line => line.length >= 2); } // Step 3: Single block of text - need to split by sentence boundaries // This regex handles: // - Standard punctuation: . ! ? // - Ellipsis: ... // - Hindi/Urdu: । ۔ // - Followed by whitespace or end of string // - Preserves abbreviations like "Mr.", "Dr.", "etc." const singleLine = lines[0] || text; // Common abbreviations to protect const abbreviations = [ 'Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'i.e', 'e.g', 'Inc', 'Ltd', 'Corp', 'Co', 'No', 'Vol', 'Rev', 'Fig', 'Eq' ]; // Create placeholder for abbreviations let processed = singleLine; const placeholders: { [key: string]: string } = {}; abbreviations.forEach((abbr, index) => { const placeholder = `__ABBR${index}__`; const regex = new RegExp(`\\b${abbr}\\.\\s`, 'gi'); processed = processed.replace(regex, (match) => { placeholders[placeholder] = match; return placeholder; }); }); // Also protect decimal numbers (e.g., "3.14") processed = processed.replace(/(\d+)\.(\d+)/g, '$1__DECIMAL__$2'); // Split by sentence-ending punctuation // Using a more robust pattern that handles multiple punctuation marks const sentenceEnders = /([.!?।۔]+)\s+/g; const parts = processed.split(sentenceEnders); // Reconstruct sentences (split creates alternating text and punctuation) const sentences: string[] = []; for (let i = 0; i < parts.length; i += 2) { let sentence = parts[i]; if (i + 1 < parts.length) { sentence += parts[i + 1]; // Add punctuation back } sentence = sentence.trim(); if (sentence) { sentences.push(sentence); } } // Restore placeholders const restored = sentences.map(s => { let result = s; Object.entries(placeholders).forEach(([placeholder, original]) => { result = result.replace(placeholder, original); }); result = result.replace(/__DECIMAL__/g, '.'); return result.trim(); }); // Filter out very short sentences (less than 2 characters) return restored.filter(s => s.length >= 2); } export default function TextInput({ onSentencesLoaded }: TextInputProps) { const [text, setText] = useState(''); const fileInputRef = useRef(null); const processText = (inputText: string) => { if (!inputText.trim()) { toast.error('Please enter some text'); return; } const sentences = splitIntoSentences(inputText); if (sentences.length > 0) { onSentencesLoaded(sentences); toast.success(`Loaded ${sentences.length} sentence${sentences.length > 1 ? 's' : ''}`); setText(''); } else { toast.error('No valid sentences found. Try putting each sentence on a new line.'); } }; const handleTextChange = (e: React.ChangeEvent) => { setText(e.target.value); }; const handlePaste = () => { processText(text); }; const handleFileUpload = (e: React.ChangeEvent) => { const file = e.target.files?.[0]; if (!file) return; const reader = new FileReader(); reader.onload = (event) => { const content = event.target?.result as string; setText(content); processText(content); toast.success(`File loaded: ${file.name}`); }; reader.onerror = () => toast.error('Failed to read file'); reader.readAsText(file); }; return ( Input Data
fileInputRef.current?.click()} >

Drop text file or click to upload

.txt and .csv files supported

Or paste text