Spaces:
Sleeping
Sleeping
File size: 7,602 Bytes
88b6846 d4b5ccf 88b6846 d4b5ccf 88b6846 d4b5ccf 88b6846 d4b5ccf 88b6846 d4b5ccf 88b6846 d4b5ccf 88b6846 d4b5ccf 88b6846 d4b5ccf 88b6846 d4b5ccf 88b6846 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 |
'use client';
import React, { useState, useRef } from 'react';
import { Upload, FileText } from 'lucide-react';
import { toast } from 'sonner';
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';
/**
 * Props accepted by the TextInput component.
 */
interface TextInputProps {
/** Callback invoked with the sentences extracted from pasted or uploaded text. */
onSentencesLoaded: (sentences: string[]) => void;
}
/** Abbreviations whose trailing period must not be treated as a sentence end. */
const ABBREVIATIONS = [
  'Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'i.e', 'e.g',
  'Inc', 'Ltd', 'Corp', 'Co', 'No', 'Vol', 'Rev', 'Fig', 'Eq',
] as const;

/** Private-use code point that stands in for a period which must survive splitting. */
const PROTECTED_DOT = '\uE000';

/** Escapes regex metacharacters so a literal string can be embedded in a RegExp. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * Matches the period that closes a known abbreviation when it is followed by
 * whitespace. Matching is case-insensitive, so a sentence that genuinely ends
 * in one of the listed words (e.g. "no.") is not split at that point.
 */
const ABBREVIATION_DOT = new RegExp(
  `\\b(${ABBREVIATIONS.map(escapeRegExp).join('|')})\\.(?=\\s)`,
  'gi',
);

/**
 * One or more sentence-ending marks (Latin . ! ? plus the danda and the Urdu
 * full stop) followed by whitespace. The marks are captured so that
 * `String.prototype.split` returns them between the text fragments.
 */
const SENTENCE_BOUNDARY = /([.!?।۔]+)\s+/;

/**
 * Splits raw text into sentences.
 *
 * - Empty or whitespace-only input yields an empty array.
 * - If the input has more than one non-empty line, each line is taken as one
 *   sentence (the one-sentence-per-line layout).
 * - Otherwise the single block is split after sentence-ending punctuation that
 *   is followed by whitespace, without splitting after known abbreviations
 *   such as "Mr." or "e.g.".
 *
 * Results are trimmed, and entries shorter than two characters are dropped.
 *
 * @param inputText Raw text pasted by the user or read from a file.
 * @returns The extracted sentences, in input order.
 */
function splitIntoSentences(inputText: string): string[] {
  const text = inputText.trim();
  if (!text) return [];

  // Normalize CRLF / lone CR to LF, then keep the non-empty trimmed lines.
  const lines = text
    .replace(/\r\n?/g, '\n')
    .split('\n')
    .map(line => line.trim())
    .filter(line => line.length > 0);

  // Several non-empty lines: treat the input as already line-separated.
  if (lines.length > 1) {
    return lines.filter(line => line.length >= 2);
  }

  // Hide only the abbreviation's period behind a sentinel. Every occurrence is
  // handled and the surrounding text (including its casing) is left untouched.
  // Decimal points need no protection: a period directly followed by a digit
  // is never followed by whitespace, so SENTENCE_BOUNDARY cannot match it.
  const guarded = (lines[0] ?? text).replace(ABBREVIATION_DOT, `$1${PROTECTED_DOT}`);

  // split() with a capturing group alternates [text, punctuation, text, ...].
  const parts = guarded.split(SENTENCE_BOUNDARY);
  const sentences: string[] = [];
  for (let i = 0; i < parts.length; i += 2) {
    const sentence = `${parts[i] ?? ''}${parts[i + 1] ?? ''}`
      .split(PROTECTED_DOT)
      .join('.') // restore every protected period
      .trim();
    if (sentence.length >= 2) {
      sentences.push(sentence);
    }
  }
  return sentences;
}
/**
 * Card that lets the user supply text by uploading or dropping a .txt/.csv
 * file, or by pasting into a textarea. The text is split into sentences and
 * handed to `onSentencesLoaded`.
 *
 * @param props.onSentencesLoaded Called with the extracted sentences whenever
 *   at least one sentence is found.
 */
export default function TextInput({ onSentencesLoaded }: TextInputProps) {
  const [text, setText] = useState('');
  const fileInputRef = useRef<HTMLInputElement>(null);

  /**
   * Splits the text, notifies the parent, and clears the textarea on success.
   * Returns whether any sentences were loaded, so callers can avoid showing a
   * success message right after an error.
   */
  const processText = (inputText: string): boolean => {
    if (!inputText.trim()) {
      toast.error('Please enter some text');
      return false;
    }
    const sentences = splitIntoSentences(inputText);
    if (sentences.length === 0) {
      toast.error('No valid sentences found. Try putting each sentence on a new line.');
      return false;
    }
    onSentencesLoaded(sentences);
    toast.success(`Loaded ${sentences.length} sentence${sentences.length > 1 ? 's' : ''}`);
    setText('');
    return true;
  };

  const handleTextChange = (e: React.ChangeEvent<HTMLTextAreaElement>) => {
    setText(e.target.value);
  };

  const handlePaste = () => {
    processText(text);
  };

  /** Reads a file as text and runs it through the same pipeline as pasted text. */
  const readFile = (file: File) => {
    const reader = new FileReader();
    reader.onload = () => {
      // readAsText yields a string; guard instead of asserting the type.
      const content = typeof reader.result === 'string' ? reader.result : '';
      // Keep the content visible in the textarea if processing fails.
      setText(content);
      if (processText(content)) {
        toast.success(`File loaded: ${file.name}`);
      }
    };
    reader.onerror = () => toast.error('Failed to read file');
    reader.readAsText(file);
  };

  const handleFileUpload = (e: React.ChangeEvent<HTMLInputElement>) => {
    const file = e.target.files?.[0];
    // Reset the input so selecting the same file again fires another change event.
    e.target.value = '';
    if (!file) return;
    readFile(file);
  };

  const handleDragOver = (e: React.DragEvent<HTMLDivElement>) => {
    // Required for the element to be accepted as a drop target.
    e.preventDefault();
  };

  const handleDrop = (e: React.DragEvent<HTMLDivElement>) => {
    // Stop the browser from navigating to the dropped file.
    e.preventDefault();
    const file = e.dataTransfer.files.item(0);
    if (!file) return;
    // The picker's `accept` filter does not apply to drops, so check here.
    if (!/\.(txt|csv)$/i.test(file.name)) {
      toast.error('Only .txt and .csv files are supported');
      return;
    }
    readFile(file);
  };

  return (
    <Card>
      <CardHeader>
        <CardTitle className="text-lg flex items-center gap-2">
          <FileText className="w-4 h-4" />
          Input Data
        </CardTitle>
      </CardHeader>
      <CardContent className="space-y-4">
        <div
          className="border-2 border-dashed border-border rounded-xl p-6 text-center hover:bg-secondary/50 transition-colors cursor-pointer relative group"
          onClick={() => fileInputRef.current?.click()}
          onDragOver={handleDragOver}
          onDrop={handleDrop}
        >
          <input
            type="file"
            accept=".txt,.csv"
            ref={fileInputRef}
            className="hidden"
            onChange={handleFileUpload}
          />
          <Upload className="w-8 h-8 mx-auto mb-2 text-muted-foreground group-hover:text-primary transition-colors" />
          <p className="text-sm font-medium">Drop text file or click to upload</p>
          <p className="text-xs text-muted-foreground mt-1">.txt and .csv files supported</p>
        </div>
        <div className="relative">
          <div className="absolute inset-0 flex items-center">
            <span className="w-full border-t border-border" />
          </div>
          <div className="relative flex justify-center text-xs uppercase">
            <span className="bg-card px-2 text-muted-foreground">Or paste text</span>
          </div>
        </div>
        <div className="space-y-2">
          <textarea
            className="input min-h-[120px] resize-y"
            placeholder="Paste your text here... • One sentence per line works best • Or paste a paragraph - it will be split automatically"
            value={text}
            onChange={handleTextChange}
          />
          <div className="flex gap-2">
            <button
              type="button"
              onClick={handlePaste}
              disabled={!text.trim()}
              className="btn btn-primary flex-1"
            >
              Load Sentences
            </button>
            <button
              type="button"
              onClick={() => setText('')}
              disabled={!text.trim()}
              className="btn btn-secondary"
            >
              Clear
            </button>
          </div>
          <p className="text-xs text-muted-foreground text-center">
            Tip: For best results, put each sentence on a new line
          </p>
        </div>
      </CardContent>
    </Card>
  );
}
|