Spaces:
Sleeping
Sleeping
Smart sentence splitting with abbreviation & decimal protection
Browse files- src/components/TextInput.tsx +112 -21
src/components/TextInput.tsx
CHANGED
|
@@ -9,27 +9,106 @@ interface TextInputProps {
|
|
| 9 |
onSentencesLoaded: (sentences: string[]) => void;
|
| 10 |
}
|
| 11 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
export default function TextInput({ onSentencesLoaded }: TextInputProps) {
|
| 13 |
const [text, setText] = useState('');
|
| 14 |
const fileInputRef = useRef<HTMLInputElement>(null);
|
| 15 |
|
| 16 |
const processText = (inputText: string) => {
|
| 17 |
-
if (!inputText.trim())
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
-
|
| 20 |
-
// Split by . ! ? followed by space or newline
|
| 21 |
-
const sentences = inputText
|
| 22 |
-
.replace(/([.!?])\s+/g, '$1|')
|
| 23 |
-
.split('|')
|
| 24 |
-
.map(s => s.trim())
|
| 25 |
-
.filter(s => s.length > 0);
|
| 26 |
|
| 27 |
if (sentences.length > 0) {
|
| 28 |
onSentencesLoaded(sentences);
|
| 29 |
-
toast.success(`Loaded ${sentences.length} sentences`);
|
| 30 |
setText('');
|
| 31 |
} else {
|
| 32 |
-
toast.error('No valid sentences found');
|
| 33 |
}
|
| 34 |
};
|
| 35 |
|
|
@@ -71,14 +150,14 @@ export default function TextInput({ onSentencesLoaded }: TextInputProps) {
|
|
| 71 |
>
|
| 72 |
<input
|
| 73 |
type="file"
|
| 74 |
-
accept=".txt"
|
| 75 |
ref={fileInputRef}
|
| 76 |
className="hidden"
|
| 77 |
onChange={handleFileUpload}
|
| 78 |
/>
|
| 79 |
<Upload className="w-8 h-8 mx-auto mb-2 text-muted-foreground group-hover:text-primary transition-colors" />
|
| 80 |
<p className="text-sm font-medium">Drop text file or click to upload</p>
|
| 81 |
-
<p className="text-xs text-muted-foreground mt-1">.txt files supported</p>
|
| 82 |
</div>
|
| 83 |
|
| 84 |
<div className="relative">
|
|
@@ -92,18 +171,30 @@ export default function TextInput({ onSentencesLoaded }: TextInputProps) {
|
|
| 92 |
|
| 93 |
<div className="space-y-2">
|
| 94 |
<textarea
|
| 95 |
-
className="input min-h-[
|
| 96 |
-
placeholder="Paste your
|
| 97 |
value={text}
|
| 98 |
onChange={handleTextChange}
|
| 99 |
/>
|
| 100 |
-
<
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
</div>
|
| 108 |
</CardContent>
|
| 109 |
</Card>
|
|
|
|
| 9 |
onSentencesLoaded: (sentences: string[]) => void;
|
| 10 |
}
|
| 11 |
|
| 12 |
+
/**
 * Splits raw user text into sentences.
 *
 * Strategy:
 * 1. Line endings are normalized to `\n`.
 * 2. If the text has more than one non-empty line, every line is treated as
 *    one sentence (the usual layout of TTS datasets).
 * 3. Otherwise the single line is split after each run of sentence-ending
 *    punctuation (`. ! ?`, the Hindi danda `।`, the Urdu full stop `۔`) that
 *    is followed by whitespace. A lone period directly after a known
 *    abbreviation ("Dr.", "e.g.", ...) is not treated as a boundary.
 *
 * Decimal numbers such as "3.14" need no special handling: a boundary always
 * requires whitespace after the punctuation, which a decimal point never has.
 *
 * @param inputText - Raw text, either one sentence per line or one paragraph.
 * @returns Trimmed sentences in input order with their closing punctuation;
 *          entries shorter than 2 characters are dropped. Empty or
 *          whitespace-only input yields `[]`.
 */
function splitIntoSentences(inputText: string): string[] {
  const text = inputText.trim();
  if (!text) return [];

  // Normalize Windows (\r\n) and old Mac (\r) line endings.
  const normalized = text.replace(/\r\n?/g, '\n');

  const lines = normalized
    .split('\n')
    .map(line => line.trim())
    .filter(line => line.length > 0);

  // Multiple non-empty lines: assume the text is already one sentence per line.
  if (lines.length > 1) {
    return lines.filter(line => line.length >= 2);
  }

  const singleLine = lines[0] || text;

  // Abbreviations whose trailing period must not end a sentence.
  // Matching is case-insensitive, so "no." and "co." are protected as well.
  const abbreviations = [
    'Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'i.e', 'e.g',
    'Inc', 'Ltd', 'Corp', 'Co', 'No', 'Vol', 'Rev', 'Fig', 'Eq',
  ];

  // Escape regex metacharacters so the dots in "i.e" / "e.g" match literally
  // instead of acting as wildcards (which would protect "ice." and "egg.").
  const escapeRegExp = (literal: string): string =>
    literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  const abbreviationPattern = abbreviations.map(escapeRegExp).join('|');

  // One left-to-right scan with two alternatives:
  //   - an abbreviation followed by ". " -> consumed and ignored (group 1 unset)
  //   - a punctuation run + whitespace   -> a real boundary (group 1 = the run)
  // The abbreviation alternative starts to the left of its own period, so it
  // always wins over the boundary alternative for that period. Scanning the
  // original text (rather than substituting placeholders) keeps `\b` reliable
  // for consecutive abbreviations and leaves nothing to restore afterwards.
  const scanner = new RegExp(
    `\\b(?:${abbreviationPattern})\\.(?=\\s)|([.!?।۔]+)\\s+`,
    'gi',
  );

  const pieces: string[] = [];
  let sentenceStart = 0;
  let match: RegExpExecArray | null;
  while ((match = scanner.exec(singleLine)) !== null) {
    const punctuation = match[1];
    if (punctuation === undefined) continue; // protected abbreviation
    // Keep the punctuation with the sentence; drop the whitespace after it.
    pieces.push(singleLine.slice(sentenceStart, match.index + punctuation.length));
    sentenceStart = match.index + match[0].length;
  }
  // Whatever follows the last boundary is the final sentence.
  pieces.push(singleLine.slice(sentenceStart));

  // Drop empty and very short fragments (fewer than 2 characters).
  return pieces.map(piece => piece.trim()).filter(piece => piece.length >= 2);
}
|
| 93 |
+
|
| 94 |
export default function TextInput({ onSentencesLoaded }: TextInputProps) {
|
| 95 |
const [text, setText] = useState('');
|
| 96 |
const fileInputRef = useRef<HTMLInputElement>(null);
|
| 97 |
|
| 98 |
// Validates the raw input, splits it into sentences, hands them to the
// parent via onSentencesLoaded, and reports the outcome with a toast.
const processText = (inputText: string) => {
  const hasContent = inputText.trim().length > 0;
  if (!hasContent) {
    toast.error('Please enter some text');
    return;
  }

  const loaded = splitIntoSentences(inputText);

  // Nothing usable came out of the splitter: tell the user how to help it.
  if (loaded.length === 0) {
    toast.error('No valid sentences found. Try putting each sentence on a new line.');
    return;
  }

  onSentencesLoaded(loaded);
  const pluralSuffix = loaded.length > 1 ? 's' : '';
  toast.success(`Loaded ${loaded.length} sentence${pluralSuffix}`);
  // Clear the textarea once its content has been handed off.
  setText('');
};
|
| 114 |
|
|
|
|
| 150 |
>
|
| 151 |
<input
|
| 152 |
type="file"
|
| 153 |
+
accept=".txt,.csv"
|
| 154 |
ref={fileInputRef}
|
| 155 |
className="hidden"
|
| 156 |
onChange={handleFileUpload}
|
| 157 |
/>
|
| 158 |
<Upload className="w-8 h-8 mx-auto mb-2 text-muted-foreground group-hover:text-primary transition-colors" />
|
| 159 |
<p className="text-sm font-medium">Drop text file or click to upload</p>
|
| 160 |
+
<p className="text-xs text-muted-foreground mt-1">.txt and .csv files supported</p>
|
| 161 |
</div>
|
| 162 |
|
| 163 |
<div className="relative">
|
|
|
|
| 171 |
|
| 172 |
<div className="space-y-2">
|
| 173 |
<textarea
|
| 174 |
+
className="input min-h-[120px] resize-y"
|
| 175 |
+
placeholder="Paste your text here... • One sentence per line works best • Or paste a paragraph - it will be split automatically"
|
| 176 |
value={text}
|
| 177 |
onChange={handleTextChange}
|
| 178 |
/>
|
| 179 |
+
<div className="flex gap-2">
|
| 180 |
+
<button
|
| 181 |
+
onClick={handlePaste}
|
| 182 |
+
disabled={!text.trim()}
|
| 183 |
+
className="btn btn-primary flex-1"
|
| 184 |
+
>
|
| 185 |
+
Load Sentences
|
| 186 |
+
</button>
|
| 187 |
+
<button
|
| 188 |
+
onClick={() => setText('')}
|
| 189 |
+
disabled={!text.trim()}
|
| 190 |
+
className="btn btn-secondary"
|
| 191 |
+
>
|
| 192 |
+
Clear
|
| 193 |
+
</button>
|
| 194 |
+
</div>
|
| 195 |
+
<p className="text-xs text-muted-foreground text-center">
|
| 196 |
+
Tip: For best results, put each sentence on a new line
|
| 197 |
+
</p>
|
| 198 |
</div>
|
| 199 |
</CardContent>
|
| 200 |
</Card>
|