Omarrran committed on
Commit
d4b5ccf
·
1 Parent(s): b3e360f

Smart sentence splitting with abbreviation & decimal protection

Browse files
Files changed (1) hide show
  1. src/components/TextInput.tsx +112 -21
src/components/TextInput.tsx CHANGED
@@ -9,27 +9,106 @@ interface TextInputProps {
9
  onSentencesLoaded: (sentences: string[]) => void;
10
  }
11
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  export default function TextInput({ onSentencesLoaded }: TextInputProps) {
13
  const [text, setText] = useState('');
14
  const fileInputRef = useRef<HTMLInputElement>(null);
15
 
16
  const processText = (inputText: string) => {
17
- if (!inputText.trim()) return;
 
 
 
18
 
19
- // Simple sentence splitting (can be improved or use API)
20
- // Split by . ! ? followed by space or newline
21
- const sentences = inputText
22
- .replace(/([.!?])\s+/g, '$1|')
23
- .split('|')
24
- .map(s => s.trim())
25
- .filter(s => s.length > 0);
26
 
27
  if (sentences.length > 0) {
28
  onSentencesLoaded(sentences);
29
- toast.success(`Loaded ${sentences.length} sentences`);
30
  setText('');
31
  } else {
32
- toast.error('No valid sentences found');
33
  }
34
  };
35
 
@@ -71,14 +150,14 @@ export default function TextInput({ onSentencesLoaded }: TextInputProps) {
71
  >
72
  <input
73
  type="file"
74
- accept=".txt"
75
  ref={fileInputRef}
76
  className="hidden"
77
  onChange={handleFileUpload}
78
  />
79
  <Upload className="w-8 h-8 mx-auto mb-2 text-muted-foreground group-hover:text-primary transition-colors" />
80
  <p className="text-sm font-medium">Drop text file or click to upload</p>
81
- <p className="text-xs text-muted-foreground mt-1">.txt files supported</p>
82
  </div>
83
 
84
  <div className="relative">
@@ -92,18 +171,30 @@ export default function TextInput({ onSentencesLoaded }: TextInputProps) {
92
 
93
  <div className="space-y-2">
94
  <textarea
95
- className="input min-h-[100px] resize-none"
96
- placeholder="Paste your sentences here (one per line)..."
97
  value={text}
98
  onChange={handleTextChange}
99
  />
100
- <button
101
- onClick={handlePaste}
102
- disabled={!text.trim()}
103
- className="btn btn-secondary w-full"
104
- >
105
- Load Sentences
106
- </button>
 
 
 
 
 
 
 
 
 
 
 
 
107
  </div>
108
  </CardContent>
109
  </Card>
 
9
  onSentencesLoaded: (sentences: string[]) => void;
10
  }
11
 
12
/**
 * Abbreviations whose trailing period must not be treated as a sentence end.
 * Matched case-insensitively and only at a word boundary.
 */
const PROTECTED_ABBREVIATIONS: readonly string[] = [
  'Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'i.e', 'e.g',
  'Inc', 'Ltd', 'Corp', 'Co', 'No', 'Vol', 'Rev', 'Fig', 'Eq',
];

/** Escapes RegExp metacharacters so a literal string can be embedded in a pattern. */
function escapeRegExp(literal: string): string {
  return literal.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/** Matches text that ends with one of the protected abbreviations. */
const ABBREVIATION_TAIL = new RegExp(
  `\\b(?:${PROTECTED_ABBREVIATIONS.map(escapeRegExp).join('|')})$`,
  'i',
);

/**
 * How many characters before a period need to be inspected: the longest
 * abbreviation plus one extra character so `\b` sees what precedes it.
 */
const ABBREVIATION_WINDOW =
  Math.max(...PROTECTED_ABBREVIATIONS.map((abbr) => abbr.length)) + 1;

/**
 * Splits raw text into sentences.
 *
 * - Input with more than one non-empty line is treated as "one sentence per
 *   line" and is returned line by line without further splitting.
 * - A single block of text is split after runs of `.`, `!`, `?`, `।` or `۔`
 *   that are followed by whitespace. The punctuation stays attached to the
 *   sentence it ends; the whitespace is dropped.
 * - A lone period directly after a protected abbreviation ("Dr. ", "e.g. ")
 *   does not end a sentence.
 * - Decimals such as "3.14" are never split, because a boundary requires
 *   whitespace after the punctuation.
 *
 * The text is scanned in place instead of being rewritten with placeholder
 * tokens, so repeated abbreviations, adjacent abbreviations and the original
 * casing are all preserved exactly.
 *
 * @param inputText - Raw text typed, pasted or read from a file.
 * @returns Trimmed sentences of at least two characters, in input order;
 *   an empty array when the input is blank.
 */
function splitIntoSentences(inputText: string): string[] {
  const text = inputText.trim();
  if (!text) return [];

  // Normalize CRLF and bare CR line endings to LF.
  const normalized = text.replace(/\r\n?/g, '\n');

  const lines = normalized
    .split('\n')
    .map((line) => line.trim())
    .filter((line) => line.length > 0);

  // Several non-empty lines: assume the text is already one sentence per line.
  if (lines.length > 1) {
    return lines.filter((line) => line.length >= 2);
  }

  const singleLine = lines[0] ?? text;

  // Created per call: a global RegExp carries `lastIndex` state between uses.
  const boundary = /[.!?।۔]+\s+/g;
  const sentences: string[] = [];
  let sentenceStart = 0;

  let match: RegExpExecArray | null;
  while ((match = boundary.exec(singleLine)) !== null) {
    const punctuation = match[0].replace(/\s+$/, '');

    if (punctuation === '.') {
      const precedingText = singleLine.slice(
        Math.max(0, match.index - ABBREVIATION_WINDOW),
        match.index,
      );
      // The period belongs to an abbreviation: keep scanning without cutting.
      if (ABBREVIATION_TAIL.test(precedingText)) continue;
    }

    const sentence = singleLine
      .slice(sentenceStart, match.index + punctuation.length)
      .trim();
    if (sentence) sentences.push(sentence);
    sentenceStart = match.index + match[0].length;
  }

  // Whatever follows the last boundary is the final sentence.
  const remainder = singleLine.slice(sentenceStart).trim();
  if (remainder) sentences.push(remainder);

  // Drop fragments shorter than two characters (e.g. a stray ".").
  return sentences.filter((sentence) => sentence.length >= 2);
}
93
+
94
  export default function TextInput({ onSentencesLoaded }: TextInputProps) {
95
  const [text, setText] = useState('');
96
  const fileInputRef = useRef<HTMLInputElement>(null);
97
 
98
  const processText = (inputText: string) => {
99
+ if (!inputText.trim()) {
100
+ toast.error('Please enter some text');
101
+ return;
102
+ }
103
 
104
+ const sentences = splitIntoSentences(inputText);
 
 
 
 
 
 
105
 
106
  if (sentences.length > 0) {
107
  onSentencesLoaded(sentences);
108
+ toast.success(`Loaded ${sentences.length} sentence${sentences.length > 1 ? 's' : ''}`);
109
  setText('');
110
  } else {
111
+ toast.error('No valid sentences found. Try putting each sentence on a new line.');
112
  }
113
  };
114
 
 
150
  >
151
  <input
152
  type="file"
153
+ accept=".txt,.csv"
154
  ref={fileInputRef}
155
  className="hidden"
156
  onChange={handleFileUpload}
157
  />
158
  <Upload className="w-8 h-8 mx-auto mb-2 text-muted-foreground group-hover:text-primary transition-colors" />
159
  <p className="text-sm font-medium">Drop text file or click to upload</p>
160
+ <p className="text-xs text-muted-foreground mt-1">.txt and .csv files supported</p>
161
  </div>
162
 
163
  <div className="relative">
 
171
 
172
  <div className="space-y-2">
173
  <textarea
174
+ className="input min-h-[120px] resize-y"
175
+ placeholder="Paste your text here...&#10;&#10;• One sentence per line works best&#10;• Or paste a paragraph - it will be split automatically"
176
  value={text}
177
  onChange={handleTextChange}
178
  />
179
+ <div className="flex gap-2">
180
+ <button
181
+ onClick={handlePaste}
182
+ disabled={!text.trim()}
183
+ className="btn btn-primary flex-1"
184
+ >
185
+ Load Sentences
186
+ </button>
187
+ <button
188
+ onClick={() => setText('')}
189
+ disabled={!text.trim()}
190
+ className="btn btn-secondary"
191
+ >
192
+ Clear
193
+ </button>
194
+ </div>
195
+ <p className="text-xs text-muted-foreground text-center">
196
+ Tip: For best results, put each sentence on a new line
197
+ </p>
198
  </div>
199
  </CardContent>
200
  </Card>