File size: 7,602 Bytes
88b6846
 
 
 
 
 
 
 
 
 
 
d4b5ccf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88b6846
 
 
 
 
d4b5ccf
 
 
 
88b6846
d4b5ccf
88b6846
 
 
d4b5ccf
88b6846
 
d4b5ccf
88b6846
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d4b5ccf
88b6846
 
 
 
 
 
d4b5ccf
88b6846
 
 
 
 
 
 
 
 
 
 
 
 
d4b5ccf
 
88b6846
 
 
d4b5ccf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88b6846
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
'use client';

import React, { useState, useRef } from 'react';
import { Upload, FileText } from 'lucide-react';
import { toast } from 'sonner';
import { Card, CardContent, CardHeader, CardTitle } from '@/components/ui/card';

/** Props accepted by the TextInput component. */
interface TextInputProps {
    /**
     * Called with the sentences extracted from pasted or uploaded text.
     * The component only invokes it when at least one sentence was found.
     */
    onSentencesLoaded: (sentences: string[]) => void;
}

/**
 * Splits raw text into sentences.
 *
 * - Multi-line input is treated as "one sentence per line": every non-empty
 *   line (after trimming) of at least 2 characters is returned as-is.
 * - Single-line input is split after runs of `. ! ? । ۔` that are followed by
 *   whitespace. A fixed list of abbreviations ("Mr.", "e.g.", ...) followed by
 *   whitespace is protected from splitting, as are decimal numbers.
 *
 * Known limitation: an abbreviation that happens to end a sentence
 * ("..., etc. The next one") is still protected and therefore not split.
 *
 * @param inputText - Raw text pasted by the user or read from a file.
 * @returns Trimmed sentences of at least 2 characters; empty for blank input.
 */
function splitIntoSentences(inputText: string): string[] {
    const text = inputText.trim();
    if (!text) return [];

    // Normalize CRLF / CR line endings, then collect the non-empty lines.
    const lines = text
        .replace(/\r\n?/g, '\n')
        .split('\n')
        .map(line => line.trim())
        .filter(line => line.length > 0);

    // Several non-empty lines: assume the text is already one sentence per line.
    if (lines.length > 1) {
        return lines.filter(line => line.length >= 2);
    }

    const singleLine = lines[0] ?? text;

    // Common abbreviations whose trailing period must not end a sentence.
    const abbreviations = [
        'Mr', 'Mrs', 'Ms', 'Dr', 'Prof', 'Sr', 'Jr', 'vs', 'etc', 'i.e', 'e.g',
        'Inc', 'Ltd', 'Corp', 'Co', 'No', 'Vol', 'Rev', 'Fig', 'Eq'
    ];

    // Escape regex metacharacters so the dots in "i.e" / "e.g" match literally
    // (unescaped, "e.g" would also match words such as "egg").
    const escapeRegExp = (source: string) => source.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const abbreviationPattern = new RegExp(
        `\\b(?:${abbreviations.map(escapeRegExp).join('|')})\\.\\s`,
        'gi'
    );

    // Every occurrence gets its own placeholder so that repeated abbreviations
    // and their original casing/whitespace are restored exactly. A single pass
    // also keeps adjacent abbreviations ("Dr. Co. ") from hiding each other.
    const protectedSpans: string[] = [];
    let processed = singleLine.replace(abbreviationPattern, match => {
        protectedSpans.push(match);
        return `__ABBR${protectedSpans.length - 1}__`;
    });

    // Also protect decimal numbers (e.g., "3.14").
    processed = processed.replace(/(\d+)\.(\d+)/g, '$1__DECIMAL__$2');

    // The capturing group makes split() return alternating [text, punctuation, text, ...].
    const parts = processed.split(/([.!?।۔]+)\s+/);

    const sentences: string[] = [];
    for (let i = 0; i < parts.length; i += 2) {
        // Re-attach the punctuation run that terminated this piece, if any.
        const sentence = ((parts[i] ?? '') + (parts[i + 1] ?? '')).trim();
        if (sentence) {
            sentences.push(sentence);
        }
    }

    return sentences
        .map(sentence =>
            sentence
                // Unknown indices (text that merely looks like a placeholder) are left untouched.
                .replace(/__ABBR(\d+)__/g, (token, index: string) => protectedSpans[Number(index)] ?? token)
                .replace(/__DECIMAL__/g, '.')
                .trim()
        )
        // Drop fragments shorter than 2 characters.
        .filter(sentence => sentence.length >= 2);
}

/**
 * Card that collects text from the user, either from an uploaded or dropped
 * .txt/.csv file or from a pasted block, splits it into sentences and hands
 * them to `onSentencesLoaded`.
 */
export default function TextInput({ onSentencesLoaded }: TextInputProps) {
    const [text, setText] = useState('');
    const fileInputRef = useRef<HTMLInputElement>(null);

    /**
     * Splits `inputText` and forwards the result to the parent.
     * Clears the textarea on success and reports the outcome via toast.
     *
     * @returns true when at least one sentence was loaded.
     */
    const processText = (inputText: string): boolean => {
        if (!inputText.trim()) {
            toast.error('Please enter some text');
            return false;
        }

        const sentences = splitIntoSentences(inputText);
        if (sentences.length === 0) {
            toast.error('No valid sentences found. Try putting each sentence on a new line.');
            return false;
        }

        onSentencesLoaded(sentences);
        toast.success(`Loaded ${sentences.length} sentence${sentences.length > 1 ? 's' : ''}`);
        setText('');
        return true;
    };

    const handleTextChange = (e: React.ChangeEvent<HTMLTextAreaElement>) => {
        setText(e.target.value);
    };

    const handlePaste = () => {
        processText(text);
    };

    /** Reads `file` as text and processes it; shared by the picker and drop paths. */
    const readFile = (file: File) => {
        const reader = new FileReader();
        reader.onload = () => {
            const content = reader.result;
            // readAsText yields a string; narrow instead of asserting the type.
            if (typeof content !== 'string') {
                toast.error('Failed to read file');
                return;
            }
            if (processText(content)) {
                toast.success(`File loaded: ${file.name}`);
            } else {
                // Keep the content visible so the user can fix it by hand;
                // processText has already shown the error toast.
                setText(content);
            }
        };
        reader.onerror = () => toast.error('Failed to read file');
        reader.readAsText(file);
    };

    const handleFileUpload = (e: React.ChangeEvent<HTMLInputElement>) => {
        const file = e.target.files?.[0];
        // Reset the input so selecting the same file again fires onChange.
        e.target.value = '';
        if (!file) return;
        readFile(file);
    };

    const handleDragOver = (e: React.DragEvent<HTMLDivElement>) => {
        // Required for the element to be accepted as a drop target.
        e.preventDefault();
    };

    const handleDrop = (e: React.DragEvent<HTMLDivElement>) => {
        // Stop the browser from navigating to the dropped file.
        e.preventDefault();
        const file = e.dataTransfer.files.item(0);
        if (!file) return;
        // Drops bypass the input's `accept` filter, so enforce it here.
        if (!/\.(txt|csv)$/i.test(file.name)) {
            toast.error('Only .txt and .csv files are supported');
            return;
        }
        readFile(file);
    };

    return (
        <Card>
            <CardHeader>
                <CardTitle className="text-lg flex items-center gap-2">
                    <FileText className="w-4 h-4" />
                    Input Data
                </CardTitle>
            </CardHeader>
            <CardContent className="space-y-4">
                <div
                    className="border-2 border-dashed border-border rounded-xl p-6 text-center hover:bg-secondary/50 transition-colors cursor-pointer relative group"
                    onClick={() => fileInputRef.current?.click()}
                    onDragOver={handleDragOver}
                    onDrop={handleDrop}
                >
                    <input
                        type="file"
                        accept=".txt,.csv"
                        ref={fileInputRef}
                        className="hidden"
                        onChange={handleFileUpload}
                    />
                    <Upload className="w-8 h-8 mx-auto mb-2 text-muted-foreground group-hover:text-primary transition-colors" />
                    <p className="text-sm font-medium">Drop text file or click to upload</p>
                    <p className="text-xs text-muted-foreground mt-1">.txt and .csv files supported</p>
                </div>

                <div className="relative">
                    <div className="absolute inset-0 flex items-center">
                        <span className="w-full border-t border-border" />
                    </div>
                    <div className="relative flex justify-center text-xs uppercase">
                        <span className="bg-card px-2 text-muted-foreground">Or paste text</span>
                    </div>
                </div>

                <div className="space-y-2">
                    <textarea
                        className="input min-h-[120px] resize-y"
                        placeholder="Paste your text here...&#10;&#10;• One sentence per line works best&#10;• Or paste a paragraph - it will be split automatically"
                        value={text}
                        onChange={handleTextChange}
                    />
                    <div className="flex gap-2">
                        <button
                            type="button"
                            onClick={handlePaste}
                            disabled={!text.trim()}
                            className="btn btn-primary flex-1"
                        >
                            Load Sentences
                        </button>
                        <button
                            type="button"
                            onClick={() => setText('')}
                            disabled={!text.trim()}
                            className="btn btn-secondary"
                        >
                            Clear
                        </button>
                    </div>
                    <p className="text-xs text-muted-foreground text-center">
                        Tip: For best results, put each sentence on a new line
                    </p>
                </div>
            </CardContent>
        </Card>
    );
}