prachi1507 committed on
Commit
3909dfe
·
verified ·
1 Parent(s): cd41c00

create app.py

Browse files
Files changed (1) hide show
  1. app.py +461 -0
app.py ADDED
@@ -0,0 +1,461 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import whisper
3
+ import tempfile
4
+ import os
5
+ import torch
6
+ from datetime import datetime
7
+ import warnings
8
+ import gc
9
+
10
+ # Suppress warnings
11
+ warnings.filterwarnings("ignore")
12
+
13
+ # Configure Streamlit page
14
+ st.set_page_config(
15
+ page_title="Audio Transcriber & Translator",
16
+ page_icon="🎡",
17
+ layout="centered"
18
+ )
19
+
20
+ # Custom CSS for better UI
21
+ st.markdown("""
22
+ <style>
23
+ .main-header {
24
+ text-align: center;
25
+ padding: 2rem 0;
26
+ background: linear-gradient(90deg, #667eea 0%, #764ba2 100%);
27
+ color: white;
28
+ border-radius: 10px;
29
+ margin-bottom: 2rem;
30
+ }
31
+ .result-section {
32
+ background: #f8f9fa;
33
+ padding: 1.5rem;
34
+ border-radius: 10px;
35
+ margin: 1rem 0;
36
+ border-left: 4px solid #667eea;
37
+ }
38
+ .download-section {
39
+ background: #e8f5e8;
40
+ padding: 1.5rem;
41
+ border-radius: 10px;
42
+ margin-top: 1.5rem;
43
+ text-align: center;
44
+ }
45
+ .language-badge {
46
+ background: #667eea;
47
+ color: white;
48
+ padding: 0.5rem 1rem;
49
+ border-radius: 20px;
50
+ font-weight: bold;
51
+ display: inline-block;
52
+ margin-bottom: 1rem;
53
+ }
54
+ .warning-box {
55
+ background: #fff3cd;
56
+ border: 1px solid #ffeaa7;
57
+ padding: 1rem;
58
+ border-radius: 8px;
59
+ margin: 1rem 0;
60
+ }
61
+ </style>
62
+ """, unsafe_allow_html=True)
63
+
64
+ class M2M100Translator:
65
+ def __init__(self):
66
+ self.model_name = "facebook/m2m100_418M"
67
+ self.tokenizer = None
68
+ self.model = None
69
+
70
+ # M2M100 language codes
71
+ self.supported_languages = {
72
+ 'af': 'Afrikaans', 'ar': 'Arabic', 'bg': 'Bulgarian', 'bn': 'Bengali',
73
+ 'ca': 'Catalan', 'cs': 'Czech', 'da': 'Danish', 'de': 'German',
74
+ 'el': 'Greek', 'en': 'English', 'es': 'Spanish', 'et': 'Estonian',
75
+ 'fa': 'Persian', 'fi': 'Finnish', 'fr': 'French', 'gu': 'Gujarati',
76
+ 'he': 'Hebrew', 'hi': 'Hindi', 'hr': 'Croatian', 'hu': 'Hungarian',
77
+ 'id': 'Indonesian', 'it': 'Italian', 'ja': 'Japanese', 'ka': 'Georgian',
78
+ 'kk': 'Kazakh', 'km': 'Khmer', 'kn': 'Kannada', 'ko': 'Korean',
79
+ 'lt': 'Lithuanian', 'lv': 'Latvian', 'mk': 'Macedonian', 'ml': 'Malayalam',
80
+ 'mn': 'Mongolian', 'mr': 'Marathi', 'ms': 'Malay', 'my': 'Myanmar',
81
+ 'ne': 'Nepali', 'nl': 'Dutch', 'no': 'Norwegian', 'pl': 'Polish',
82
+ 'pt': 'Portuguese', 'ro': 'Romanian', 'ru': 'Russian', 'si': 'Sinhala',
83
+ 'sk': 'Slovak', 'sl': 'Slovenian', 'sq': 'Albanian', 'sr': 'Serbian',
84
+ 'sv': 'Swedish', 'sw': 'Swahili', 'ta': 'Tamil', 'te': 'Telugu',
85
+ 'th': 'Thai', 'tl': 'Tagalog', 'tr': 'Turkish', 'uk': 'Ukrainian',
86
+ 'ur': 'Urdu', 'vi': 'Vietnamese', 'zh': 'Chinese'
87
+ }
88
+
89
+ def load_model(self):
90
+ if self.model is None:
91
+ try:
92
+ from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer
93
+
94
+ with st.spinner("πŸ”„ Loading M2M100 translation model..."):
95
+ # Load tokenizer and model - simplified for HF Spaces
96
+ self.tokenizer = M2M100Tokenizer.from_pretrained(self.model_name)
97
+ self.model = M2M100ForConditionalGeneration.from_pretrained(
98
+ self.model_name,
99
+ torch_dtype=torch.float32 # Use float32 for CPU compatibility
100
+ )
101
+
102
+ st.success("βœ… Translation model loaded successfully!")
103
+
104
+ except Exception as e:
105
+ st.error(f"❌ Failed to load translation model: {str(e)}")
106
+ st.info("πŸ’‘ Translation will be skipped. You can still get transcripts.")
107
+ return False
108
+ return True
109
+
110
+ def get_language_name(self, lang_code):
111
+ return self.supported_languages.get(lang_code, lang_code.upper())
112
+
113
+ def translate_text(self, text, source_language):
114
+ if not text or not text.strip():
115
+ return {"success": False, "error": "Empty text provided"}
116
+
117
+ # If already English, return as is
118
+ if source_language == 'en':
119
+ return {
120
+ "success": True,
121
+ "original_text": text,
122
+ "translated_text": text,
123
+ "source_language": source_language,
124
+ "note": "Source is already English"
125
+ }
126
+
127
+ # Check if source language is supported
128
+ if source_language not in self.supported_languages:
129
+ return {
130
+ "success": False,
131
+ "error": f"Language '{source_language}' not supported",
132
+ "original_text": text,
133
+ "source_language": source_language
134
+ }
135
+
136
+ if not self.load_model():
137
+ return {
138
+ "success": False,
139
+ "error": "Translation model not available",
140
+ "original_text": text,
141
+ "source_language": source_language
142
+ }
143
+
144
+ try:
145
+ # Set source language
146
+ self.tokenizer.src_lang = source_language
147
+
148
+ # Tokenize input with length limits for HF Spaces
149
+ inputs = self.tokenizer(
150
+ text,
151
+ return_tensors="pt",
152
+ padding=True,
153
+ truncation=True,
154
+ max_length=200 # Reduced for faster processing
155
+ )
156
+
157
+ # Generate translation
158
+ with torch.no_grad():
159
+ generated_tokens = self.model.generate(
160
+ **inputs,
161
+ forced_bos_token_id=self.tokenizer.get_lang_id("en"),
162
+ max_length=250,
163
+ num_beams=2, # Reduced beams for speed
164
+ early_stopping=True,
165
+ do_sample=False
166
+ )
167
+
168
+ # Decode translation
169
+ translated_text = self.tokenizer.batch_decode(
170
+ generated_tokens,
171
+ skip_special_tokens=True
172
+ )[0]
173
+
174
+ # Clear memory
175
+ del inputs, generated_tokens
176
+ gc.collect()
177
+
178
+ return {
179
+ "success": True,
180
+ "original_text": text,
181
+ "translated_text": translated_text.strip(),
182
+ "source_language": source_language,
183
+ "model_used": self.model_name
184
+ }
185
+
186
+ except Exception as e:
187
+ return {
188
+ "success": False,
189
+ "error": str(e),
190
+ "original_text": text,
191
+ "source_language": source_language
192
+ }
193
+
194
+ @st.cache_resource
195
+ def load_whisper_model():
196
+ """Load Whisper model with caching - optimized for HF Spaces"""
197
+ try:
198
+ # Use tiny model for faster loading and processing on HF Spaces
199
+ model = whisper.load_model("tiny")
200
+ return model
201
+ except Exception as e:
202
+ st.error(f"Failed to load Whisper model: {e}")
203
+ return None
204
+
205
+ @st.cache_resource
206
+ def load_translator():
207
+ """Load translator with caching"""
208
+ return M2M100Translator()
209
+
210
+ def transcribe_audio(audio_file):
211
+ """Transcribe uploaded audio file"""
212
+ try:
213
+ # Create temporary file
214
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
215
+ tmp_file.write(audio_file.read())
216
+ tmp_file_path = tmp_file.name
217
+
218
+ model = load_whisper_model()
219
+ if model is None:
220
+ return {"success": False, "error": "Whisper model not available"}
221
+
222
+ # Transcribe with optimized settings for HF Spaces
223
+ result = model.transcribe(
224
+ tmp_file_path,
225
+ fp16=False, # Use fp32 for better compatibility
226
+ task="transcribe"
227
+ )
228
+
229
+ # Clean up
230
+ os.unlink(tmp_file_path)
231
+ gc.collect()
232
+
233
+ return {
234
+ "success": True,
235
+ "transcript": result["text"].strip(),
236
+ "language": result["language"]
237
+ }
238
+
239
+ except Exception as e:
240
+ if 'tmp_file_path' in locals():
241
+ try:
242
+ os.unlink(tmp_file_path)
243
+ except:
244
+ pass
245
+ return {"success": False, "error": str(e)}
246
+
247
+ def main():
248
+ # Header
249
+ st.markdown("""
250
+ <div class="main-header">
251
+ <h1>🎡 Audio Transcriber & Translator</h1>
252
+ <p>Upload audio files and get transcripts with English translation</p>
253
+ <small>Optimized for Hugging Face Spaces</small>
254
+ </div>
255
+ """, unsafe_allow_html=True)
256
+
257
+ # HF Spaces notice
258
+ st.markdown("""
259
+ <div class="warning-box">
260
+ <strong>πŸš€ Hugging Face Spaces Version</strong><br>
261
+ β€’ Using Whisper-tiny for faster processing<br>
262
+ β€’ File limit: 10MB, Duration: 5 minutes<br>
263
+ β€’ Processing may take 1-2 minutes
264
+ </div>
265
+ """, unsafe_allow_html=True)
266
+
267
+ # Show system info in sidebar
268
+ with st.sidebar:
269
+ st.header("πŸ”§ System Info")
270
+ st.info("Running on Hugging Face Spaces")
271
+ st.info(f"PyTorch: {torch.__version__}")
272
+ st.warning("Using CPU (optimized for HF Spaces)")
273
+
274
+ st.header("🌍 Models")
275
+ st.info("β€’ Whisper: tiny (fast)")
276
+ st.info("β€’ Translation: M2M100-418M")
277
+
278
+ with st.expander("πŸ’‘ Tips"):
279
+ st.caption("β€’ Use shorter audio files (< 5 min)")
280
+ st.caption("β€’ MP3/WAV work best")
281
+ st.caption("β€’ Clear speech gives better results")
282
+ st.caption("β€’ Processing takes 1-2 minutes")
283
+
284
+ # File uploader with restrictions for HF Spaces
285
+ uploaded_file = st.file_uploader(
286
+ "🎡 Choose an audio file",
287
+ type=['mp3', 'wav', 'mp4', 'm4a'],
288
+ help="Supported: MP3, WAV, MP4, M4A | Max: 10MB, 5 minutes"
289
+ )
290
+
291
+ if uploaded_file is not None:
292
+ # File size check
293
+ file_size_mb = uploaded_file.size / (1024 * 1024)
294
+
295
+ if file_size_mb > 10:
296
+ st.error("❌ File too large! Please use files under 10MB for optimal performance on HF Spaces.")
297
+ return
298
+
299
+ st.success(f"πŸ“ **{uploaded_file.name}** ({file_size_mb:.2f} MB)")
300
+
301
+ # Processing options
302
+ col1, col2 = st.columns(2)
303
+ with col1:
304
+ transcribe_only = st.checkbox("Transcribe only (faster)", value=False)
305
+ with col2:
306
+ if st.button("🧹 Clear Cache", help="Clear models from memory"):
307
+ st.cache_resource.clear()
308
+ st.success("Cache cleared!")
309
+
310
+ # Process button
311
+ if st.button("πŸš€ Process Audio", type="primary", use_container_width=True):
312
+ start_time = datetime.now()
313
+
314
+ # Step 1: Transcription
315
+ with st.spinner("🎀 Transcribing audio... (this may take 1-2 minutes)"):
316
+ transcription_result = transcribe_audio(uploaded_file)
317
+
318
+ if transcription_result["success"]:
319
+ transcript = transcription_result["transcript"]
320
+ detected_language = transcription_result["language"]
321
+
322
+ # Get language name
323
+ translator = load_translator()
324
+ language_name = translator.get_language_name(detected_language)
325
+
326
+ # Display transcription results
327
+ st.markdown("""
328
+ <div class="result-section">
329
+ <h3>πŸ“ Transcription Results</h3>
330
+ </div>
331
+ """, unsafe_allow_html=True)
332
+
333
+ # Language badge
334
+ st.markdown(f"""
335
+ <div class="language-badge">
336
+ 🌍 Detected: {language_name} ({detected_language})
337
+ </div>
338
+ """, unsafe_allow_html=True)
339
+
340
+ # Transcript
341
+ st.text_area(
342
+ "Original Transcript",
343
+ transcript,
344
+ height=150,
345
+ key="transcript"
346
+ )
347
+
348
+ # Step 2: Translation (if requested)
349
+ if not transcribe_only and detected_language != 'en':
350
+ with st.spinner("🌍 Translating to English..."):
351
+ translation_result = translator.translate_text(transcript, detected_language)
352
+
353
+ if translation_result["success"]:
354
+ translated_text = translation_result["translated_text"]
355
+
356
+ st.markdown("""
357
+ <div class="result-section">
358
+ <h3>🌍 English Translation</h3>
359
+ </div>
360
+ """, unsafe_allow_html=True)
361
+
362
+ st.text_area(
363
+ "English Translation",
364
+ translated_text,
365
+ height=150,
366
+ key="translation"
367
+ )
368
+
369
+ # Download section
370
+ st.markdown("""
371
+ <div class="download-section">
372
+ <h4>πŸ“₯ Download Results</h4>
373
+ </div>
374
+ """, unsafe_allow_html=True)
375
+
376
+ # Prepare download content
377
+ full_content = f"""Audio Transcription & Translation
378
+ {'='*60}
379
+ File: {uploaded_file.name}
380
+ Size: {file_size_mb:.2f} MB
381
+ Detected Language: {language_name} ({detected_language})
382
+ Processing Time: {(datetime.now() - start_time).total_seconds():.1f} seconds
383
+ Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
384
+ {'='*60}
385
+
386
+ ORIGINAL TRANSCRIPT ({language_name}):
387
+ {transcript}
388
+
389
+ ENGLISH TRANSLATION:
390
+ {translated_text}
391
+
392
+ {'='*60}
393
+ Processed with Whisper (tiny) + M2M100 on Hugging Face Spaces
394
+ """
395
+
396
+ st.download_button(
397
+ "πŸ“„ Download Complete Results",
398
+ full_content,
399
+ file_name=f"{os.path.splitext(uploaded_file.name)[0]}_results.txt",
400
+ mime="text/plain",
401
+ use_container_width=True
402
+ )
403
+
404
+ else:
405
+ st.error(f"❌ Translation failed: {translation_result['error']}")
406
+ # Still offer transcript download
407
+ transcript_content = f"""Audio Transcription
408
+ {'='*50}
409
+ File: {uploaded_file.name}
410
+ Language: {language_name} ({detected_language})
411
+ Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
412
+ {'='*50}
413
+
414
+ {transcript}
415
+ """
416
+ st.download_button(
417
+ "πŸ“„ Download Transcript",
418
+ transcript_content,
419
+ file_name=f"{os.path.splitext(uploaded_file.name)[0]}_transcript.txt",
420
+ mime="text/plain"
421
+ )
422
+
423
+ elif transcribe_only or detected_language == 'en':
424
+ # Transcript only
425
+ transcript_content = f"""Audio Transcription
426
+ {'='*50}
427
+ File: {uploaded_file.name}
428
+ Language: {language_name} ({detected_language})
429
+ Processing Time: {(datetime.now() - start_time).total_seconds():.1f} seconds
430
+ Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}
431
+ {'='*50}
432
+
433
+ {transcript}
434
+ """
435
+ st.download_button(
436
+ "πŸ“„ Download Transcript",
437
+ transcript_content,
438
+ file_name=f"{os.path.splitext(uploaded_file.name)[0]}_transcript.txt",
439
+ mime="text/plain",
440
+ use_container_width=True
441
+ )
442
+
443
+ # Show processing time
444
+ processing_time = (datetime.now() - start_time).total_seconds()
445
+ st.success(f"βœ… Processing completed in {processing_time:.1f} seconds")
446
+
447
+ else:
448
+ st.error(f"❌ Transcription failed: {transcription_result['error']}")
449
+ st.info("πŸ’‘ Try with a different audio file or format")
450
+
451
+ # Footer
452
+ st.markdown("---")
453
+ st.markdown("""
454
+ <div style="text-align: center; color: #666; padding: 1rem;">
455
+ <p>🎡 Powered by OpenAI Whisper & Facebook M2M100</p>
456
+ <p>Running on Hugging Face Spaces πŸ€—</p>
457
+ </div>
458
+ """, unsafe_allow_html=True)
459
+
460
+ if __name__ == "__main__":
461
+ main()