nihun commited on
Commit
85c18a5
·
verified ·
1 Parent(s): db1995c

Upload 19 files

Browse files
Dockerfile CHANGED
@@ -1,20 +1,52 @@
1
- FROM python:3.13.5-slim
2
-
3
- WORKDIR /app
4
-
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- git \
9
- && rm -rf /var/lib/apt/lists/*
10
-
11
- COPY requirements.txt ./
12
- COPY src/ ./src/
13
-
14
- RUN pip3 install -r requirements.txt
15
-
16
- EXPOSE 8501
17
-
18
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
19
-
20
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Use Python base image
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Install system dependencies including FFmpeg.
# - curl is required by the HEALTHCHECK below (it was missing, so the
#   healthcheck could never succeed).
# - libgl1 (not the typo "libgl1-mesa-glb", which is not a real package and
#   would fail the apt-get step) provides the OpenGL runtime imaging libs expect.
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    curl \
    libsm6 \
    libxext6 \
    libgl1 \
    git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user for security (required by HF Spaces)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set working directory for user
WORKDIR $HOME/app

# Copy requirements first (for layer caching)
COPY --chown=user:user requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY --chown=user:user . .

# Create temp directory
RUN mkdir -p temp

# Expose Streamlit port
EXPOSE 7860

# Health check (curl is installed above)
HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health || exit 1

# Set environment variables for Streamlit
ENV STREAMLIT_SERVER_PORT=7860 \
    STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
    STREAMLIT_SERVER_HEADLESS=true \
    STREAMLIT_BROWSER_GATHER_USAGE_STATS=false

# Run the application
CMD ["streamlit", "run", "app.py"]
app.py ADDED
@@ -0,0 +1,607 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ 🎌 Anime Translator with Lip-Sync
3
+ =================================
4
+
5
+ A Streamlit application that translates text between English and Hindi,
6
+ converts it to speech, and generates a lip-synced anime avatar animation.
7
+ """
8
+
9
+ import streamlit as st
10
+ from pathlib import Path
11
+ import tempfile
12
+ import time
13
+ import shutil
14
+ import os
15
+ import subprocess
16
+ from shutil import which
17
+ from typing import Tuple, Optional
18
+
19
+ # Import utility modules
20
+ from utils.translator import translate_text, detect_language
21
+ from utils.tts_engine import synthesize_speech, get_audio_duration
22
+ from utils.lipsync import generate_lipsync_gif
23
+ from utils.speech_to_text import transcribe_audio, get_language_code
24
+ from utils.avatar_manager import list_avatars, get_avatar_preview, ensure_sample_avatar
25
+
26
+ # =============================================================================
27
+ # FFmpeg Configuration
28
+ # =============================================================================
29
+
30
def configure_ffmpeg():
    """Make FFmpeg discoverable for pydub (Windows helper).

    Returns:
        True if ffmpeg is already on PATH, or was found in a known Windows
        install location (in which case PATH is patched and pydub's
        converter/ffprobe paths are pointed at the binaries); False otherwise.
    """
    # Candidate Windows install locations; a harmless no-op on other OSes
    # because none of these paths will exist.
    possible_paths = [
        r"C:\ffmpeg\bin",
        r"C:\Program Files\ffmpeg\bin",
        r"C:\Program Files (x86)\ffmpeg\bin",
        os.path.expanduser("~\\ffmpeg\\bin"),
        r"C:\Users\Nishant Pratap\ffmpeg\bin",  # user-specific path
    ]

    # Nothing to do when ffmpeg already resolves on PATH.
    if which("ffmpeg") is not None:
        return True

    for path in possible_paths:
        ffmpeg_exe = os.path.join(path, "ffmpeg.exe")
        if os.path.exists(ffmpeg_exe):
            # Prepend so this install wins over any stale PATH entries.
            os.environ["PATH"] = path + os.pathsep + os.environ.get("PATH", "")
            # Also point pydub directly at the binaries.
            # BUG FIX: the original used a bare `except:`, which also swallows
            # SystemExit/KeyboardInterrupt; narrow it to Exception.
            try:
                from pydub import AudioSegment
                AudioSegment.converter = os.path.join(path, "ffmpeg.exe")
                AudioSegment.ffprobe = os.path.join(path, "ffprobe.exe")
            except Exception:
                # pydub missing/broken: the PATH patch alone is still useful.
                pass
            return True

    return False
57
+
58
+
59
def check_ffmpeg_detailed():
    """Probe the FFmpeg/pydub toolchain and return a detailed status dict.

    Returns:
        Dict with boolean keys ``ffmpeg_in_path``, ``ffmpeg_works``,
        ``ffprobe_works``, ``pydub_works`` and an ``error_message`` string
        (or None) describing the first failure encountered.
    """
    report = {
        "ffmpeg_in_path": which("ffmpeg") is not None,
        "ffmpeg_works": False,
        "ffprobe_works": False,
        "pydub_works": False,
        "error_message": None,
    }

    # Running the binary with -version is the cheapest end-to-end check;
    # a zero exit code means the executable is actually usable.
    try:
        proc = subprocess.run(
            ["ffmpeg", "-version"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        report["ffmpeg_works"] = proc.returncode == 0
    except Exception as exc:
        report["error_message"] = str(exc)

    try:
        proc = subprocess.run(
            ["ffprobe", "-version"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        report["ffprobe_works"] = proc.returncode == 0
    except Exception:
        pass

    # Generating a tiny silent segment exercises pydub without needing
    # ffmpeg to decode any real audio.
    try:
        from pydub import AudioSegment
        AudioSegment.silent(duration=100)
        report["pydub_works"] = True
    except Exception as exc:
        report["pydub_works"] = False
        if not report["error_message"]:
            report["error_message"] = str(exc)

    return report
104
+
105
+
106
# Patch PATH / pydub once at import time so later audio work can find FFmpeg.
ffmpeg_found = configure_ffmpeg()

# =============================================================================
# Configuration
# =============================================================================

# Avatar art ships with the app; generated artifacts go to the OS temp dir.
AVATARS_DIR = Path("./avatars")
TEMP_DIR = Path(tempfile.gettempdir()) / "anime_translator"

AVATARS_DIR.mkdir(parents=True, exist_ok=True)
TEMP_DIR.mkdir(parents=True, exist_ok=True)

# Page configuration
st.set_page_config(
    page_title="🎌 Anime Translator",
    page_icon="🎌",
    layout="wide",
    initial_sidebar_state="expanded"
)

# =============================================================================
# Custom CSS Styling
# =============================================================================

# Global theme: dark gradient background, gradient buttons/tabs, and hidden
# Streamlit chrome (main menu + footer).
st.markdown("""
<style>
    .main {
        background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
    }

    .main-header {
        background: linear-gradient(90deg, #e94560, #ff6b6b);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        font-size: 3rem;
        font-weight: bold;
        text-align: center;
        padding: 1rem;
        margin-bottom: 2rem;
    }

    .stButton > button {
        background: linear-gradient(90deg, #e94560, #ff6b6b);
        color: white;
        border: none;
        border-radius: 25px;
        padding: 0.75rem 2rem;
        font-weight: bold;
        transition: all 0.3s ease;
        width: 100%;
    }

    .stButton > button:hover {
        transform: translateY(-2px);
        box-shadow: 0 5px 20px rgba(233, 69, 96, 0.4);
    }

    .result-box {
        background: linear-gradient(135deg, rgba(233, 69, 96, 0.1), rgba(255, 107, 107, 0.1));
        border-radius: 15px;
        padding: 1.5rem;
        border: 1px solid rgba(233, 69, 96, 0.3);
        margin: 1rem 0;
    }

    .info-box {
        background: rgba(100, 200, 255, 0.1);
        border-left: 4px solid #64c8ff;
        padding: 1rem;
        border-radius: 0 10px 10px 0;
        margin: 1rem 0;
    }

    .success-box {
        background: rgba(100, 255, 150, 0.1);
        border-left: 4px solid #64ff96;
        padding: 1rem;
        border-radius: 0 10px 10px 0;
    }

    #MainMenu {visibility: hidden;}
    footer {visibility: hidden;}

    .stTabs [data-baseweb="tab-list"] {
        gap: 8px;
    }

    .stTabs [data-baseweb="tab"] {
        background: rgba(255, 255, 255, 0.05);
        border-radius: 10px;
        padding: 10px 20px;
    }

    .stTabs [aria-selected="true"] {
        background: linear-gradient(90deg, #e94560, #ff6b6b);
    }
</style>
""", unsafe_allow_html=True)
204
+
205
+ # =============================================================================
206
+ # Helper Functions
207
+ # =============================================================================
208
+
209
def cleanup_temp_files(older_than_sec: int = 3600) -> None:
    """Delete entries in TEMP_DIR older than the given age, best effort.

    Args:
        older_than_sec: Age cutoff in seconds; 0 removes everything.
    """
    cutoff = time.time() - older_than_sec
    try:
        for entry in TEMP_DIR.iterdir():
            try:
                if entry.stat().st_mtime < cutoff:
                    if entry.is_dir():
                        shutil.rmtree(entry)
                    elif entry.is_file():
                        entry.unlink()
            except Exception:
                # Skip entries we cannot stat or remove (e.g. in use).
                pass
    except Exception:
        # TEMP_DIR missing or unreadable — cleanup is strictly best effort.
        pass
224
+
225
+
226
def process_translation_pipeline(
    text: str,
    source_lang: str,
    target_lang: str,
    avatar_name: str
) -> Tuple[str, Optional[str], Optional[str]]:
    """Run the full pipeline: translate -> synthesize speech -> animate.

    Args:
        text: Input text to translate.
        source_lang: Source language code ("auto", "en", "hi").
        target_lang: Target language code ("en", "hi").
        avatar_name: Avatar folder name used for the lip-sync animation.

    Returns:
        (translated_text, audio_path, gif_path). ``gif_path`` is None when
        animation generation fails — the pipeline degrades gracefully.

    Raises:
        Exception: if translation or speech synthesis fails (both fatal).
    """
    # Step 1: translation — a failure here aborts the whole pipeline.
    try:
        translated = translate_text(text, source_lang, target_lang)
    except Exception as err:
        raise Exception(f"Translation failed: {str(err)}")

    # Step 2: text-to-speech — also fatal on failure.
    try:
        speech_path = synthesize_speech(translated, target_lang, TEMP_DIR)
    except Exception as err:
        raise Exception(f"Speech synthesis failed: {str(err)}")

    # Step 3: lip-sync animation — optional; fall back to audio-only output.
    animation_path = None
    try:
        animation_path = generate_lipsync_gif(
            avatar_name=avatar_name,
            audio_path=speech_path,
            avatars_dir=AVATARS_DIR,
            output_dir=TEMP_DIR,
            fps=12,
        )
    except Exception as err:
        print(f"Animation generation warning: {str(err)}")
        animation_path = None

    return translated, speech_path, animation_path
262
+
263
+
264
+ # =============================================================================
265
+ # Sidebar
266
+ # =============================================================================
267
+
268
# Sidebar: avatar selection, language settings, system status, and cleanup.
with st.sidebar:
    st.markdown("## ⚙️ Settings")

    # Avatar selection — shows a composited preview of the chosen avatar.
    st.markdown("### 🎭 Avatar Selection")
    avatars = list_avatars(AVATARS_DIR)

    if avatars:
        selected_avatar = st.selectbox(
            "Choose your avatar",
            options=avatars,
            index=0,
            help="Select an anime avatar for lip-sync animation"
        )

        preview = get_avatar_preview(selected_avatar, AVATARS_DIR)
        if preview:
            st.image(preview, caption=f"Preview: {selected_avatar}", width="stretch")
    else:
        # First run: generate the bundled sample avatar, then rerun the
        # script so the selectbox above picks it up.
        st.warning("No avatars found. Creating sample avatar...")
        ensure_sample_avatar(AVATARS_DIR)
        selected_avatar = "sample"
        st.rerun()

    st.markdown("---")

    # Language settings
    st.markdown("### 🌐 Language Settings")

    source_language = st.selectbox(
        "Source Language",
        options=["auto", "en", "hi"],
        format_func=lambda x: {"auto": "🔄 Auto-detect", "en": "🇬🇧 English", "hi": "🇮🇳 Hindi"}[x],
        index=0
    )

    target_language = st.selectbox(
        "Target Language",
        options=["en", "hi"],
        format_func=lambda x: {"en": "🇬🇧 English", "hi": "🇮🇳 Hindi"}[x],
        index=1
    )

    st.markdown("---")

    # System status — surfaces FFmpeg/pydub health so users understand why
    # animation may be running in fallback mode.
    st.markdown("### 🔧 System Status")

    ffmpeg_status = check_ffmpeg_detailed()

    if ffmpeg_status["ffmpeg_works"]:
        st.success("✅ FFmpeg: Working")
    else:
        st.error("❌ FFmpeg: Not working")

    if ffmpeg_status["pydub_works"]:
        st.success("✅ Pydub: Working")
    else:
        st.warning("⚠️ Pydub: Limited (fallback mode)")

    if ffmpeg_status["error_message"]:
        with st.expander("🔍 Error Details"):
            st.code(ffmpeg_status["error_message"])
            st.markdown("""
            **To fix FFmpeg:**
            ```bash
            conda install -c conda-forge ffmpeg
            ```
            Or download from: https://www.gyan.dev/ffmpeg/builds/
            """)

    st.markdown("---")

    # Info section
    st.markdown("### ℹ️ About")
    st.markdown("""
    Translate text between English and Hindi with lip-synced avatar animation.

    **Features:**
    - 🎤 Voice input
    - 🔄 Auto detection
    - 🗣️ Text-to-speech
    - 🎬 Lip-sync animation
    """)

    # Immediate cleanup: older_than_sec=0 removes everything in TEMP_DIR.
    if st.button("🧹 Clear Temp Files"):
        cleanup_temp_files(older_than_sec=0)
        st.success("Cleared!")
356
+
357
+
358
+ # =============================================================================
359
+ # Main Content
360
+ # =============================================================================
361
+
362
# Page title and tagline (styled via the injected CSS classes above).
st.markdown('<h1 class="main-header">🎌 Anime Translator</h1>', unsafe_allow_html=True)
st.markdown(
    '<p style="text-align: center; color: #888; font-size: 1.2rem;">'
    'Translate • Speak • Animate</p>',
    unsafe_allow_html=True
)

# Two input modes: typed text vs. uploaded/recorded voice.
tab1, tab2 = st.tabs(["📝 Text Input", "🎤 Voice Input"])
371
+
372
+ # =============================================================================
373
+ # Tab 1: Text Input
374
+ # =============================================================================
375
+
376
# Tab 1: free-text translation with speech + lip-sync animation output.
with tab1:
    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown("### 📝 Enter Your Text")

        text_input = st.text_area(
            "Type or paste your text here",
            height=150,
            placeholder="Enter text in English or Hindi...\nउदाहरण: नमस्ते, आप कैसे हैं?\nExample: Hello, how are you?",
            key="text_input"
        )

        # Live character count + language auto-detection feedback.
        if text_input:
            detected = detect_language(text_input)
            st.markdown(
                f'<div class="info-box">'
                f'📊 Characters: {len(text_input)} | '
                f'🔍 Detected: {"🇮🇳 Hindi" if detected == "hi" else "🇬🇧 English"}'
                f'</div>',
                unsafe_allow_html=True
            )

        translate_btn = st.button(
            "🚀 Translate & Animate",
            key="translate_text_btn",
            use_container_width=True
        )

    with col2:
        st.markdown("### 🎬 Result")

        if translate_btn and text_input:
            with st.spinner("🔄 Processing..."):
                progress = st.progress(0)
                status_text = st.empty()

                try:
                    status_text.text("📝 Translating...")
                    progress.progress(33)

                    translated, audio_path, gif_path = process_translation_pipeline(
                        text_input,
                        source_language,
                        target_language,
                        selected_avatar
                    )

                    # NOTE(review): the whole pipeline runs synchronously in
                    # the call above, so these later progress updates are
                    # cosmetic rather than real milestones.
                    status_text.text("🗣️ Generating speech...")
                    progress.progress(66)

                    status_text.text("🎬 Creating animation...")
                    progress.progress(100)

                    progress.empty()
                    status_text.empty()

                    # Display translated text
                    st.markdown(
                        f'<div class="result-box">'
                        f'<h4>📜 Translated Text:</h4>'
                        f'<p style="font-size: 1.2rem;">{translated}</p>'
                        f'</div>',
                        unsafe_allow_html=True
                    )

                    # Audio player
                    if audio_path and os.path.exists(audio_path):
                        st.markdown("#### 🔊 Audio")
                        st.audio(audio_path, format="audio/mp3")

                    # Animation display — gif_path is None when generation
                    # failed (e.g. FFmpeg missing), so degrade gracefully.
                    if gif_path and os.path.exists(gif_path):
                        st.markdown("#### 🎭 Lip-Sync Animation")
                        st.image(gif_path, width="stretch")

                        with open(gif_path, "rb") as f:
                            st.download_button(
                                label="📥 Download Animation",
                                data=f,
                                file_name="lipsync_animation.gif",
                                mime="image/gif"
                            )
                    else:
                        st.info("ℹ️ Animation not available (FFmpeg may be missing)")

                except Exception as e:
                    progress.empty()
                    status_text.empty()
                    st.error(f"❌ Error: {str(e)}")

        elif translate_btn:
            st.warning("⚠️ Please enter some text to translate.")
469
+
470
+
471
+ # =============================================================================
472
+ # Tab 2: Voice Input
473
+ # =============================================================================
474
+
475
# Tab 2: voice input — upload or record audio, transcribe it, then reuse the
# same translate/speak/animate pipeline as Tab 1.
with tab2:
    col1, col2 = st.columns([1, 1])

    with col1:
        st.markdown("### 🎤 Voice Recording")

        st.markdown("""
        <div class="info-box">
        <strong>Instructions:</strong><br>
        1. Upload an audio file (WAV, MP3, etc.)<br>
        2. Or use the audio recorder below<br>
        3. Click "Transcribe & Translate"
        </div>
        """, unsafe_allow_html=True)

        uploaded_audio = st.file_uploader(
            "Upload an audio file",
            type=["wav", "mp3", "ogg", "flac", "m4a"],
            help="Supported formats: WAV, MP3, OGG, FLAC, M4A"
        )

        # In-browser recording is optional; degrade gracefully when the
        # component package is not installed.
        recorded_audio = None
        try:
            from audio_recorder_streamlit import audio_recorder
            st.markdown("**Or record directly:**")
            recorded_audio = audio_recorder(
                text="🎙️ Click to record",
                recording_color="#e94560",
                neutral_color="#6c757d",
                icon_name="microphone",
                icon_size="2x"
            )
        except ImportError:
            st.info("💡 For recording: `pip install audio-recorder-streamlit`")

        voice_lang = st.selectbox(
            "Recording Language",
            options=["en", "hi"],
            format_func=lambda x: {"en": "🇬🇧 English", "hi": "🇮🇳 Hindi"}[x]
        )

        voice_btn = st.button(
            "🎯 Transcribe & Translate",
            key="voice_btn",
            use_container_width=True
        )

    with col2:
        st.markdown("### 🎬 Result")

        # Persist whichever audio source was provided to a temp file;
        # uploads take precedence over recordings.
        audio_to_process = None

        if uploaded_audio is not None:
            temp_audio_path = TEMP_DIR / f"uploaded_{int(time.time()*1000)}.wav"
            with open(temp_audio_path, "wb") as f:
                f.write(uploaded_audio.getbuffer())
            audio_to_process = str(temp_audio_path)
            st.audio(uploaded_audio)

        elif recorded_audio is not None:
            temp_audio_path = TEMP_DIR / f"recorded_{int(time.time()*1000)}.wav"
            with open(temp_audio_path, "wb") as f:
                f.write(recorded_audio)
            audio_to_process = str(temp_audio_path)
            st.audio(recorded_audio, format="audio/wav")

        if voice_btn:
            if audio_to_process:
                with st.spinner("🔄 Processing voice..."):
                    try:
                        st.text("🎤 Transcribing...")
                        lang_code = get_language_code(voice_lang)
                        transcribed_text, success = transcribe_audio(audio_to_process, lang_code)

                        if success:
                            st.markdown(
                                f'<div class="success-box">'
                                f'<strong>📝 Transcribed:</strong> {transcribed_text}'
                                f'</div>',
                                unsafe_allow_html=True
                            )

                            translated, audio_path, gif_path = process_translation_pipeline(
                                transcribed_text,
                                voice_lang,
                                target_language,
                                selected_avatar
                            )

                            st.markdown(
                                f'<div class="result-box">'
                                f'<h4>📜 Translated:</h4>'
                                f'<p style="font-size: 1.2rem;">{translated}</p>'
                                f'</div>',
                                unsafe_allow_html=True
                            )

                            if audio_path and os.path.exists(audio_path):
                                st.markdown("#### 🔊 Audio")
                                st.audio(audio_path, format="audio/mp3")

                            if gif_path and os.path.exists(gif_path):
                                st.markdown("#### 🎭 Animation")
                                st.image(gif_path, width="stretch")

                                with open(gif_path, "rb") as f:
                                    st.download_button(
                                        label="📥 Download",
                                        data=f,
                                        file_name="lipsync.gif",
                                        mime="image/gif"
                                    )
                        else:
                            # On failure, transcribe_audio appears to return
                            # the error message in place of the text —
                            # TODO confirm against utils/speech_to_text.
                            st.error(f"❌ {transcribed_text}")
                    except Exception as e:
                        st.error(f"❌ Error: {str(e)}")
            else:
                st.warning("⚠️ Please upload or record audio first.")
593
+
594
+
595
+ # =============================================================================
596
+ # Footer
597
+ # =============================================================================
598
+
599
# Footer
st.markdown("---")
st.markdown(
    """
    <div style="text-align: center; color: #666; padding: 1rem;">
        <p>Made By Praveen</p>
    </div>
    """,
    unsafe_allow_html=True
)
avatars/sample/base.png ADDED
avatars/sample/mouth_0.png ADDED
avatars/sample/mouth_1.png ADDED
avatars/sample/mouth_2.png ADDED
requirements.txt CHANGED
@@ -1,3 +1,11 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ deep-translator>=1.11.4
3
+ gTTS>=2.4.0
4
+ pydub>=0.25.1
5
+ Pillow>=10.0.0
6
+ imageio>=2.31.0
7
+ numpy>=1.24.0
8
+ SpeechRecognition>=3.10.0
9
+ streamlit-webrtc>=0.47.0
10
+ av>=10.0.0
11
+ audio-recorder-streamlit>=0.0.8
utils/__init__.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Utility modules for the Anime Translator application.
3
+
4
+ This package contains:
5
+ - translator: Text translation between languages
6
+ - tts_engine: Text-to-speech synthesis
7
+ - lipsync: Lip-sync animation generation
8
+ - speech_to_text: Voice input processing
9
+ - avatar_manager: Avatar image management
10
+ """
11
+
12
+ from .translator import translate_text, detect_language
13
+ from .tts_engine import synthesize_speech
14
+ from .lipsync import generate_lipsync_gif, audio_to_rms_chunks
15
+ from .speech_to_text import transcribe_audio
16
+ from .avatar_manager import list_avatars, ensure_sample_avatar, get_avatar_preview
utils/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (845 Bytes). View file
 
utils/__pycache__/avatar_manager.cpython-310.pyc ADDED
Binary file (4.47 kB). View file
 
utils/__pycache__/lipsync.cpython-310.pyc ADDED
Binary file (5.35 kB). View file
 
utils/__pycache__/speech_to_text.cpython-310.pyc ADDED
Binary file (3.06 kB). View file
 
utils/__pycache__/translator.cpython-310.pyc ADDED
Binary file (2.18 kB). View file
 
utils/__pycache__/tts_engine.cpython-310.pyc ADDED
Binary file (2.3 kB). View file
 
utils/avatar_manager.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Avatar Manager Module
3
+ =====================
4
+ Handles avatar discovery, creation, and management.
5
+
6
+ Functions:
7
+ - ensure_sample_avatar: Create default sample avatar
8
+ - list_avatars: Get list of available avatars
9
+ - get_avatar_preview: Get preview image of an avatar
10
+ """
11
+
12
+ from PIL import Image, ImageDraw
13
+ from pathlib import Path
14
+ from typing import List, Optional
15
+ import numpy as np
16
+
17
+
18
def ensure_sample_avatar(avatars_dir: Path) -> None:
    """
    Create a sample avatar if none exists.

    Generates a simple animated avatar with:
    - Base face image
    - Three mouth positions (closed, medium, open)

    Args:
        avatars_dir: Base directory for avatars

    Example:
        >>> ensure_sample_avatar(Path("./avatars"))
        # Creates ./avatars/sample/ with base.png and mouth_*.png

    Note:
        This creates a basic placeholder avatar. For better results,
        create custom avatars with proper artwork.
    """
    sample_dir = avatars_dir / "sample"

    # Idempotent: skip regeneration if the sample already has any content.
    if sample_dir.exists() and any(sample_dir.iterdir()):
        return

    # Create directory
    sample_dir.mkdir(parents=True, exist_ok=True)

    # Image dimensions
    width, height = 512, 512

    # Create base image (simple face background)
    base = Image.new("RGBA", (width, height), (255, 220, 200, 255))
    draw_base = ImageDraw.Draw(base)

    # Draw simple face features on base
    # Face circle
    draw_base.ellipse([56, 56, 456, 456], fill=(255, 230, 210, 255), outline=(200, 150, 130, 255), width=3)

    # Eyes
    draw_base.ellipse([150, 180, 200, 230], fill=(255, 255, 255, 255), outline=(0, 0, 0, 255), width=2)
    draw_base.ellipse([312, 180, 362, 230], fill=(255, 255, 255, 255), outline=(0, 0, 0, 255), width=2)

    # Pupils
    draw_base.ellipse([165, 195, 185, 215], fill=(50, 50, 50, 255))
    draw_base.ellipse([327, 195, 347, 215], fill=(50, 50, 50, 255))

    # Eyebrows
    draw_base.arc([140, 150, 210, 190], start=200, end=340, fill=(100, 70, 50, 255), width=3)
    draw_base.arc([302, 150, 372, 190], start=200, end=340, fill=(100, 70, 50, 255), width=3)

    # Nose
    draw_base.polygon([(256, 250), (240, 310), (272, 310)], fill=(240, 200, 180, 255))

    # Hair (simple)
    draw_base.arc([40, 20, 472, 300], start=180, end=360, fill=(80, 50, 30, 255), width=30)

    base.save(sample_dir / "base.png")

    # Create mouth frames: each is a fully transparent canvas with only the
    # mouth drawn, so it can be alpha-composited over base.png at animation
    # time (see generate_lipsync_gif / get_avatar_preview).
    mouth_positions = [
        # (y_offset, height) - Mouth closed to open
        (0, 8),   # mouth_0: Nearly closed
        (0, 20),  # mouth_1: Slightly open
        (0, 35),  # mouth_2: Wide open
    ]

    for i, (y_off, mouth_height) in enumerate(mouth_positions):
        # Create transparent image for mouth overlay
        mouth_img = Image.new("RGBA", (width, height), (0, 0, 0, 0))
        draw_mouth = ImageDraw.Draw(mouth_img)

        # Calculate mouth position
        mouth_y = 340 + y_off
        mouth_left = 200
        mouth_right = 312

        # Draw mouth (ellipse shape)
        draw_mouth.ellipse(
            [mouth_left, mouth_y, mouth_right, mouth_y + mouth_height],
            fill=(180, 80, 80, 255),
            outline=(120, 50, 50, 255),
            width=2
        )

        # Add inner mouth detail for open mouths
        if mouth_height > 15:
            inner_offset = 5
            draw_mouth.ellipse(
                [mouth_left + inner_offset, mouth_y + inner_offset,
                 mouth_right - inner_offset, mouth_y + mouth_height - inner_offset],
                fill=(100, 40, 40, 255)
            )

        mouth_img.save(sample_dir / f"mouth_{i}.png")
113
+
114
+
115
def list_avatars(avatars_dir: Path) -> List[str]:
    """
    Return the sorted names of available avatars.

    A folder counts as a valid avatar when it contains a ``base.png`` and at
    least one ``mouth_*.png`` frame. The bundled sample avatar is created
    first if needed, so the result is never empty in a healthy install.

    Args:
        avatars_dir: Base directory containing avatar folders

    Returns:
        Sorted list of avatar folder names

    Example:
        >>> list_avatars(Path("./avatars"))
        ['anime_boy', 'anime_girl', 'sample']
    """
    # Guarantee at least the built-in sample avatar exists.
    ensure_sample_avatar(avatars_dir)

    if not avatars_dir.exists():
        return []

    found = [
        entry.name
        for entry in avatars_dir.iterdir()
        if entry.is_dir()
        and (entry / "base.png").exists()
        and any(entry.glob("mouth_*.png"))
    ]
    return sorted(found)
150
+
151
+
152
def get_avatar_preview(avatar_name: str, avatars_dir: Path) -> Optional[Image.Image]:
    """
    Build a preview image for an avatar.

    Overlays the avatar's first mouth frame on its base face so the preview
    shows the composited appearance used during animation.

    Args:
        avatar_name: Name of the avatar folder
        avatars_dir: Base directory containing avatar folders

    Returns:
        A PIL Image, or None when the avatar has no base.png
    """
    folder = avatars_dir / avatar_name
    base_file = folder / "base.png"

    if not base_file.exists():
        return None

    face = Image.open(base_file).convert("RGBA")

    # Composite the first mouth frame (sorted order => mouth_0) if present.
    frames = sorted(folder.glob("mouth_*.png"))
    if not frames:
        return face

    overlay = Image.open(frames[0]).convert("RGBA").resize(face.size)
    return Image.alpha_composite(face, overlay)
utils/lipsync.py ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Lip-Sync Animation Module
3
+ =========================
4
+ Generates animated GIFs with lip-sync based on audio amplitude.
5
+
6
+ Functions:
7
+ - audio_to_rms_chunks: Extract amplitude data from audio
8
+ - generate_lipsync_gif: Create lip-sync animation GIF
9
+ """
10
+
11
+ from PIL import Image
12
+ import imageio
13
+ from pathlib import Path
14
+ import time
15
+ from typing import List, Optional
16
+ import os
17
+
18
+
19
def audio_to_rms_chunks(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Return per-chunk RMS (loudness) values for an audio file.

    The audio is cut into ``chunk_ms``-millisecond slices and each slice's
    RMS amplitude is measured. On any failure (missing file, pydub or
    ffmpeg unavailable) a canned fallback sequence is returned so the
    lip-sync animation still has something to work with.

    Args:
        audio_path: Path to the audio file (MP3)
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of RMS values, one per chunk
    """
    try:
        from pydub import AudioSegment
        from pydub.utils import make_chunks

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        segment = AudioSegment.from_file(audio_path)

        # Measure loudness per slice; skip zero-length trailing slices.
        loudness = [
            piece.rms for piece in make_chunks(segment, chunk_ms) if len(piece) > 0
        ]
        return loudness or [0]

    except Exception as exc:
        print(f"Error processing audio: {exc}")
        # Canned pattern keeps the mouth moving when decoding fails.
        return [100, 200, 150, 300, 250, 100, 200, 150]
56
+
57
+
58
def audio_to_rms_chunks_simple(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Synthesize plausible RMS values from file size alone.

    Fallback for when pydub/ffmpeg cannot decode the audio: estimates the
    clip duration from the file size and emits a smooth wave pattern so the
    animation still looks natural.

    Args:
        audio_path: Path to the audio file
        chunk_ms: Duration of each chunk in milliseconds

    Returns:
        List of simulated RMS values
    """
    import math

    try:
        size_bytes = os.path.getsize(audio_path)

        # Rough MP3 estimate: 128 kbps ~= 16 KB of audio data per second.
        duration_sec = size_bytes / 16000

        # At least 10 chunks so very small files still animate.
        chunk_count = max(int(duration_sec * 1000 / chunk_ms), 10)

        # Two superimposed sine waves give a natural-looking mouth cadence,
        # clamped to a minimum of 50 so the mouth never fully freezes.
        return [
            max(50, 150 + 100 * math.sin(i * 0.5) + 50 * math.sin(i * 1.2))
            for i in range(chunk_count)
        ]

    except Exception:
        return [100, 200, 150, 300, 250, 100, 200, 150, 100, 200]
93
+
94
+
95
def generate_lipsync_gif(
    avatar_name: str,
    audio_path: str,
    avatars_dir: Path,
    output_dir: Path,
    fps: int = 12,
    output_path: Optional[str] = None
) -> str:
    """
    Build an animated lip-sync GIF for an avatar and an audio clip.

    Pipeline:
    1. Analyze audio amplitude (RMS) over time, one chunk per frame.
    2. Pick a mouth image proportional to each chunk's loudness.
    3. Composite the mouth onto the base avatar image.
    4. Assemble all frames into a looping animated GIF.

    Args:
        avatar_name: Name of avatar folder (e.g., 'sample')
        audio_path: Path to the audio file to sync with
        avatars_dir: Base directory containing avatar folders
        output_dir: Directory to save the output GIF
        fps: Frames per second for the animation
        output_path: Optional custom output path

    Returns:
        Path to the generated GIF file

    Raises:
        FileNotFoundError: If avatar base.png or mouth frames not found
        ValueError: If no animation frames could be produced
    """
    folder = avatars_dir / avatar_name
    face_path = folder / "base.png"
    mouth_paths = sorted(folder.glob("mouth_*.png"))

    # Fail fast on an incomplete avatar folder.
    if not face_path.exists():
        raise FileNotFoundError(f"Base image not found: {face_path}")
    if not mouth_paths:
        raise FileNotFoundError(f"No mouth frames found in: {folder}")

    face = Image.open(face_path).convert("RGBA")
    canvas_size = face.size

    # Mouth overlays are resized to match the base image exactly.
    mouths = [Image.open(p).convert("RGBA").resize(canvas_size) for p in mouth_paths]

    # One audio chunk per animation frame.
    chunk_ms = int(1000 / fps)

    try:
        loudness = audio_to_rms_chunks(audio_path, chunk_ms=chunk_ms)
    except Exception as e:
        print(f"Primary audio processing failed: {e}")
        print("Using fallback animation method...")
        loudness = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Silent or empty analysis -> synthesize a curve so the mouth still moves.
    if not loudness or all(v == 0 for v in loudness):
        loudness = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Normalization divisor; guard against an all-zero curve.
    peak = max(loudness)
    if peak <= 0:
        peak = 1

    frames = []
    top = len(mouths) - 1

    for level in loudness:
        # Louder chunk -> wider mouth frame (index clamped to range).
        idx = int((level / peak) * top)
        idx = min(max(idx, 0), top)

        composite = Image.alpha_composite(face, mouths[idx])

        # GIF has no alpha channel; flatten onto a white background.
        flat = Image.new("RGB", composite.size, (255, 255, 255))
        flat.paste(composite, mask=composite.split()[-1] if composite.mode == 'RGBA' else None)
        frames.append(flat)

    output_dir.mkdir(parents=True, exist_ok=True)

    if output_path is None:
        stamp = int(time.time() * 1000)
        output_path = str(output_dir / f"lipsync_{stamp}.gif")

    if not frames:
        raise ValueError("No frames generated for animation")

    # loop=0 -> repeat forever. NOTE(review): newer imageio v3 GIF writers
    # prefer `duration` over `fps` — confirm against the pinned version.
    imageio.mimsave(
        output_path,
        frames,
        fps=fps,
        loop=0
    )

    return output_path
utils/speech_to_text.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Speech-to-Text Module
3
+ =====================
4
+ Converts voice/audio input to text using speech recognition.
5
+
6
+ Functions:
7
+ - transcribe_audio: Convert audio file to text
8
+ - transcribe_from_microphone: Real-time microphone transcription
9
+ """
10
+
11
+ import speech_recognition as sr
12
+ from pathlib import Path
13
+ from typing import Optional, Tuple
14
+ import tempfile
15
+ from pydub import AudioSegment
16
+
17
+
18
def transcribe_audio(
    audio_path: str,
    language: str = "en-US"
) -> Tuple[str, bool]:
    """
    Transcribe audio file to text using Google Speech Recognition.

    Non-WAV input (MP3, FLAC, OGG, ...) is converted to a temporary
    WAV file first; the temporary file is always removed afterwards.

    Args:
        audio_path: Path to the audio file
        language: Language code for recognition
                  - 'en-US' for English (US)
                  - 'hi-IN' for Hindi (India)

    Returns:
        Tuple of (transcribed_text, success_flag)
        - If successful: (text, True)
        - If failed: (error_message, False)

    Example:
        >>> text, success = transcribe_audio("recording.wav", "en-US")
        >>> if success:
        ...     print(f"You said: {text}")
        ... else:
        ...     print(f"Error: {text}")

    Supported Formats:
        - WAV (recommended)
        - MP3
        - FLAC
        - OGG
    """
    import os  # local import: used only for temp-file cleanup below

    recognizer = sr.Recognizer()
    temp_wav = None  # path of the converted WAV, if we create one

    try:
        audio_path = Path(audio_path)

        if audio_path.suffix.lower() != '.wav':
            # Convert to WAV using pydub (needs ffmpeg for MP3 etc.)
            audio = AudioSegment.from_file(str(audio_path))

            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                temp_wav = tmp.name
            audio.export(temp_wav, format='wav')
            wav_path = temp_wav
        else:
            wav_path = str(audio_path)

        # Load audio file for recognition
        with sr.AudioFile(wav_path) as source:
            # Calibrate against background noise before recording
            recognizer.adjust_for_ambient_noise(source, duration=0.5)

            audio_data = recognizer.record(source)

            # Perform speech recognition via Google's web API
            text = recognizer.recognize_google(audio_data, language=language)

        return text, True

    except sr.UnknownValueError:
        return "Could not understand the audio. Please speak clearly.", False

    except sr.RequestError as e:
        return f"Speech recognition service error: {str(e)}", False

    except Exception as e:
        return f"Error processing audio: {str(e)}", False

    finally:
        # Bug fix: the converted WAV (delete=False) used to be leaked
        # on disk after every non-WAV transcription.
        if temp_wav is not None:
            try:
                os.unlink(temp_wav)
            except OSError:
                pass
90
+
91
+
92
def get_language_code(lang: str) -> str:
    """
    Expand a short language code into a speech-recognition locale.

    Args:
        lang: Short language code ('en' or 'hi')

    Returns:
        Full language code for speech recognition; 'auto' and any
        unrecognized code fall back to US English.

    Example:
        >>> get_language_code('en')
        'en-US'
        >>> get_language_code('hi')
        'hi-IN'
    """
    if lang == 'hi':
        return 'hi-IN'
    # 'en', 'auto', and anything unknown all map to US English.
    return 'en-US'
utils/translator.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Translation Module
3
+ ==================
4
+ Handles text translation between English and Hindi using deep-translator.
5
+
6
+ Functions:
7
+ - detect_language: Auto-detect if text is English or Hindi
8
+ - translate_text: Translate text between languages
9
+ """
10
+
11
+ from deep_translator import GoogleTranslator
12
+ from typing import Literal
13
+
14
+
15
+ def detect_language(text: str) -> Literal["en", "hi"]:
16
+ """
17
+ Detect if the input text is English or Hindi.
18
+
19
+ Uses Unicode range detection for Devanagari script (Hindi).
20
+
21
+ Args:
22
+ text: Input text string to analyze
23
+
24
+ Returns:
25
+ 'hi' if Hindi/Devanagari characters found, 'en' otherwise
26
+
27
+ Example:
28
+ >>> detect_language("Hello World")
29
+ 'en'
30
+ >>> detect_language("नमस्ते")
31
+ 'hi'
32
+ """
33
+ # Check for Devanagari Unicode range (U+0900 to U+097F)
34
+ for char in text:
35
+ if '\u0900' <= char <= '\u097F':
36
+ return "hi"
37
+ return "en"
38
+
39
+
40
+ def translate_text(
41
+ text: str,
42
+ source_lang: Literal["auto", "en", "hi"],
43
+ target_lang: Literal["en", "hi"]
44
+ ) -> str:
45
+ """
46
+ Translate text from source language to target language.
47
+
48
+ Uses Google Translator via deep-translator library for accurate
49
+ translations between English and Hindi.
50
+
51
+ Args:
52
+ text: The text to translate
53
+ source_lang: Source language code ('auto', 'en', or 'hi')
54
+ target_lang: Target language code ('en' or 'hi')
55
+
56
+ Returns:
57
+ Translated text string
58
+
59
+ Raises:
60
+ ValueError: If text is empty
61
+
62
+ Example:
63
+ >>> translate_text("Hello", "en", "hi")
64
+ 'नमस्ते'
65
+ >>> translate_text("नमस्ते", "auto", "en")
66
+ 'Hello'
67
+ """
68
+ # Validate input
69
+ if not text or not text.strip():
70
+ return ""
71
+
72
+ # Auto-detect source language if needed
73
+ if source_lang == "auto":
74
+ source_lang = detect_language(text)
75
+
76
+ # Skip translation if source and target are the same
77
+ if source_lang == target_lang:
78
+ return text
79
+
80
+ # Perform translation using Google Translator
81
+ translator = GoogleTranslator(source=source_lang, target=target_lang)
82
+ translated = translator.translate(text)
83
+
84
+ return translated
utils/tts_engine.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Text-to-Speech Engine
3
+ =====================
4
+ Converts text to speech audio using Google Text-to-Speech (gTTS).
5
+
6
+ Functions:
7
+ - synthesize_speech: Convert text to MP3 audio file
8
+ - get_audio_duration: Get duration of audio file
9
+ """
10
+
11
+ from gtts import gTTS
12
+ from pathlib import Path
13
+ import time
14
+ from typing import Literal
15
+ import os
16
+
17
+
18
def synthesize_speech(
    text: str,
    language: Literal["en", "hi"],
    output_dir: Path,
    slow: bool = False
) -> str:
    """
    Render text as speech and store it as an MP3 file.

    Uses Google Text-to-Speech (gTTS); a millisecond timestamp keeps
    every generated filename unique within the output directory.

    Args:
        text: Text to convert to speech
        language: Language code ('en' for English, 'hi' for Hindi)
        output_dir: Directory to save the audio file
        slow: If True, speak slowly (useful for language learning)

    Returns:
        Path to the generated MP3 file
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Millisecond timestamp keeps concurrent outputs from colliding.
    mp3_path = target_dir / f"tts_{int(time.time() * 1000)}.mp3"

    try:
        gTTS(text=text, lang=language, slow=slow).save(str(mp3_path))

        # gTTS can fail without raising; insist on a non-empty file.
        if not mp3_path.exists():
            raise FileNotFoundError(f"TTS file was not created: {mp3_path}")
        if mp3_path.stat().st_size == 0:
            raise ValueError("TTS file is empty")

        return str(mp3_path)

    except Exception as e:
        print(f"TTS Error: {e}")
        raise
65
+
66
+
67
def get_audio_duration(audio_path: str) -> float:
    """
    Return the length of an audio file in seconds.

    Prefers an exact measurement via pydub. If decoding fails, the
    duration is estimated from the file size (MP3 at 128 kbps is
    roughly 16 KB per second); if even that fails, a 3-second
    default is returned.

    Args:
        audio_path: Path to the audio file

    Returns:
        Duration in seconds (estimated if pydub fails)
    """
    try:
        from pydub import AudioSegment
        return len(AudioSegment.from_file(audio_path)) / 1000.0  # ms -> s
    except Exception:
        pass

    try:
        return os.path.getsize(audio_path) / 16000
    except Exception:
        return 3.0