Spaces:
Sleeping
Sleeping
Upload 19 files
Browse files- Dockerfile +52 -20
- app.py +607 -0
- avatars/sample/base.png +0 -0
- avatars/sample/mouth_0.png +0 -0
- avatars/sample/mouth_1.png +0 -0
- avatars/sample/mouth_2.png +0 -0
- requirements.txt +11 -3
- utils/__init__.py +16 -0
- utils/__pycache__/__init__.cpython-310.pyc +0 -0
- utils/__pycache__/avatar_manager.cpython-310.pyc +0 -0
- utils/__pycache__/lipsync.cpython-310.pyc +0 -0
- utils/__pycache__/speech_to_text.cpython-310.pyc +0 -0
- utils/__pycache__/translator.cpython-310.pyc +0 -0
- utils/__pycache__/tts_engine.cpython-310.pyc +0 -0
- utils/avatar_manager.py +189 -0
- utils/lipsync.py +207 -0
- utils/speech_to_text.py +113 -0
- utils/translator.py +84 -0
- utils/tts_engine.py +88 -0
Dockerfile
CHANGED
|
@@ -1,20 +1,52 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Use Python base image
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Install system dependencies including FFmpeg.
# - curl is required by the HEALTHCHECK below (it was missing before, so the
#   healthcheck could never succeed).
# - libgl1 replaces the misspelled, nonexistent "libgl1-mesa-glb" (the OpenGL
#   runtime needed by imaging libraries on Debian bookworm).
RUN apt-get update && apt-get install -y --no-install-recommends \
    ffmpeg \
    curl \
    libsm6 \
    libxext6 \
    libgl1 \
    git \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Create non-root user for security (required by HF Spaces)
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set working directory for user
WORKDIR $HOME/app

# Copy requirements first (for caching)
COPY --chown=user:user requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir --upgrade pip && \
    pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY --chown=user:user . .

# Create temp directory
RUN mkdir -p temp

# Expose Streamlit port
EXPOSE 7860

# Health check (hits Streamlit's built-in health endpoint)
HEALTHCHECK CMD curl --fail http://localhost:7860/_stcore/health || exit 1

# Set environment variables for Streamlit
ENV STREAMLIT_SERVER_PORT=7860 \
    STREAMLIT_SERVER_ADDRESS=0.0.0.0 \
    STREAMLIT_SERVER_HEADLESS=true \
    STREAMLIT_BROWSER_GATHER_USAGE_STATS=false

# Run the application
CMD ["streamlit", "run", "app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,607 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
🎌 Anime Translator with Lip-Sync
|
| 3 |
+
=================================
|
| 4 |
+
|
| 5 |
+
A Streamlit application that translates text between English and Hindi,
|
| 6 |
+
converts it to speech, and generates a lip-synced anime avatar animation.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
import streamlit as st
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
import tempfile
|
| 12 |
+
import time
|
| 13 |
+
import shutil
|
| 14 |
+
import os
|
| 15 |
+
import subprocess
|
| 16 |
+
from shutil import which
|
| 17 |
+
from typing import Tuple, Optional
|
| 18 |
+
|
| 19 |
+
# Import utility modules
|
| 20 |
+
from utils.translator import translate_text, detect_language
|
| 21 |
+
from utils.tts_engine import synthesize_speech, get_audio_duration
|
| 22 |
+
from utils.lipsync import generate_lipsync_gif
|
| 23 |
+
from utils.speech_to_text import transcribe_audio, get_language_code
|
| 24 |
+
from utils.avatar_manager import list_avatars, get_avatar_preview, ensure_sample_avatar
|
| 25 |
+
|
| 26 |
+
# =============================================================================
|
| 27 |
+
# FFmpeg Configuration
|
| 28 |
+
# =============================================================================
|
| 29 |
+
|
| 30 |
+
def configure_ffmpeg():
    """Make FFmpeg discoverable for pydub, probing common Windows locations.

    Checks PATH first (covers Linux/macOS and properly installed Windows
    setups); otherwise scans a list of typical Windows install directories,
    prepends the first match to PATH, and points pydub's converter/ffprobe
    at the found binaries.

    Returns:
        bool: True if an ffmpeg binary was found, False otherwise.
    """
    # Common Windows install locations to probe when ffmpeg is not on PATH.
    possible_paths = [
        r"C:\ffmpeg\bin",
        r"C:\Program Files\ffmpeg\bin",
        r"C:\Program Files (x86)\ffmpeg\bin",
        os.path.expanduser("~\\ffmpeg\\bin"),
        r"C:\Users\Nishant Pratap\ffmpeg\bin",  # user-specific path
    ]

    if which("ffmpeg") is not None:
        return True

    for path in possible_paths:
        ffmpeg_exe = os.path.join(path, "ffmpeg.exe")
        if os.path.exists(ffmpeg_exe):
            os.environ["PATH"] = path + os.pathsep + os.environ.get("PATH", "")
            # Also configure pydub directly; pydub may not be installed,
            # so tolerate (only) ordinary exceptions -- the bare `except:`
            # here previously swallowed KeyboardInterrupt/SystemExit too.
            try:
                from pydub import AudioSegment
                AudioSegment.converter = ffmpeg_exe
                AudioSegment.ffprobe = os.path.join(path, "ffprobe.exe")
            except Exception:
                pass
            return True

    return False
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
def check_ffmpeg_detailed():
    """Probe the FFmpeg toolchain and report a detailed status dictionary.

    Returns a dict with boolean flags ``ffmpeg_in_path``, ``ffmpeg_works``,
    ``ffprobe_works``, ``pydub_works`` and an ``error_message`` string
    (or None) describing the first failure encountered.
    """
    def version_ok(executable):
        # Run "<exe> -version" and report whether it exited cleanly.
        proc = subprocess.run(
            [executable, "-version"],
            capture_output=True,
            text=True,
            timeout=5,
        )
        return proc.returncode == 0

    status = {
        "ffmpeg_in_path": which("ffmpeg") is not None,
        "ffmpeg_works": False,
        "ffprobe_works": False,
        "pydub_works": False,
        "error_message": None,
    }

    # ffmpeg probe: record the failure reason if it cannot be launched.
    try:
        status["ffmpeg_works"] = version_ok("ffmpeg")
    except Exception as exc:
        status["error_message"] = str(exc)

    # ffprobe probe: failures here are silently ignored.
    try:
        status["ffprobe_works"] = version_ok("ffprobe")
    except Exception:
        pass

    # pydub probe: generating a short silence exercises its audio backend.
    try:
        from pydub import AudioSegment
        AudioSegment.silent(duration=100)
        status["pydub_works"] = True
    except Exception as exc:
        status["pydub_works"] = False
        if not status["error_message"]:
            status["error_message"] = str(exc)

    return status
|
| 104 |
+
|
| 105 |
+
|
| 106 |
+
ffmpeg_found = configure_ffmpeg()
|
| 107 |
+
|
| 108 |
+
# =============================================================================
|
| 109 |
+
# Configuration
|
| 110 |
+
# =============================================================================
|
| 111 |
+
|
| 112 |
+
AVATARS_DIR = Path("./avatars")
|
| 113 |
+
TEMP_DIR = Path(tempfile.gettempdir()) / "anime_translator"
|
| 114 |
+
|
| 115 |
+
AVATARS_DIR.mkdir(parents=True, exist_ok=True)
|
| 116 |
+
TEMP_DIR.mkdir(parents=True, exist_ok=True)
|
| 117 |
+
|
| 118 |
+
# Page configuration
|
| 119 |
+
st.set_page_config(
|
| 120 |
+
page_title="🎌 Anime Translator",
|
| 121 |
+
page_icon="🎌",
|
| 122 |
+
layout="wide",
|
| 123 |
+
initial_sidebar_state="expanded"
|
| 124 |
+
)
|
| 125 |
+
|
| 126 |
+
# =============================================================================
|
| 127 |
+
# Custom CSS Styling
|
| 128 |
+
# =============================================================================
|
| 129 |
+
|
| 130 |
+
st.markdown("""
|
| 131 |
+
<style>
|
| 132 |
+
.main {
|
| 133 |
+
background: linear-gradient(135deg, #1a1a2e 0%, #16213e 50%, #0f3460 100%);
|
| 134 |
+
}
|
| 135 |
+
|
| 136 |
+
.main-header {
|
| 137 |
+
background: linear-gradient(90deg, #e94560, #ff6b6b);
|
| 138 |
+
-webkit-background-clip: text;
|
| 139 |
+
-webkit-text-fill-color: transparent;
|
| 140 |
+
font-size: 3rem;
|
| 141 |
+
font-weight: bold;
|
| 142 |
+
text-align: center;
|
| 143 |
+
padding: 1rem;
|
| 144 |
+
margin-bottom: 2rem;
|
| 145 |
+
}
|
| 146 |
+
|
| 147 |
+
.stButton > button {
|
| 148 |
+
background: linear-gradient(90deg, #e94560, #ff6b6b);
|
| 149 |
+
color: white;
|
| 150 |
+
border: none;
|
| 151 |
+
border-radius: 25px;
|
| 152 |
+
padding: 0.75rem 2rem;
|
| 153 |
+
font-weight: bold;
|
| 154 |
+
transition: all 0.3s ease;
|
| 155 |
+
width: 100%;
|
| 156 |
+
}
|
| 157 |
+
|
| 158 |
+
.stButton > button:hover {
|
| 159 |
+
transform: translateY(-2px);
|
| 160 |
+
box-shadow: 0 5px 20px rgba(233, 69, 96, 0.4);
|
| 161 |
+
}
|
| 162 |
+
|
| 163 |
+
.result-box {
|
| 164 |
+
background: linear-gradient(135deg, rgba(233, 69, 96, 0.1), rgba(255, 107, 107, 0.1));
|
| 165 |
+
border-radius: 15px;
|
| 166 |
+
padding: 1.5rem;
|
| 167 |
+
border: 1px solid rgba(233, 69, 96, 0.3);
|
| 168 |
+
margin: 1rem 0;
|
| 169 |
+
}
|
| 170 |
+
|
| 171 |
+
.info-box {
|
| 172 |
+
background: rgba(100, 200, 255, 0.1);
|
| 173 |
+
border-left: 4px solid #64c8ff;
|
| 174 |
+
padding: 1rem;
|
| 175 |
+
border-radius: 0 10px 10px 0;
|
| 176 |
+
margin: 1rem 0;
|
| 177 |
+
}
|
| 178 |
+
|
| 179 |
+
.success-box {
|
| 180 |
+
background: rgba(100, 255, 150, 0.1);
|
| 181 |
+
border-left: 4px solid #64ff96;
|
| 182 |
+
padding: 1rem;
|
| 183 |
+
border-radius: 0 10px 10px 0;
|
| 184 |
+
}
|
| 185 |
+
|
| 186 |
+
#MainMenu {visibility: hidden;}
|
| 187 |
+
footer {visibility: hidden;}
|
| 188 |
+
|
| 189 |
+
.stTabs [data-baseweb="tab-list"] {
|
| 190 |
+
gap: 8px;
|
| 191 |
+
}
|
| 192 |
+
|
| 193 |
+
.stTabs [data-baseweb="tab"] {
|
| 194 |
+
background: rgba(255, 255, 255, 0.05);
|
| 195 |
+
border-radius: 10px;
|
| 196 |
+
padding: 10px 20px;
|
| 197 |
+
}
|
| 198 |
+
|
| 199 |
+
.stTabs [aria-selected="true"] {
|
| 200 |
+
background: linear-gradient(90deg, #e94560, #ff6b6b);
|
| 201 |
+
}
|
| 202 |
+
</style>
|
| 203 |
+
""", unsafe_allow_html=True)
|
| 204 |
+
|
| 205 |
+
# =============================================================================
|
| 206 |
+
# Helper Functions
|
| 207 |
+
# =============================================================================
|
| 208 |
+
|
| 209 |
+
def cleanup_temp_files(older_than_sec: int = 3600) -> None:
    """Delete entries in TEMP_DIR whose mtime is older than the cutoff.

    Best-effort cleanup: every failure (missing directory, permission
    errors, races with other sessions) is swallowed so it can never
    disrupt the UI.

    Args:
        older_than_sec: Age threshold in seconds; 0 removes everything.
    """
    cutoff = time.time() - older_than_sec
    try:
        for entry in TEMP_DIR.iterdir():
            try:
                if entry.stat().st_mtime < cutoff:
                    if entry.is_file():
                        entry.unlink()
                    elif entry.is_dir():
                        shutil.rmtree(entry)
            except Exception:
                # Per-entry failure (file vanished, locked, ...): skip it.
                pass
    except Exception:
        # TEMP_DIR itself may be missing or unreadable -- nothing to do.
        pass
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
def process_translation_pipeline(
    text: str,
    source_lang: str,
    target_lang: str,
    avatar_name: str
) -> Tuple[str, Optional[str], Optional[str]]:
    """Main processing pipeline: translate, synthesize speech, generate animation.

    Args:
        text: Input text to translate.
        source_lang: Source language code ("auto", "en", "hi").
        target_lang: Target language code ("en", "hi").
        avatar_name: Name of the avatar directory used for lip-sync frames.

    Returns:
        Tuple of (translated_text, audio_path, gif_path). ``gif_path`` is
        None when animation generation fails (e.g. FFmpeg is unavailable).

    Raises:
        Exception: if translation or speech synthesis fails; the original
            exception is chained as the cause.
    """
    # Step 1: Translate text
    try:
        translated_text = translate_text(text, source_lang, target_lang)
    except Exception as e:
        # Chain the cause so the original traceback is preserved for debugging.
        raise Exception(f"Translation failed: {e}") from e

    # Step 2: Synthesize speech
    try:
        audio_path = synthesize_speech(translated_text, target_lang, TEMP_DIR)
    except Exception as e:
        raise Exception(f"Speech synthesis failed: {e}") from e

    # Step 3: Generate lip-sync animation (best-effort -- the translated
    # text and audio are still useful without the GIF).
    gif_path = None
    try:
        gif_path = generate_lipsync_gif(
            avatar_name=avatar_name,
            audio_path=audio_path,
            avatars_dir=AVATARS_DIR,
            output_dir=TEMP_DIR,
            fps=12
        )
    except Exception as e:
        # Don't fail completely if animation fails
        print(f"Animation generation warning: {e}")
        gif_path = None

    return translated_text, audio_path, gif_path
|
| 262 |
+
|
| 263 |
+
|
| 264 |
+
# =============================================================================
|
| 265 |
+
# Sidebar
|
| 266 |
+
# =============================================================================
|
| 267 |
+
|
| 268 |
+
with st.sidebar:
|
| 269 |
+
st.markdown("## ⚙️ Settings")
|
| 270 |
+
|
| 271 |
+
# Avatar selection
|
| 272 |
+
st.markdown("### 🎭 Avatar Selection")
|
| 273 |
+
avatars = list_avatars(AVATARS_DIR)
|
| 274 |
+
|
| 275 |
+
if avatars:
|
| 276 |
+
selected_avatar = st.selectbox(
|
| 277 |
+
"Choose your avatar",
|
| 278 |
+
options=avatars,
|
| 279 |
+
index=0,
|
| 280 |
+
help="Select an anime avatar for lip-sync animation"
|
| 281 |
+
)
|
| 282 |
+
|
| 283 |
+
preview = get_avatar_preview(selected_avatar, AVATARS_DIR)
|
| 284 |
+
if preview:
|
| 285 |
+
st.image(preview, caption=f"Preview: {selected_avatar}", width="stretch")
|
| 286 |
+
else:
|
| 287 |
+
st.warning("No avatars found. Creating sample avatar...")
|
| 288 |
+
ensure_sample_avatar(AVATARS_DIR)
|
| 289 |
+
selected_avatar = "sample"
|
| 290 |
+
st.rerun()
|
| 291 |
+
|
| 292 |
+
st.markdown("---")
|
| 293 |
+
|
| 294 |
+
# Language settings
|
| 295 |
+
st.markdown("### 🌐 Language Settings")
|
| 296 |
+
|
| 297 |
+
source_language = st.selectbox(
|
| 298 |
+
"Source Language",
|
| 299 |
+
options=["auto", "en", "hi"],
|
| 300 |
+
format_func=lambda x: {"auto": "🔄 Auto-detect", "en": "🇬🇧 English", "hi": "🇮🇳 Hindi"}[x],
|
| 301 |
+
index=0
|
| 302 |
+
)
|
| 303 |
+
|
| 304 |
+
target_language = st.selectbox(
|
| 305 |
+
"Target Language",
|
| 306 |
+
options=["en", "hi"],
|
| 307 |
+
format_func=lambda x: {"en": "🇬🇧 English", "hi": "🇮🇳 Hindi"}[x],
|
| 308 |
+
index=1
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
st.markdown("---")
|
| 312 |
+
|
| 313 |
+
# System status
|
| 314 |
+
st.markdown("### 🔧 System Status")
|
| 315 |
+
|
| 316 |
+
ffmpeg_status = check_ffmpeg_detailed()
|
| 317 |
+
|
| 318 |
+
if ffmpeg_status["ffmpeg_works"]:
|
| 319 |
+
st.success("✅ FFmpeg: Working")
|
| 320 |
+
else:
|
| 321 |
+
st.error("❌ FFmpeg: Not working")
|
| 322 |
+
|
| 323 |
+
if ffmpeg_status["pydub_works"]:
|
| 324 |
+
st.success("✅ Pydub: Working")
|
| 325 |
+
else:
|
| 326 |
+
st.warning("⚠️ Pydub: Limited (fallback mode)")
|
| 327 |
+
|
| 328 |
+
if ffmpeg_status["error_message"]:
|
| 329 |
+
with st.expander("🔍 Error Details"):
|
| 330 |
+
st.code(ffmpeg_status["error_message"])
|
| 331 |
+
st.markdown("""
|
| 332 |
+
**To fix FFmpeg:**
|
| 333 |
+
```bash
|
| 334 |
+
conda install -c conda-forge ffmpeg
|
| 335 |
+
```
|
| 336 |
+
Or download from: https://www.gyan.dev/ffmpeg/builds/
|
| 337 |
+
""")
|
| 338 |
+
|
| 339 |
+
st.markdown("---")
|
| 340 |
+
|
| 341 |
+
# Info section
|
| 342 |
+
st.markdown("### ℹ️ About")
|
| 343 |
+
st.markdown("""
|
| 344 |
+
Translate text between English and Hindi with lip-synced avatar animation.
|
| 345 |
+
|
| 346 |
+
**Features:**
|
| 347 |
+
- 🎤 Voice input
|
| 348 |
+
- 🔄 Auto detection
|
| 349 |
+
- 🗣️ Text-to-speech
|
| 350 |
+
- 🎬 Lip-sync animation
|
| 351 |
+
""")
|
| 352 |
+
|
| 353 |
+
if st.button("🧹 Clear Temp Files"):
|
| 354 |
+
cleanup_temp_files(older_than_sec=0)
|
| 355 |
+
st.success("Cleared!")
|
| 356 |
+
|
| 357 |
+
|
| 358 |
+
# =============================================================================
|
| 359 |
+
# Main Content
|
| 360 |
+
# =============================================================================
|
| 361 |
+
|
| 362 |
+
st.markdown('<h1 class="main-header">🎌 Anime Translator</h1>', unsafe_allow_html=True)
|
| 363 |
+
st.markdown(
|
| 364 |
+
'<p style="text-align: center; color: #888; font-size: 1.2rem;">'
|
| 365 |
+
'Translate • Speak • Animate</p>',
|
| 366 |
+
unsafe_allow_html=True
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
# Tabs
|
| 370 |
+
tab1, tab2 = st.tabs(["📝 Text Input", "🎤 Voice Input"])
|
| 371 |
+
|
| 372 |
+
# =============================================================================
|
| 373 |
+
# Tab 1: Text Input
|
| 374 |
+
# =============================================================================
|
| 375 |
+
|
| 376 |
+
with tab1:
|
| 377 |
+
col1, col2 = st.columns([1, 1])
|
| 378 |
+
|
| 379 |
+
with col1:
|
| 380 |
+
st.markdown("### 📝 Enter Your Text")
|
| 381 |
+
|
| 382 |
+
text_input = st.text_area(
|
| 383 |
+
"Type or paste your text here",
|
| 384 |
+
height=150,
|
| 385 |
+
placeholder="Enter text in English or Hindi...\nउदाहरण: नमस्ते, आप कैसे हैं?\nExample: Hello, how are you?",
|
| 386 |
+
key="text_input"
|
| 387 |
+
)
|
| 388 |
+
|
| 389 |
+
if text_input:
|
| 390 |
+
detected = detect_language(text_input)
|
| 391 |
+
st.markdown(
|
| 392 |
+
f'<div class="info-box">'
|
| 393 |
+
f'📊 Characters: {len(text_input)} | '
|
| 394 |
+
f'🔍 Detected: {"🇮🇳 Hindi" if detected == "hi" else "🇬🇧 English"}'
|
| 395 |
+
f'</div>',
|
| 396 |
+
unsafe_allow_html=True
|
| 397 |
+
)
|
| 398 |
+
|
| 399 |
+
translate_btn = st.button(
|
| 400 |
+
"🚀 Translate & Animate",
|
| 401 |
+
key="translate_text_btn",
|
| 402 |
+
use_container_width=True
|
| 403 |
+
)
|
| 404 |
+
|
| 405 |
+
with col2:
|
| 406 |
+
st.markdown("### 🎬 Result")
|
| 407 |
+
|
| 408 |
+
if translate_btn and text_input:
|
| 409 |
+
with st.spinner("🔄 Processing..."):
|
| 410 |
+
progress = st.progress(0)
|
| 411 |
+
status_text = st.empty()
|
| 412 |
+
|
| 413 |
+
try:
|
| 414 |
+
status_text.text("📝 Translating...")
|
| 415 |
+
progress.progress(33)
|
| 416 |
+
|
| 417 |
+
translated, audio_path, gif_path = process_translation_pipeline(
|
| 418 |
+
text_input,
|
| 419 |
+
source_language,
|
| 420 |
+
target_language,
|
| 421 |
+
selected_avatar
|
| 422 |
+
)
|
| 423 |
+
|
| 424 |
+
status_text.text("🗣️ Generating speech...")
|
| 425 |
+
progress.progress(66)
|
| 426 |
+
|
| 427 |
+
status_text.text("🎬 Creating animation...")
|
| 428 |
+
progress.progress(100)
|
| 429 |
+
|
| 430 |
+
progress.empty()
|
| 431 |
+
status_text.empty()
|
| 432 |
+
|
| 433 |
+
# Display translated text
|
| 434 |
+
st.markdown(
|
| 435 |
+
f'<div class="result-box">'
|
| 436 |
+
f'<h4>📜 Translated Text:</h4>'
|
| 437 |
+
f'<p style="font-size: 1.2rem;">{translated}</p>'
|
| 438 |
+
f'</div>',
|
| 439 |
+
unsafe_allow_html=True
|
| 440 |
+
)
|
| 441 |
+
|
| 442 |
+
# Audio player
|
| 443 |
+
if audio_path and os.path.exists(audio_path):
|
| 444 |
+
st.markdown("#### 🔊 Audio")
|
| 445 |
+
st.audio(audio_path, format="audio/mp3")
|
| 446 |
+
|
| 447 |
+
# Animation display
|
| 448 |
+
if gif_path and os.path.exists(gif_path):
|
| 449 |
+
st.markdown("#### 🎭 Lip-Sync Animation")
|
| 450 |
+
st.image(gif_path, width="stretch")
|
| 451 |
+
|
| 452 |
+
with open(gif_path, "rb") as f:
|
| 453 |
+
st.download_button(
|
| 454 |
+
label="📥 Download Animation",
|
| 455 |
+
data=f,
|
| 456 |
+
file_name="lipsync_animation.gif",
|
| 457 |
+
mime="image/gif"
|
| 458 |
+
)
|
| 459 |
+
else:
|
| 460 |
+
st.info("ℹ️ Animation not available (FFmpeg may be missing)")
|
| 461 |
+
|
| 462 |
+
except Exception as e:
|
| 463 |
+
progress.empty()
|
| 464 |
+
status_text.empty()
|
| 465 |
+
st.error(f"❌ Error: {str(e)}")
|
| 466 |
+
|
| 467 |
+
elif translate_btn:
|
| 468 |
+
st.warning("⚠️ Please enter some text to translate.")
|
| 469 |
+
|
| 470 |
+
|
| 471 |
+
# =============================================================================
|
| 472 |
+
# Tab 2: Voice Input
|
| 473 |
+
# =============================================================================
|
| 474 |
+
|
| 475 |
+
with tab2:
|
| 476 |
+
col1, col2 = st.columns([1, 1])
|
| 477 |
+
|
| 478 |
+
with col1:
|
| 479 |
+
st.markdown("### 🎤 Voice Recording")
|
| 480 |
+
|
| 481 |
+
st.markdown("""
|
| 482 |
+
<div class="info-box">
|
| 483 |
+
<strong>Instructions:</strong><br>
|
| 484 |
+
1. Upload an audio file (WAV, MP3, etc.)<br>
|
| 485 |
+
2. Or use the audio recorder below<br>
|
| 486 |
+
3. Click "Transcribe & Translate"
|
| 487 |
+
</div>
|
| 488 |
+
""", unsafe_allow_html=True)
|
| 489 |
+
|
| 490 |
+
uploaded_audio = st.file_uploader(
|
| 491 |
+
"Upload an audio file",
|
| 492 |
+
type=["wav", "mp3", "ogg", "flac", "m4a"],
|
| 493 |
+
help="Supported formats: WAV, MP3, OGG, FLAC, M4A"
|
| 494 |
+
)
|
| 495 |
+
|
| 496 |
+
recorded_audio = None
|
| 497 |
+
try:
|
| 498 |
+
from audio_recorder_streamlit import audio_recorder
|
| 499 |
+
st.markdown("**Or record directly:**")
|
| 500 |
+
recorded_audio = audio_recorder(
|
| 501 |
+
text="🎙️ Click to record",
|
| 502 |
+
recording_color="#e94560",
|
| 503 |
+
neutral_color="#6c757d",
|
| 504 |
+
icon_name="microphone",
|
| 505 |
+
icon_size="2x"
|
| 506 |
+
)
|
| 507 |
+
except ImportError:
|
| 508 |
+
st.info("💡 For recording: `pip install audio-recorder-streamlit`")
|
| 509 |
+
|
| 510 |
+
voice_lang = st.selectbox(
|
| 511 |
+
"Recording Language",
|
| 512 |
+
options=["en", "hi"],
|
| 513 |
+
format_func=lambda x: {"en": "🇬🇧 English", "hi": "🇮🇳 Hindi"}[x]
|
| 514 |
+
)
|
| 515 |
+
|
| 516 |
+
voice_btn = st.button(
|
| 517 |
+
"🎯 Transcribe & Translate",
|
| 518 |
+
key="voice_btn",
|
| 519 |
+
use_container_width=True
|
| 520 |
+
)
|
| 521 |
+
|
| 522 |
+
with col2:
|
| 523 |
+
st.markdown("### 🎬 Result")
|
| 524 |
+
|
| 525 |
+
audio_to_process = None
|
| 526 |
+
|
| 527 |
+
if uploaded_audio is not None:
|
| 528 |
+
temp_audio_path = TEMP_DIR / f"uploaded_{int(time.time()*1000)}.wav"
|
| 529 |
+
with open(temp_audio_path, "wb") as f:
|
| 530 |
+
f.write(uploaded_audio.getbuffer())
|
| 531 |
+
audio_to_process = str(temp_audio_path)
|
| 532 |
+
st.audio(uploaded_audio)
|
| 533 |
+
|
| 534 |
+
elif recorded_audio is not None:
|
| 535 |
+
temp_audio_path = TEMP_DIR / f"recorded_{int(time.time()*1000)}.wav"
|
| 536 |
+
with open(temp_audio_path, "wb") as f:
|
| 537 |
+
f.write(recorded_audio)
|
| 538 |
+
audio_to_process = str(temp_audio_path)
|
| 539 |
+
st.audio(recorded_audio, format="audio/wav")
|
| 540 |
+
|
| 541 |
+
if voice_btn:
|
| 542 |
+
if audio_to_process:
|
| 543 |
+
with st.spinner("🔄 Processing voice..."):
|
| 544 |
+
try:
|
| 545 |
+
st.text("🎤 Transcribing...")
|
| 546 |
+
lang_code = get_language_code(voice_lang)
|
| 547 |
+
transcribed_text, success = transcribe_audio(audio_to_process, lang_code)
|
| 548 |
+
|
| 549 |
+
if success:
|
| 550 |
+
st.markdown(
|
| 551 |
+
f'<div class="success-box">'
|
| 552 |
+
f'<strong>📝 Transcribed:</strong> {transcribed_text}'
|
| 553 |
+
f'</div>',
|
| 554 |
+
unsafe_allow_html=True
|
| 555 |
+
)
|
| 556 |
+
|
| 557 |
+
translated, audio_path, gif_path = process_translation_pipeline(
|
| 558 |
+
transcribed_text,
|
| 559 |
+
voice_lang,
|
| 560 |
+
target_language,
|
| 561 |
+
selected_avatar
|
| 562 |
+
)
|
| 563 |
+
|
| 564 |
+
st.markdown(
|
| 565 |
+
f'<div class="result-box">'
|
| 566 |
+
f'<h4>📜 Translated:</h4>'
|
| 567 |
+
f'<p style="font-size: 1.2rem;">{translated}</p>'
|
| 568 |
+
f'</div>',
|
| 569 |
+
unsafe_allow_html=True
|
| 570 |
+
)
|
| 571 |
+
|
| 572 |
+
if audio_path and os.path.exists(audio_path):
|
| 573 |
+
st.markdown("#### 🔊 Audio")
|
| 574 |
+
st.audio(audio_path, format="audio/mp3")
|
| 575 |
+
|
| 576 |
+
if gif_path and os.path.exists(gif_path):
|
| 577 |
+
st.markdown("#### 🎭 Animation")
|
| 578 |
+
st.image(gif_path, width="stretch")
|
| 579 |
+
|
| 580 |
+
with open(gif_path, "rb") as f:
|
| 581 |
+
st.download_button(
|
| 582 |
+
label="📥 Download",
|
| 583 |
+
data=f,
|
| 584 |
+
file_name="lipsync.gif",
|
| 585 |
+
mime="image/gif"
|
| 586 |
+
)
|
| 587 |
+
else:
|
| 588 |
+
st.error(f"❌ {transcribed_text}")
|
| 589 |
+
except Exception as e:
|
| 590 |
+
st.error(f"❌ Error: {str(e)}")
|
| 591 |
+
else:
|
| 592 |
+
st.warning("⚠️ Please upload or record audio first.")
|
| 593 |
+
|
| 594 |
+
|
| 595 |
+
# =============================================================================
|
| 596 |
+
# Footer
|
| 597 |
+
# =============================================================================
|
| 598 |
+
|
| 599 |
+
st.markdown("---")
|
| 600 |
+
st.markdown(
|
| 601 |
+
"""
|
| 602 |
+
<div style="text-align: center; color: #666; padding: 1rem;">
|
| 603 |
+
<p>Made By Praveen</p>
|
| 604 |
+
</div>
|
| 605 |
+
""",
|
| 606 |
+
unsafe_allow_html=True
|
| 607 |
+
)
|
avatars/sample/base.png
ADDED
|
|
avatars/sample/mouth_0.png
ADDED
|
|
avatars/sample/mouth_1.png
ADDED
|
|
avatars/sample/mouth_2.png
ADDED
|
|
requirements.txt
CHANGED
|
@@ -1,3 +1,11 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28.0
|
| 2 |
+
deep-translator>=1.11.4
|
| 3 |
+
gTTS>=2.4.0
|
| 4 |
+
pydub>=0.25.1
|
| 5 |
+
Pillow>=10.0.0
|
| 6 |
+
imageio>=2.31.0
|
| 7 |
+
numpy>=1.24.0
|
| 8 |
+
SpeechRecognition>=3.10.0
|
| 9 |
+
streamlit-webrtc>=0.47.0
|
| 10 |
+
av>=10.0.0
|
| 11 |
+
audio-recorder-streamlit>=0.0.8
|
utils/__init__.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility modules for the Anime Translator application.
|
| 3 |
+
|
| 4 |
+
This package contains:
|
| 5 |
+
- translator: Text translation between languages
|
| 6 |
+
- tts_engine: Text-to-speech synthesis
|
| 7 |
+
- lipsync: Lip-sync animation generation
|
| 8 |
+
- speech_to_text: Voice input processing
|
| 9 |
+
- avatar_manager: Avatar image management
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from .translator import translate_text, detect_language
|
| 13 |
+
from .tts_engine import synthesize_speech
|
| 14 |
+
from .lipsync import generate_lipsync_gif, audio_to_rms_chunks
|
| 15 |
+
from .speech_to_text import transcribe_audio
|
| 16 |
+
from .avatar_manager import list_avatars, ensure_sample_avatar, get_avatar_preview
|
utils/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (845 Bytes). View file
|
|
|
utils/__pycache__/avatar_manager.cpython-310.pyc
ADDED
|
Binary file (4.47 kB). View file
|
|
|
utils/__pycache__/lipsync.cpython-310.pyc
ADDED
|
Binary file (5.35 kB). View file
|
|
|
utils/__pycache__/speech_to_text.cpython-310.pyc
ADDED
|
Binary file (3.06 kB). View file
|
|
|
utils/__pycache__/translator.cpython-310.pyc
ADDED
|
Binary file (2.18 kB). View file
|
|
|
utils/__pycache__/tts_engine.cpython-310.pyc
ADDED
|
Binary file (2.3 kB). View file
|
|
|
utils/avatar_manager.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Avatar Manager Module
|
| 3 |
+
=====================
|
| 4 |
+
Handles avatar discovery, creation, and management.
|
| 5 |
+
|
| 6 |
+
Functions:
|
| 7 |
+
- ensure_sample_avatar: Create default sample avatar
|
| 8 |
+
- list_avatars: Get list of available avatars
|
| 9 |
+
- get_avatar_preview: Get preview image of an avatar
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from PIL import Image, ImageDraw
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import List, Optional
|
| 15 |
+
import numpy as np
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def ensure_sample_avatar(avatars_dir: Path) -> None:
    """
    Create a default 'sample' avatar if one does not already exist.

    Produces a simple cartoon face (base.png) plus three transparent
    mouth overlay frames (mouth_0.png .. mouth_2.png) ranging from
    nearly closed to wide open, which the lip-sync animator composites
    over the base face.

    Args:
        avatars_dir: Base directory that holds one sub-folder per avatar.

    Note:
        This is only a basic placeholder avatar; custom artwork gives
        better results.
    """
    sample_dir = avatars_dir / "sample"

    # Nothing to do if the sample avatar is already populated
    if sample_dir.exists() and any(sample_dir.iterdir()):
        return

    sample_dir.mkdir(parents=True, exist_ok=True)

    # Canvas size shared by the base image and every mouth overlay
    canvas = (512, 512)

    # --- Base face -----------------------------------------------------
    face = Image.new("RGBA", canvas, (255, 220, 200, 255))
    pen = ImageDraw.Draw(face)

    # Head outline
    pen.ellipse([56, 56, 456, 456], fill=(255, 230, 210, 255),
                outline=(200, 150, 130, 255), width=3)

    # Eye whites
    pen.ellipse([150, 180, 200, 230], fill=(255, 255, 255, 255),
                outline=(0, 0, 0, 255), width=2)
    pen.ellipse([312, 180, 362, 230], fill=(255, 255, 255, 255),
                outline=(0, 0, 0, 255), width=2)

    # Pupils
    pen.ellipse([165, 195, 185, 215], fill=(50, 50, 50, 255))
    pen.ellipse([327, 195, 347, 215], fill=(50, 50, 50, 255))

    # Eyebrows
    pen.arc([140, 150, 210, 190], start=200, end=340,
            fill=(100, 70, 50, 255), width=3)
    pen.arc([302, 150, 372, 190], start=200, end=340,
            fill=(100, 70, 50, 255), width=3)

    # Nose
    pen.polygon([(256, 250), (240, 310), (272, 310)],
                fill=(240, 200, 180, 255))

    # Simple hair arc across the top of the head
    pen.arc([40, 20, 472, 300], start=180, end=360,
            fill=(80, 50, 30, 255), width=30)

    face.save(sample_dir / "base.png")

    # --- Mouth overlays ------------------------------------------------
    # (y_offset, mouth_height): closed -> slightly open -> wide open
    openness_levels = [
        (0, 8),   # mouth_0: nearly closed
        (0, 20),  # mouth_1: slightly open
        (0, 35),  # mouth_2: wide open
    ]

    for index, (y_shift, opening) in enumerate(openness_levels):
        # Fully transparent canvas; only the mouth pixels are opaque
        overlay = Image.new("RGBA", canvas, (0, 0, 0, 0))
        mouth_pen = ImageDraw.Draw(overlay)

        top = 340 + y_shift
        left, right = 200, 312

        # Outer lip shape
        mouth_pen.ellipse(
            [left, top, right, top + opening],
            fill=(180, 80, 80, 255),
            outline=(120, 50, 50, 255),
            width=2
        )

        # Darker inner mouth for the open frames only
        if opening > 15:
            inset = 5
            mouth_pen.ellipse(
                [left + inset, top + inset,
                 right - inset, top + opening - inset],
                fill=(100, 40, 40, 255)
            )

        overlay.save(sample_dir / f"mouth_{index}.png")
| 113 |
+
|
| 114 |
+
|
| 115 |
+
def list_avatars(avatars_dir: Path) -> List[str]:
    """
    Return the sorted names of all valid avatars.

    A folder counts as a valid avatar when it contains a base.png plus
    at least one mouth_*.png overlay. The built-in 'sample' avatar is
    created first if it is missing.

    Args:
        avatars_dir: Base directory containing avatar folders.

    Returns:
        Sorted list of avatar folder names (empty if the directory
        does not exist).

    Example:
        >>> list_avatars(Path("./avatars"))
        ['anime_boy', 'anime_girl', 'sample']
    """
    # Guarantee at least the built-in sample avatar exists
    ensure_sample_avatar(avatars_dir)

    if not avatars_dir.exists():
        return []

    def _is_valid(folder: Path) -> bool:
        # An avatar needs a base face and at least one mouth overlay
        return (folder / "base.png").exists() and any(folder.glob("mouth_*.png"))

    return sorted(
        folder.name
        for folder in avatars_dir.iterdir()
        if folder.is_dir() and _is_valid(folder)
    )
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def get_avatar_preview(avatar_name: str, avatars_dir: Path) -> Optional[Image.Image]:
    """
    Build a preview image for an avatar.

    Loads the avatar's base face and, when available, composites the
    first mouth overlay on top so the preview shows a complete face.

    Args:
        avatar_name: Name of the avatar folder.
        avatars_dir: Base directory containing avatar folders.

    Returns:
        A PIL Image, or None when the avatar has no base.png.

    Example:
        >>> preview = get_avatar_preview("sample", Path("./avatars"))
        >>> preview.show()
    """
    folder = avatars_dir / avatar_name
    base_file = folder / "base.png"

    if not base_file.exists():
        return None

    face = Image.open(base_file).convert("RGBA")

    # First mouth frame (sorted, so mouth_0 when present)
    overlays = sorted(folder.glob("mouth_*.png"))
    if not overlays:
        return face

    mouth = Image.open(overlays[0]).convert("RGBA").resize(face.size)
    return Image.alpha_composite(face, mouth)
|
utils/lipsync.py
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Lip-Sync Animation Module
|
| 3 |
+
=========================
|
| 4 |
+
Generates animated GIFs with lip-sync based on audio amplitude.
|
| 5 |
+
|
| 6 |
+
Functions:
|
| 7 |
+
- audio_to_rms_chunks: Extract amplitude data from audio
|
| 8 |
+
- generate_lipsync_gif: Create lip-sync animation GIF
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from PIL import Image
|
| 12 |
+
import imageio
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
import time
|
| 15 |
+
from typing import List, Optional
|
| 16 |
+
import os
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def audio_to_rms_chunks(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Compute per-chunk RMS loudness values for an audio file.

    The audio is split into fixed-length chunks and the RMS (root mean
    square) amplitude of each chunk is returned, giving a coarse
    loudness envelope that drives mouth openness in the animation.

    Args:
        audio_path: Path to the audio file (any format pydub/ffmpeg reads).
        chunk_ms: Chunk length in milliseconds.

    Returns:
        One RMS value per chunk; a short fixed fallback pattern when the
        audio cannot be decoded (missing file, missing ffmpeg, ...).
    """
    try:
        from pydub import AudioSegment
        from pydub.utils import make_chunks

        if not os.path.exists(audio_path):
            raise FileNotFoundError(f"Audio file not found: {audio_path}")

        segment = AudioSegment.from_file(audio_path)

        # RMS per non-empty chunk
        loudness = [
            piece.rms
            for piece in make_chunks(segment, chunk_ms)
            if len(piece) > 0
        ]
        return loudness if loudness else [0]

    except Exception as e:
        print(f"Error processing audio: {e}")
        # Deterministic fallback so the animation still moves
        return [100, 200, 150, 300, 250, 100, 200, 150]
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
def audio_to_rms_chunks_simple(audio_path: str, chunk_ms: int = 80) -> List[float]:
    """
    Approximate RMS chunks without decoding the audio.

    Estimates the clip duration from the file size (assuming roughly a
    128 kbps MP3, i.e. ~16 KB/s) and synthesises a wave-shaped loudness
    pattern so the lip-sync still looks natural when pydub/ffmpeg fails.

    Args:
        audio_path: Path to the audio file (only its size is read).
        chunk_ms: Chunk length in milliseconds.

    Returns:
        Simulated RMS values (each >= 50); a fixed 10-value pattern when
        even the file size cannot be read.
    """
    import math

    try:
        size_bytes = os.path.getsize(audio_path)

        # ~128 kbps MP3 => roughly 16 KB per second of audio
        duration_sec = size_bytes / 16000

        # At least 10 chunks so very short clips still animate
        count = max(int(duration_sec * 1000 / chunk_ms), 10)

        # Two superimposed sine waves give a natural-looking envelope
        return [
            max(50, 150 + 100 * math.sin(i * 0.5) + 50 * math.sin(i * 1.2))
            for i in range(count)
        ]

    except Exception:
        return [100, 200, 150, 300, 250, 100, 200, 150, 100, 200]
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
def generate_lipsync_gif(
    avatar_name: str,
    audio_path: str,
    avatars_dir: Path,
    output_dir: Path,
    fps: int = 12,
    output_path: Optional[str] = None
) -> str:
    """
    Render a lip-sync animated GIF for an avatar speaking an audio clip.

    Pipeline:
      1. Measure audio loudness (RMS) in chunks of 1000/fps ms.
      2. Map each chunk's loudness to one of the avatar's mouth frames.
      3. Composite the chosen mouth frame over the base face.
      4. Write all frames out as a looping GIF.

    Args:
        avatar_name: Avatar folder name (e.g. 'sample').
        audio_path: Path to the audio file to synchronise with.
        avatars_dir: Base directory containing avatar folders.
        output_dir: Directory that receives the GIF.
        fps: Animation frame rate.
        output_path: Optional explicit output file path.

    Returns:
        Path of the generated GIF.

    Raises:
        FileNotFoundError: When base.png or the mouth frames are missing.
        ValueError: When no animation frames could be produced.
    """
    folder = avatars_dir / avatar_name
    base_file = folder / "base.png"
    mouth_files = sorted(folder.glob("mouth_*.png"))

    # Validate avatar assets up front
    if not base_file.exists():
        raise FileNotFoundError(f"Base image not found: {base_file}")
    if not mouth_files:
        raise FileNotFoundError(f"No mouth frames found in: {folder}")

    # Base face plus every mouth overlay, all at the same resolution
    face = Image.open(base_file).convert("RGBA")
    overlays = [
        Image.open(p).convert("RGBA").resize(face.size)
        for p in mouth_files
    ]

    # One audio chunk per animation frame
    chunk_ms = int(1000 / fps)

    # Primary amplitude analysis, with a size-estimate fallback
    try:
        loudness = audio_to_rms_chunks(audio_path, chunk_ms=chunk_ms)
    except Exception as e:
        print(f"Primary audio processing failed: {e}")
        print("Using fallback animation method...")
        loudness = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Guard against empty or completely silent analysis results
    if not loudness or all(v == 0 for v in loudness):
        loudness = audio_to_rms_chunks_simple(audio_path, chunk_ms=chunk_ms)

    # Normalisation peak (avoid division by zero)
    peak = max(loudness) if max(loudness) > 0 else 1

    frames = []
    n_overlays = len(overlays)

    for level in loudness:
        # Louder chunk -> wider mouth frame
        openness = level / peak
        idx = int(openness * (n_overlays - 1))
        idx = max(0, min(idx, n_overlays - 1))

        composed = Image.alpha_composite(face, overlays[idx])

        # GIF frames are RGB; flatten the alpha channel onto white
        flat = Image.new("RGB", composed.size, (255, 255, 255))
        flat.paste(composed, mask=composed.split()[-1] if composed.mode == 'RGBA' else None)

        frames.append(flat)

    output_dir.mkdir(parents=True, exist_ok=True)

    # Timestamped filename unless the caller chose one
    if output_path is None:
        stamp = int(time.time() * 1000)
        output_path = str(output_dir / f"lipsync_{stamp}.gif")

    if not frames:
        raise ValueError("No frames generated for animation")

    # loop=0 makes the GIF repeat forever
    imageio.mimsave(output_path, frames, fps=fps, loop=0)

    return output_path
|
utils/speech_to_text.py
ADDED
|
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Speech-to-Text Module
|
| 3 |
+
=====================
|
| 4 |
+
Converts voice/audio input to text using speech recognition.
|
| 5 |
+
|
| 6 |
+
Functions:
|
| 7 |
+
- transcribe_audio: Convert audio file to text
|
| 8 |
+
- transcribe_from_microphone: Real-time microphone transcription
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
import speech_recognition as sr
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
from typing import Optional, Tuple
|
| 14 |
+
import tempfile
|
| 15 |
+
from pydub import AudioSegment
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def transcribe_audio(
    audio_path: str,
    language: str = "en-US"
) -> Tuple[str, bool]:
    """
    Transcribe an audio file to text using Google Speech Recognition.

    Non-WAV inputs are first converted to a temporary WAV file (via
    pydub/ffmpeg); the temporary file is removed again after
    recognition.

    Args:
        audio_path: Path to the audio file.
        language: Language code for recognition
            - 'en-US' for English (US)
            - 'hi-IN' for Hindi (India)

    Returns:
        Tuple of (transcribed_text, success_flag)
        - If successful: (text, True)
        - If failed: (error_message, False)

    Example:
        >>> text, success = transcribe_audio("recording.wav", "en-US")
        >>> if success:
        ...     print(f"You said: {text}")
        ... else:
        ...     print(f"Error: {text}")

    Supported Formats:
        - WAV (recommended)
        - MP3
        - FLAC
        - OGG
    """
    recognizer = sr.Recognizer()
    temp_wav = None  # path of a converted WAV that must be cleaned up

    try:
        audio_path = Path(audio_path)

        if audio_path.suffix.lower() != '.wav':
            # Convert to WAV with pydub so speech_recognition can read it
            audio = AudioSegment.from_file(str(audio_path))

            # Reserve a temp filename, then let pydub write into it
            with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as tmp:
                temp_wav = tmp.name
            audio.export(temp_wav, format='wav')
            wav_path = temp_wav
        else:
            wav_path = str(audio_path)

        # Load audio file for recognition
        with sr.AudioFile(wav_path) as source:
            # Calibrate against background noise before recording
            recognizer.adjust_for_ambient_noise(source, duration=0.5)
            audio_data = recognizer.record(source)

        # Perform speech recognition (requires network access)
        text = recognizer.recognize_google(audio_data, language=language)

        return text, True

    except sr.UnknownValueError:
        return "Could not understand the audio. Please speak clearly.", False

    except sr.RequestError as e:
        return f"Speech recognition service error: {str(e)}", False

    except Exception as e:
        return f"Error processing audio: {str(e)}", False

    finally:
        # Bug fix: the converted temp WAV was created with delete=False
        # and previously never removed, leaking one file per call.
        if temp_wav is not None:
            try:
                Path(temp_wav).unlink()
            except OSError:
                pass
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
def get_language_code(lang: str) -> str:
    """
    Map a short language code to its full speech-recognition locale.

    Args:
        lang: Short code ('en', 'hi', or 'auto').

    Returns:
        Locale string understood by the recognizer; unknown codes and
        'auto' fall back to US English.

    Example:
        >>> get_language_code('en')
        'en-US'
        >>> get_language_code('hi')
        'hi-IN'
    """
    if lang == 'hi':
        return 'hi-IN'
    # 'en', 'auto', and anything unrecognised default to US English
    return 'en-US'
|
utils/translator.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Translation Module
|
| 3 |
+
==================
|
| 4 |
+
Handles text translation between English and Hindi using deep-translator.
|
| 5 |
+
|
| 6 |
+
Functions:
|
| 7 |
+
- detect_language: Auto-detect if text is English or Hindi
|
| 8 |
+
- translate_text: Translate text between languages
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from deep_translator import GoogleTranslator
|
| 12 |
+
from typing import Literal
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def detect_language(text: str) -> Literal["en", "hi"]:
    """
    Classify text as Hindi or English.

    Any character in the Devanagari Unicode block (U+0900-U+097F)
    marks the text as Hindi; otherwise it is treated as English.

    Args:
        text: Input text string to analyze.

    Returns:
        'hi' if Devanagari characters are found, 'en' otherwise.

    Example:
        >>> detect_language("Hello World")
        'en'
        >>> detect_language("नमस्ते")
        'hi'
    """
    has_devanagari = any('\u0900' <= ch <= '\u097F' for ch in text)
    return "hi" if has_devanagari else "en"
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def translate_text(
    text: str,
    source_lang: Literal["auto", "en", "hi"],
    target_lang: Literal["en", "hi"]
) -> str:
    """
    Translate text from source language to target language.

    Uses Google Translator via the deep-translator library for
    translations between English and Hindi. When source_lang is
    'auto', the source is detected from the text's script
    (see detect_language).

    Args:
        text: The text to translate.
        source_lang: Source language code ('auto', 'en', or 'hi').
        target_lang: Target language code ('en' or 'hi').

    Returns:
        Translated text string. Empty/whitespace-only input yields ''
        and text already in the target language is returned unchanged —
        both without contacting the translation service.
        (Note: the previous docstring claimed a ValueError was raised
        for empty input; the function has always returned '' instead.)

    Example:
        >>> translate_text("Hello", "en", "hi")
        'नमस्ते'
        >>> translate_text("नमस्ते", "auto", "en")
        'Hello'
    """
    # Empty input: nothing to translate (returned as '', not an error)
    if not text or not text.strip():
        return ""

    # Resolve 'auto' by inspecting the text's script
    if source_lang == "auto":
        source_lang = detect_language(text)

    # No network round-trip needed when languages already match
    if source_lang == target_lang:
        return text

    # Perform the translation via Google Translator
    translator = GoogleTranslator(source=source_lang, target=target_lang)
    return translator.translate(text)
|
utils/tts_engine.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Text-to-Speech Engine
|
| 3 |
+
=====================
|
| 4 |
+
Converts text to speech audio using Google Text-to-Speech (gTTS).
|
| 5 |
+
|
| 6 |
+
Functions:
|
| 7 |
+
- synthesize_speech: Convert text to MP3 audio file
|
| 8 |
+
- get_audio_duration: Get duration of audio file
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from gtts import gTTS
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
import time
|
| 14 |
+
from typing import Literal
|
| 15 |
+
import os
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def synthesize_speech(
    text: str,
    language: Literal["en", "hi"],
    output_dir: Path,
    slow: bool = False
) -> str:
    """
    Synthesize speech from text and write it to a timestamped MP3 file.

    Uses Google Text-to-Speech (gTTS); requires network access.

    Args:
        text: Text to convert to speech.
        language: 'en' for English, 'hi' for Hindi.
        output_dir: Directory for the generated file (created if needed).
        slow: If True, speak slowly (useful for language learning).

    Returns:
        Path (as a string) of the generated MP3 file.

    Raises:
        FileNotFoundError: If gTTS reported success but wrote no file.
        ValueError: If the generated file is empty.
    """
    target_dir = Path(output_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # Millisecond timestamp keeps concurrent outputs from colliding
    mp3_file = target_dir / f"tts_{int(time.time() * 1000)}.mp3"

    try:
        # Synthesize and write the audio
        gTTS(text=text, lang=language, slow=slow).save(str(mp3_file))

        # Sanity-check the result before handing the path back
        if not mp3_file.exists():
            raise FileNotFoundError(f"TTS file was not created: {mp3_file}")
        if mp3_file.stat().st_size == 0:
            raise ValueError("TTS file is empty")

        return str(mp3_file)

    except Exception as e:
        print(f"TTS Error: {e}")
        raise
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def get_audio_duration(audio_path: str) -> float:
    """
    Return the duration of an audio file in seconds.

    Tries an exact measurement with pydub first; if decoding fails
    (e.g. ffmpeg/pydub unavailable), estimates from the file size
    assuming a ~128 kbps MP3 (~16 KB per second). Returns 3.0 seconds
    when even the file size cannot be read.

    Args:
        audio_path: Path to the audio file.

    Returns:
        Duration in seconds (exact, estimated, or the 3.0 s default).
    """
    try:
        from pydub import AudioSegment
        # pydub reports length in milliseconds
        return len(AudioSegment.from_file(audio_path)) / 1000.0
    except Exception:
        pass

    # Fallback: size-based estimate (128 kbps MP3 ~= 16 KB per second)
    try:
        return os.path.getsize(audio_path) / 16000
    except Exception:
        return 3.0  # last-resort default
|