Offex's picture
Update app.py
67748bb verified
import gradio as gr
import yt_dlp
import os
import shutil
import subprocess
import tempfile
from faster_whisper import WhisperModel
from indic_transliteration import sanscript
from indic_transliteration.sanscript import transliterate
import torch
# ===============================
# 🔒 GLOBALS & CONFIG
# ===============================
# Directory handed to WhisperModel as ``download_root`` by load_whisper_model().
MODEL_CACHE_DIR = "/tmp/qwen_whisper_cache"
# Created at import time; exist_ok=True makes repeated imports harmless.
os.makedirs(MODEL_CACHE_DIR, exist_ok=True)
# Lazy-loaded model (shared across calls); stays None until load_whisper_model()
# runs for the first time.
_model = None
def load_whisper_model():
    """Return the shared Whisper model, building it on the first call.

    The instance is stored in the module-level ``_model`` so later calls
    return the same object without constructing it again.
    """
    global _model
    if _model is not None:
        return _model

    print("📥 Loading Whisper 'base' model (CPU/int8)...")
    model_options = {
        "device": "cpu",
        "compute_type": "int8",
        "download_root": MODEL_CACHE_DIR,
    }
    _model = WhisperModel("base", **model_options)
    print("✅ Model loaded.")
    return _model
def get_ffmpeg():
    """Return the ffmpeg executable to invoke.

    Uses whatever ``shutil.which`` finds on PATH and falls back to
    ``/usr/bin/ffmpeg`` when nothing is found.
    """
    located = shutil.which("ffmpeg")
    if located:
        return located
    return "/usr/bin/ffmpeg"
# ===============================
# 📥 SAFE DOWNLOAD (YouTube, TikTok, etc.)
# ===============================
def download_video(url):
    """Download the media behind *url* with yt-dlp.

    Args:
        url: Page or media URL understood by yt-dlp.

    Returns:
        A ``(video_path, title)`` tuple. ``title`` falls back to
        ``"Untitled"`` when the extractor reports none.

    Raises:
        RuntimeError: If the URL is empty, yt-dlp fails, or no output file
            was produced. The underlying exception, when there is one, is
            attached as ``__cause__``.

    NOTE: the output path is a single fixed file in the system temp
    directory, so overlapping calls overwrite each other's download.
    """
    if not url or not str(url).strip():
        raise RuntimeError("Download failed: empty URL")
    url = str(url).strip()

    video_path = os.path.join(tempfile.gettempdir(), "downloaded_video.mp4")
    # Remove any leftover from a previous call so the existence check below
    # really reflects this download.
    if os.path.exists(video_path):
        os.remove(video_path)

    ydl_opts = {
        "format": "bestvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
        "outtmpl": video_path,
        "quiet": True,
        "nocheckcertificate": True,
        "noplaylist": True,
        "extract_audio": False,
        "retries": 10,
        "fragment_retries": 10,
    }

    # Only the yt-dlp call lives inside the try, so our own "no file" error
    # below is not re-wrapped into "Download failed: Download failed: ...".
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            info = ydl.extract_info(url, download=True)
    except Exception as e:
        raise RuntimeError(f"Download failed: {str(e)}") from e

    # Ensure file exists
    if not os.path.exists(video_path):
        raise RuntimeError("Download failed: no file created")

    # Guard against a missing info dict or a None title; the caller slices
    # the title as a string.
    title = (info or {}).get("title") or "Untitled"
    return video_path, title
# ===============================
# 🎧 EXTRACT AUDIO (robust)
# ===============================
def extract_audio(video_path, timeout=60):
    """Extract a mono 16 kHz PCM WAV track from *video_path* using ffmpeg.

    Args:
        video_path: Path of the input media file.
        timeout: Seconds to wait for ffmpeg before giving up (default 60,
            matching the previous hard-coded limit).

    Returns:
        Path of the extracted WAV file in the system temp directory.

    Raises:
        RuntimeError: If ffmpeg cannot be started, exits non-zero, times
            out, or produces a missing / too-small (< 5000 bytes) file.

    NOTE: the output path is a single fixed file in the system temp
    directory, so overlapping calls overwrite each other's audio.
    """
    audio_path = os.path.join(tempfile.gettempdir(), "extracted_audio.wav")
    if os.path.exists(audio_path):
        os.remove(audio_path)

    cmd = [
        get_ffmpeg(),
        # Keep stderr limited to real errors so a truncated error message
        # shows the failure rather than the version banner.
        "-hide_banner",
        "-loglevel", "error",
        # Never let ffmpeg wait on stdin when run from a server process.
        "-nostdin",
        "-y",
        "-i", video_path,
        "-vn",
        "-ac", "1",
        "-ar", "16000",
        "-c:a", "pcm_s16le",
        audio_path,
    ]

    try:
        result = subprocess.run(cmd, capture_output=True, text=True, timeout=timeout)
    except subprocess.TimeoutExpired as e:
        raise RuntimeError(f"Audio extraction timed out (>{timeout}s)") from e
    except OSError as e:
        # e.g. the ffmpeg binary does not exist or is not executable.
        raise RuntimeError(f"FFmpeg could not be started: {e}") from e

    if result.returncode != 0:
        raise RuntimeError(f"FFmpeg failed: {result.stderr}")
    if not os.path.exists(audio_path) or os.path.getsize(audio_path) < 5000:
        raise RuntimeError("Audio extraction produced empty/invalid file")
    return audio_path
# ===============================
# 🌐 LANGUAGE-AWARE TRANSLITERATION & NORMALIZATION
# ===============================
def normalize_to_hindi(text):
    """Convert any script to Devanagari + clean up.

    Args:
        text: Raw transcript text; ``None`` and blank strings are accepted.

    Returns:
        The cleaned text containing only Devanagari characters, spaces and
        basic punctuation, ending in a sentence terminator. Returns ``""``
        when there is nothing to normalize or nothing survives cleaning.
    """
    import re  # local import: the visible module header does not import re

    if not text or not text.strip():
        return ""

    # Step 1: Transliterate non-Devanagari scripts to Devanagari.
    # Each source scheme is attempted on its own so that a failure in one
    # (including the scheme constant not existing in the installed library
    # -- TODO confirm sanscript.ARABIC is provided) does not skip the other.
    # Arabic -> Devanagari targets Urdu; ITRANS -> Devanagari targets
    # romanized Hindi/English.
    for scheme_name in ("ARABIC", "ITRANS"):
        try:
            source_scheme = getattr(sanscript, scheme_name)
            text = transliterate(text, source_scheme, sanscript.DEVANAGARI)
        except Exception:
            # Deliberate best-effort: keep the text from the previous step.
            continue

    # Step 2: Clean punctuation & spacing.
    # Collapse whitespace first so newlines/tabs become spaces instead of
    # being deleted by the character filter (which would merge words).
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^\u0900-\u097F\u0020\u002E\u002C\u003F\u0021\u003B\u003A\u002D\u0028\u0029]', '', text)
    # Removing characters can leave doubled spaces behind.
    text = re.sub(r'\s+', ' ', text).strip()
    # Collapse runs of repeated terminators of any length ("..." -> ".").
    text = re.sub(r'\.(?:\s*\.)+', '.', text)
    text = re.sub(r'\?(?:\s*\?)+', '?', text)
    text = re.sub(r'!(?:\s*!)+', '!', text)

    # Step 3: Add a danda at the end if no terminator is present.
    if text and text[-1] not in "।.!?":
        text += "।"
    return text
# ===============================
# 🎯 CORE TRANSCRIBE FUNCTION (ALWAYS OUTPUT HINDI)
# ===============================
def transcribe_to_hindi(url=None, file=None, lang_choice="Auto Detect"):
    """Transcribe a URL or uploaded file and return Devanagari text.

    Args:
        url: Media URL to download, used only when *file* is falsy.
        file: Uploaded media, either a filesystem path or an object that
            exposes the path as ``.name`` (e.g. a tempfile wrapper).
        lang_choice: Accepted so the UI can pass its dropdown value; it is
            not used, since Whisper is always run with language
            auto-detection.

    Returns:
        A display string: a header (title + detected language) followed by
        the normalized transcript, or a user-facing warning/error message.
        Exceptions are never propagated to the caller.
    """
    try:
        # ======== INPUT HANDLING ========
        if file:
            # Uploads may arrive as a path or as a file-like object carrying
            # the path in ``.name``; os.path functions need the path itself.
            if isinstance(file, (str, os.PathLike)):
                file_path = os.fspath(file)
            else:
                file_path = getattr(file, "name", None)
            if not file_path:
                return "⚠️ Please paste a URL or upload a file."

            ext = os.path.splitext(file_path)[1].lower()
            if ext in (".mp3", ".wav", ".m4a", ".ogg"):
                # Already audio: feed it to Whisper directly.
                audio_path = file_path
            else:
                audio_path = extract_audio(file_path)
            title = os.path.basename(file_path)
        elif url and str(url).strip():
            video_path, title = download_video(str(url).strip())
            audio_path = extract_audio(video_path)
        else:
            return "⚠️ Please paste a URL or upload a file."

        # Safety check
        if not os.path.exists(audio_path) or os.path.getsize(audio_path) < 5000:
            return "❌ Audio file too small or missing. Try again."

        # ======== TRANSCRIPTION ========
        model = load_whisper_model()
        segments, info = model.transcribe(
            audio_path,
            beam_size=5,
            best_of=3,
            patience=1.0,
            temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0),
            vad_filter=True,
            word_timestamps=False,
            language=None,  # Auto-detect
        )
        raw_text = " ".join(seg.text for seg in segments).strip()

        # ======== FORCE HINDI OUTPUT ========
        # Even if detected language is en/ur/tam, convert to Hindi script
        final_text = normalize_to_hindi(raw_text)

        # Optional: Add title & metadata
        title = title or "Untitled"  # a None title would break the slicing below
        header = f"🎬 {title[:50]}{'...' if len(title) > 50 else ''}\n"
        header += f"🌍 Detected: {info.language or 'Unknown'} → 🇮🇳 Output: Hindi (Devanagari)\n\n"
        return header + final_text
    except Exception as e:
        # UI boundary: turn every failure into a readable message.
        err_msg = str(e).lower()
        if "instagram" in err_msg:
            return (
                "❌ Instagram URLs are blocked on Hugging Face.\n\n"
                "✅ Solution: Download the video manually (e.g., via online downloader), then upload it here."
            )
        elif "timeout" in err_msg or "network" in err_msg:
            return "⚠️ Network timeout. Try again or upload file directly."
        else:
            return f"❌ Error: {str(e)[:200]}..."
# ===============================
# 🎨 MODERN UI (HUGGING FACE OPTIMIZED)
# ===============================
# Custom stylesheet passed to gr.Blocks(css=...).
# Fixed three declarations that were syntactically invalid CSS:
#   - `backdrop-filter:);`          -> a blur() value (the exact radius is a
#                                      choice; the original value was lost)
#   - `rgba(37, 117252, 0.4)`       -> rgba(37, 117, 252, 0.4), i.e. #2575fc
#   - `padding: 3px 10;`            -> `3px 10px` (unitless non-zero length)
CSS = """
/* Glassmorphism + Dark Gradient */
body {
    background: radial-gradient(circle at top, #0c1445, #1a2a6c, #2c3e50);
    font-family: 'Inter', system-ui, sans-serif;
}
.glass-card {
    background: rgba(255, 255, 255, 0.07);
    -webkit-backdrop-filter: blur(12px);
    backdrop-filter: blur(12px);
    border-radius: 20px; padding: 28px;
    box-shadow: 0 12px 32px rgba(0, 0, 0, 0.4);
    border: 1px solid rgba(255, 255, 255, 0.1);
}
.gr-button-primary {
    background: linear-gradient(135deg, #6a11cb 0%, #2575fc 100%);
    border: none;
    color: white;
    font-weight: 600;
    padding: 12px 24px;
    border-radius: 12px;
    transition: all 0.3s ease;
}
.gr-button-primary:hover {
    transform: translateY(-2px);
    box-shadow: 0 6px 15px rgba(37, 117, 252, 0.4);
}
.gr-input, .gr-textarea, .gr-dropdown {
    background: rgba(255, 255, 255, 0.08) !important;
    color: #e0e0ff !important;
    border: 1px solid rgba(255, 255, 255, 0.15) !important;
    border-radius: 10px;
}
.gr-markdown p, .gr-markdown h2 {
    color: #f0f4ff !important;
}
footer { display: none !important; }
.title {
    font-size: 2.2rem;
    font-weight: 800;
    background: linear-gradient(90deg, #ffd700, #ff8c00);
    -webkit-background-clip: text;
    background-clip: text;
    color: transparent;
    margin-bottom: 12px;
}
.subtitle {
    color: #a0d2eb;
    font-size: 1.1rem;
    margin-bottom: 24px;
}
.feature-badge {
    display: inline-block;
    background: rgba(106, 17, 203, 0.3);
    color: #ffd700;
    padding: 3px 10px;
    border-radius: 20px;
    font-size: 0.85rem;
    margin: 0 4px;
}"""
# Build the Gradio UI: a URL tab and a file-upload tab that both feed
# transcribe_to_hindi and write into one shared output textbox.
with gr.Blocks(
    css=CSS,
    # Built-in hue names are used here: gr.themes.Color requires the full
    # shade set (c50 ... c950), so constructing it with only two or three
    # shades fails when the module is loaded.
    theme=gr.themes.Default(
        primary_hue="indigo",
        secondary_hue="orange",
        neutral_hue="slate",
    ),
    title="🗣️ AI Hindi Transcript Studio",
) as demo:
    with gr.Column(elem_classes=["glass-card"]):
        gr.HTML("<div class='title'>AI Hindi Transcript Studio</div>")
        gr.HTML("<div class='subtitle'>Upload or paste any video → Get clean Devanagari Hindi transcript instantly</div>")
        gr.Markdown(
            "✨ Supports: YouTube, TikTok, Facebook, Twitter/X, Instagram (via upload), local files<br>"
            "⚡ Zero ffprobe errors • Auto-script conversion • Real-time cleanup"
        )
        with gr.Tabs():
            with gr.TabItem("🔗 URL"):
                url_input = gr.Textbox(
                    label="🎥 Video URL",
                    placeholder="https://youtu.be/...",
                    info="Instagram? Upload file instead (HF restriction)",
                )
                btn_url = gr.Button("🔊 Transcribe to Hindi", variant="primary", size="lg")
            with gr.TabItem("📂 File"):
                file_input = gr.File(
                    label="📁 Upload Video/Audio",
                    file_types=["video", "audio"],
                    info="MP4, MOV, MP3, WAV, M4A, etc.",
                )
                btn_file = gr.Button("📖 Convert to Hindi", variant="primary", size="lg")
        lang_dummy = gr.Dropdown(
            choices=["Auto (→ Hindi)"],
            value="Auto (→ Hindi)",
            interactive=False,
            visible=False,
        )  # Hidden — we force Hindi output
        output_box = gr.Textbox(
            label="📝 Hindi Transcript (Devanagari)",
            lines=16,
            max_lines=25,
            show_copy_button=True,
            interactive=False,
            elem_classes=["gr-textarea"],
        )
        gr.Markdown(
            "<div style='text-align:center; margin-top:20px; color:#a0d2eb; font-size:0.9rem;'>"
            "🚀 Powered by Faster-Whisper + Indic Transliteration | Deployed on Hugging Face Spaces"
            "</div>"
        )

    # Event bindings.
    # Each button fills the argument it does not own with a None-valued
    # State, so transcribe_to_hindi always receives (url, file, lang_choice).
    btn_url.click(
        fn=transcribe_to_hindi,
        inputs=[url_input, gr.State(None), lang_dummy],
        outputs=output_box,
    )
    btn_file.click(
        fn=transcribe_to_hindi,
        inputs=[gr.State(None), file_input, lang_dummy],
        outputs=output_box,
    )

# Optional: Enable queue for HF Spaces
# NOTE(review): `concurrency_count` is the Gradio 3.x spelling of this
# setting — confirm against the pinned Gradio version.
demo.queue(concurrency_count=2, max_size=10)

if __name__ == "__main__":
    # Bind on all interfaces at port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)