DARKWICK committed on
Commit
5d13512
·
verified ·
1 Parent(s): d3c713e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -210
app.py CHANGED
@@ -1,210 +1,140 @@
1
- import os
2
- import io
3
- import re
4
- import math
5
- import tempfile
6
- from typing import List, Tuple
7
-
8
- import gradio as gr
9
- import numpy as np
10
- from moviepy.editor import VideoFileClip
11
- from transformers import (
12
- pipeline,
13
- AutoModelForSeq2SeqLM,
14
- AutoTokenizer,
15
- M2M100ForConditionalGeneration,
16
- M2M100Tokenizer,
17
- )
18
-
19
# ---------------------------
# Model choices (balanced for CPU Spaces)
# ---------------------------
ASR_MODEL_ID = "openai/whisper-small"  # good balance of quality/speed
SUMM_MODEL_ID = "sshleifer/distilbart-cnn-12-6"  # light summarizer
PARA_MODEL_ID = "google/flan-t5-base"  # for “modernization” rewrite
TRANS_MODEL_ID = "facebook/m2m100_418M"  # many-to-many language translation

# Preload pipelines/models once (Space warm-up); everything below runs at
# import time, so the first request doesn't pay the model-load cost.
asr_pipe = pipeline(
    "automatic-speech-recognition",
    model=ASR_MODEL_ID,
    chunk_length_s=30,  # chunking helps long audio on CPU
)

summ_pipe = pipeline(
    "summarization",
    model=SUMM_MODEL_ID,
)

# Paraphrase ("modernize") model: tokenizer + seq2seq pair used directly.
para_tok = AutoTokenizer.from_pretrained(PARA_MODEL_ID)
para_model = AutoModelForSeq2SeqLM.from_pretrained(PARA_MODEL_ID)

# Translation model: M2M100 needs its dedicated tokenizer class.
m2m_tok = M2M100Tokenizer.from_pretrained(TRANS_MODEL_ID)
m2m_model = M2M100ForConditionalGeneration.from_pretrained(TRANS_MODEL_ID)

# Maps for M2M100 language codes (expand as needed).
# Keys are the human-readable labels shown in the UI dropdowns; values are
# the ISO-639-1 codes M2M100 expects for src_lang / get_lang_id().
M2M_LANGS = {
    "English": "en",
    "Arabic": "ar",
    "French": "fr",
    "German": "de",
    "Hindi": "hi",
    "Italian": "it",
    "Japanese": "ja",
    "Korean": "ko",
    "Portuguese": "pt",
    "Russian": "ru",
    "Spanish": "es",
    "Turkish": "tr",
    "Urdu": "ur",
    "Chinese (simplified)": "zh",
}
62
-
63
def _extract_audio_to_wav(video_path: str) -> Tuple[str, float]:
    """
    Extract audio to a temp .wav (mono 16k) using moviepy.

    Returns (wav_path, duration_seconds). The caller owns the temp file
    and is responsible for deleting it.

    Raises:
        ValueError: if the video has no audio track (clip.audio is None,
            which previously crashed with AttributeError).
    """
    clip = VideoFileClip(video_path)
    # Reserve a temp wav path; close the handle so ffmpeg can write to it.
    tmp_wav = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    tmp_wav.close()
    try:
        duration = float(clip.duration)
        if clip.audio is None:
            raise ValueError("Video has no audio track.")
        clip.audio.write_audiofile(
            tmp_wav.name,
            fps=16000,
            nbytes=2,
            codec="pcm_s16le",
            ffmpeg_params=["-ac", "1"],  # force mono
        )
    except Exception:
        # Don't leak the temp file when extraction fails.
        if os.path.exists(tmp_wav.name):
            os.remove(tmp_wav.name)
        raise
    finally:
        # Always release moviepy's ffmpeg readers, success or failure.
        clip.close()
    return tmp_wav.name, duration
76
-
77
def _clean_text(s: str) -> str:
    """Collapse every run of whitespace to one space and trim the ends."""
    collapsed = re.sub(r"\s+", " ", s)
    return collapsed.strip()
79
-
80
def transcribe_video(video_file) -> Tuple[str, float]:
    """
    Run Whisper ASR over the uploaded video's audio track.

    Returns (clean_transcript, duration_seconds); ("", 0.0) when no file
    was provided.
    """
    if video_file is None:
        return "", 0.0
    wav_path, duration = _extract_audio_to_wav(video_file)
    try:
        result = asr_pipe(wav_path)
        if isinstance(result, dict):
            text = result["text"]
        else:
            text = str(result)
        return _clean_text(text), duration
    finally:
        # The temp wav is ours to clean up, success or failure.
        if os.path.exists(wav_path):
            os.remove(wav_path)
94
-
95
def translate_text_m2m(txt: str, src_code: str, tgt_code: str, max_len=1024) -> str:
    """
    Translate using M2M100. Handles long text by chunking on sentence boundaries.

    Args:
        txt: source text (may be long; it is split into sentence chunks).
        src_code / tgt_code: M2M100 ISO language codes (see M2M_LANGS).
        max_len: tokenizer/generation length cap per chunk.

    Returns the translated chunks re-joined with spaces ("" for blank input).
    """
    if not txt.strip():
        return ""
    # Conservative chunk size: characters roughly over-count tokens 2:1.
    chunks = smart_sentence_chunks(txt, max_len=max_len // 2)
    # Fix: src_lang is loop-invariant — set it once instead of per chunk.
    m2m_tok.src_lang = src_code
    outputs = []
    for ch in chunks:
        encoded = m2m_tok(ch, return_tensors="pt", truncation=True, max_length=max_len)
        generated_tokens = m2m_model.generate(
            **encoded,
            # forced_bos_token_id steers M2M100 to the target language.
            forced_bos_token_id=m2m_tok.get_lang_id(tgt_code),
            max_length=max_len,
            num_beams=4,
        )
        outputs.append(m2m_tok.batch_decode(generated_tokens, skip_special_tokens=True)[0])
    return _clean_text(" ".join(outputs))
114
-
115
def summarize_text(txt: str) -> str:
    """Summarize text, chunking long input and re-squeezing the result."""
    if not txt.strip():
        return ""
    # Chunk long text to stay under the summarizer's input limits.
    partial = [
        summ_pipe(chunk, max_length=180, min_length=60, do_sample=False)[0]["summary_text"]
        for chunk in smart_sentence_chunks(txt, max_len=900)
    ]
    joined = " ".join(partial)
    # Multiple chunk summaries can still be long — do one final pass.
    if len(joined.split()) > 300:
        joined = summ_pipe(joined, max_length=220, min_length=80, do_sample=False)[0]["summary_text"]
    return _clean_text(joined)
129
-
130
def modernize_text(txt: str, style: str = "concise") -> str:
    """Paraphrase / modernize the text via a FLAN-T5 instruction prompt."""
    if not txt.strip():
        return ""
    prompt = (
        "Rewrite the text into modern, clear, and natural language. "
        "Preserve meaning and important details. Style: " + style + ".\n\nText:\n" + txt
    )
    inputs = para_tok(prompt, return_tensors="pt", truncation=True, max_length=2048)
    outputs = para_model.generate(**inputs, max_length=512, num_beams=4)
    decoded = para_tok.decode(outputs[0], skip_special_tokens=True)
    return _clean_text(decoded)
143
-
144
# ------------ Text / SRT helpers ------------
def smart_sentence_chunks(text: str, max_len: int = 800) -> List[str]:
    """
    Greedily pack sentences into chunks of at most ~max_len characters.

    Sentences come from a crude end-of-punctuation split; a single sentence
    longer than max_len still becomes its own (oversized) chunk.
    """
    sentences = re.split(r'(?<=[.!?])\s+', _clean_text(text))
    chunks: List[str] = []
    current = ""
    for sentence in sentences:
        if len(current) + len(sentence) + 1 <= max_len:
            current = (current + " " + sentence).strip()
            continue
        if current:
            chunks.append(current)
        current = sentence
    if current:
        chunks.append(current)
    return chunks
162
-
163
def make_naive_srt(transcript: str, total_seconds: float) -> str:
    """
    Build a "good enough" SRT by assigning equal time slices per sentence.

    We have no per-token timestamps, so every sentence gets the same window:
    at least 1.5 s each, or a flat 3 s when the total duration is unknown.
    """
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', _clean_text(transcript)) if s]
    count = max(1, len(sentences))
    # Avoid too-short windows: min 1.5s per sentence.
    slice_len = max(1.5, total_seconds / count) if total_seconds > 0 else 3.0
    lines = []
    cursor = 0.0
    for idx, sentence in enumerate(sentences, start=1):
        begin = cursor
        finish = cursor + slice_len
        cursor = finish
        lines.extend([
            str(idx),
            f"{_fmt_srt_time(begin)} --> {_fmt_srt_time(finish)}",
            sentence,
            "",  # blank separator required by the SRT format
        ])
    return "\n".join(lines).strip()
183
-
184
def _fmt_srt_time(sec: float) -> str:
    """Format non-negative seconds as an SRT timestamp "HH:MM:SS,mmm"."""
    sec = max(0.0, sec)
    whole = int(sec)
    ms = int((sec - whole) * 1000)
    m, s = divmod(whole, 60)
    h, m = divmod(m, 60)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"
191
-
192
# ------------ Gradio Handlers ------------
def ui_video_translate(video, src_lang, tgt_lang):
    """Gradio handler: transcribe a video, translate it, and build an SRT."""
    if video is None:
        hidden_file = gr.update(value=b"", visible=False)
        return gr.update(value=""), gr.update(value=""), hidden_file, gr.update(value="")
    source_code = M2M_LANGS[src_lang]
    target_code = M2M_LANGS[tgt_lang]
    transcript, duration = transcribe_video(video)
    translated = translate_text_m2m(transcript, source_code, target_code)
    srt_text = make_naive_srt(translated, duration)
    # Wrap the SRT text as an in-memory file for the download widget.
    srt_file = io.BytesIO(srt_text.encode("utf-8"))
    srt_file.name = "subtitles_translated.srt"
    return transcript, translated, srt_file, srt_text
206
-
207
- def ui_video_summarize(video, lang_hint):
208
- if video is None:
209
- return "", ""
210
- # Transcribe then summarize (lang-hint doesn’t constrain
 
1
+ import os
2
+ import tempfile
3
+ from pathlib import Path
4
+ import gradio as gr
5
+ import yt_dlp
6
+ from faster_whisper import WhisperModel
7
+
8
# -------- Settings you can tweak --------
# Whisper checkpoint loaded by default; override with the WHISPER_MODEL env var.
DEFAULT_MODEL = os.getenv("WHISPER_MODEL", "small") # small | medium | large-v3 (requires more RAM)
# faster-whisper (CTranslate2) precision/quantization mode.
COMPUTE_TYPE = os.getenv("COMPUTE_TYPE", "int8") # int8 | int8_float16 | float16 | float32
# Videos longer than this (seconds) are rejected before transcription.
MAX_DURATION_SEC = int(os.getenv("MAX_DURATION_SEC", "1800")) # 30 min cap to keep things predictable
# ---------------------------------------
13
+
14
# Lazy-load model once per container
_model = None

def get_model():
    """Return the process-wide WhisperModel, instantiating it lazily."""
    global _model
    if _model is not None:
        return _model
    _model = WhisperModel(DEFAULT_MODEL, compute_type=COMPUTE_TYPE)
    return _model
21
+
22
def _download_youtube_audio(url: str, workdir: str) -> str:
    """
    Download YouTube audio and convert to WAV mono 16 kHz using FFmpegExtractAudio.

    The duration cap is enforced with a metadata-only probe *before* any
    media is downloaded (the original checked after download=True, paying
    for the full transfer just to reject the video).

    Returns path to the WAV file.

    Raises:
        gr.Error: if the video exceeds MAX_DURATION_SEC or no WAV was produced.
    """
    outtmpl = str(Path(workdir) / "%(id)s.%(ext)s")
    ydl_opts = {
        "format": "bestaudio/best",
        "outtmpl": outtmpl,
        "noplaylist": True,
        "quiet": True,
        "no_warnings": True,
        "postprocessors": [
            {
                "key": "FFmpegExtractAudio",
                "preferredcodec": "wav",
                "preferredquality": "5",
            }
        ],
        # ensure mono @ 16 kHz
        "postprocessor_args": ["-ac", "1", "-ar", "16000"],
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        # Probe first (no download) so over-long videos are rejected cheaply.
        info = ydl.extract_info(url, download=False)
        duration = info.get("duration") or 0
        if duration and duration > MAX_DURATION_SEC:
            raise gr.Error(f"Video too long ({duration//60} min). Max allowed is {MAX_DURATION_SEC//60} min.")
        info = ydl.extract_info(url, download=True)

    # Prefer the exact file our outtmpl produced for this video id; fall
    # back to globbing, since some extractors vary the output name.
    expected = Path(workdir) / f"{info.get('id', '')}.wav"
    if expected.is_file():
        return str(expected)
    wavs = list(Path(workdir).glob("*.wav"))
    if not wavs:
        raise gr.Error("Audio extraction failed. Try a different video.")
    return str(wavs[0])
56
+
57
+
58
def _write_srt(segments, path: str):
    """
    Write transcription segments to *path* in SRT format (UTF-8).

    Each segment must expose .start / .end (seconds as float) and .text,
    matching faster-whisper's Segment objects.
    """
    def srt_timestamp(t):
        # t in seconds -> "HH:MM:SS,mmm". Rounding the TOTAL milliseconds
        # first avoids float-truncation artifacts: the original
        # int((t - int(t)) * 1000) turned e.g. 0.001 s into 0 ms.
        total_ms = max(0, int(round(t * 1000)))
        s, ms = divmod(total_ms, 1000)
        m, s = divmod(s, 60)
        h, m = divmod(m, 60)
        return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

    with open(path, "w", encoding="utf-8") as f:
        for i, seg in enumerate(segments, start=1):
            f.write(f"{i}\n")
            f.write(f"{srt_timestamp(seg.start)} --> {srt_timestamp(seg.end)}\n")
            f.write(seg.text.strip() + "\n\n")
72
+
73
def transcribe(youtube_url, upload_file, model_size, language, translate_to_english):
    """
    Gradio handler: turn a YouTube URL or uploaded media file into text + SRT.

    Returns (full_transcript, srt_file_path).

    Raises:
        gr.Error: when neither input is provided, or download/extraction fails.
    """
    if not youtube_url and not upload_file:
        raise gr.Error("Provide a YouTube URL or upload a file.")

    # Update model on-the-fly if user changes it
    global _model
    if _model is None or getattr(_model, "_model_size", None) != model_size:
        _model = WhisperModel(model_size, compute_type=COMPUTE_TYPE)
        _model._model_size = model_size  # tag for reuse

    with tempfile.TemporaryDirectory() as td:
        if youtube_url:
            audio_path = _download_youtube_audio(youtube_url.strip(), td)
        else:
            # Gradio may hand us a plain filepath string (v4 default) or a
            # file-like object with .name/.read() (v3). The original called
            # upload_file.read() unconditionally, which breaks on v4.
            # faster-whisper/ffmpeg decodes containers directly, so no
            # conversion is needed either way.
            if isinstance(upload_file, str):
                audio_path = upload_file
            elif hasattr(upload_file, "read"):
                src = Path(td) / Path(upload_file.name).name
                with open(src, "wb") as w:
                    w.write(upload_file.read())
                audio_path = str(src)
            else:
                audio_path = str(upload_file.name)

        # Transcribe (segments is a lazy generator)
        segments, info = _model.transcribe(
            audio_path,
            language=None if language == "auto" else language,
            task="translate" if translate_to_english else "transcribe",
            vad_filter=True
        )

        # Materialize while the temp audio file still exists.
        segs = list(segments)

    full_text = "".join(s.text for s in segs).strip()
    # Fix: write the SRT OUTSIDE the temporary directory. The original
    # returned a path inside `td`, which is deleted when the `with` block
    # exits — Gradio could never serve that file.
    srt_fd, srt_path = tempfile.mkstemp(suffix=".srt", prefix="subtitles_")
    os.close(srt_fd)
    _write_srt(segs, srt_path)
    return full_text, srt_path
108
+
109
# ---- Gradio UI ----
with gr.Blocks(title="YouTube → Text (Whisper)") as demo:
    gr.Markdown("## 🎬 YouTube → 📝 Text\nPaste a YouTube link **or** upload a media file to get a transcript.")
    with gr.Row():
        youtube_url = gr.Textbox(label="YouTube URL", placeholder="https://www.youtube.com/watch?v=...")
    with gr.Row():
        upload_file = gr.File(label="Or upload a video/audio file", file_count="single")
    with gr.Row():
        # Model size, language, and translate toggle share one row.
        model_size = gr.Dropdown(
            ["small", "medium", "large-v3"],
            value=DEFAULT_MODEL,
            label="Model size (larger = more accurate, slower)"
        )
        language = gr.Dropdown(
            ["auto","en","ar","fr","de","es","hi","ur","fa","ru","zh"],
            value="auto",
            label="Language (auto-detect or force)"
        )
        translate_to_english = gr.Checkbox(value=False, label="Translate to English")

    run_btn = gr.Button("Transcribe", variant="primary")
    transcript = gr.Textbox(label="Transcript", lines=12)
    srt_file = gr.File(label="Download SRT (subtitles)")

    # Wire the button to the transcribe() handler defined above.
    run_btn.click(
        transcribe,
        inputs=[youtube_url, upload_file, model_size, language, translate_to_english],
        outputs=[transcript, srt_file]
    )

# Bind to all interfaces on the standard HF Spaces port.
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)