Spaces:

PlotweaverModel
/

Live_Commentary_App

Running

App Files Files Community

PlotweaverModel commited on 9 days ago

Commit

f7b3ceb

verified ·

1 Parent(s): 4ebac7c

Delete pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +0 -260

pipeline.py DELETED Viewed

@@ -1,260 +0,0 @@
-"""
-Core pipeline: ASR (Whisper) + MT (NLLB-200) functions.
-TTS is handled by tts_engine.py.
-"""
-import torch
-import numpy as np
-import re
-import time
-import os
-import subprocess
-import tempfile
-import logging
-import soundfile as sf
-logger = logging.getLogger(__name__)
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
-# Models (loaded once at startup)
-asr_pipe = None
-mt_tokenizer = None
-mt_model = None
-tts_pipe_local = None  # Local TTS for Yoruba/Hausa/Igbo/Zulu
-def load_models():
-    """Load all models at startup."""
-    global asr_pipe, mt_tokenizer, mt_model, tts_pipe_local
-    from transformers import (
-        pipeline as hf_pipeline,
-        AutoTokenizer,
-        AutoModelForSeq2SeqLM,
-    )
-    print(f"Device: {DEVICE} | Dtype: {TORCH_DTYPE}")
-    print("Loading models...")
-    # ASR
-    ASR_MODEL_ID = "PlotweaverAI/whisper-small-de-en"
-    print(f"  Loading ASR: {ASR_MODEL_ID}")
-    asr_pipe = hf_pipeline(
-        "automatic-speech-recognition",
-        model=ASR_MODEL_ID,
-        device=DEVICE,
-        torch_dtype=TORCH_DTYPE,
-    )
-    print("  ASR loaded")
-    # MT
-    MT_MODEL_ID = "PlotweaverAI/nllb-200-distilled-600M-african-6lang"
-    print(f"  Loading MT: {MT_MODEL_ID}")
-    mt_tokenizer = AutoTokenizer.from_pretrained(MT_MODEL_ID)
-    mt_model = AutoModelForSeq2SeqLM.from_pretrained(
-        MT_MODEL_ID, torch_dtype=TORCH_DTYPE
-    ).to(DEVICE)
-    mt_tokenizer.src_lang = "eng_Latn"
-    print("  MT loaded")
-    # Local TTS (Yoruba)
-    TTS_MODEL_ID = "PlotweaverAI/yoruba-mms-tts-new"
-    print(f"  Loading local TTS: {TTS_MODEL_ID}")
-    tts_pipe_local = hf_pipeline(
-        "text-to-speech",
-        model=TTS_MODEL_ID,
-        device=DEVICE,
-        torch_dtype=TORCH_DTYPE,
-    )
-    print("  Local TTS loaded")
-    # Diagnostics
-    print(f"\n=== Device diagnostics ===")
-    print(f"CUDA available: {torch.cuda.is_available()}")
-    if torch.cuda.is_available():
-        print(f"CUDA device: {torch.cuda.get_device_name(0)}")
-    print(f"ASR on: {next(asr_pipe.model.parameters()).device}")
-    print(f"MT on: {next(mt_model.parameters()).device}")
-    print(f"TTS on: {next(tts_pipe_local.model.parameters()).device}")
-    print(f"YourVoic API key: {'set' if os.environ.get('YOURVOIC_API_KEY') else 'NOT SET'}")
-    print(f"==========================\n")
-    print("All models loaded!")
-# ---- Text Processing ----
-def split_into_sentences(text):
-    """Split raw ASR text into individual sentences."""
-    text = text.strip()
-    if not text:
-        return []
-    text = '. '.join(s.strip().capitalize() for s in text.split('. ') if s.strip())
-    if re.search(r'[.!?]', text):
-        sentences = re.split(r'(?<=[.!?])\s+', text)
-        return [s.strip() for s in sentences if s.strip()]
-    words = text.split()
-    MAX_WORDS = 12
-    sentences = []
-    for i in range(0, len(words), MAX_WORDS):
-        chunk = ' '.join(words[i:i + MAX_WORDS])
-        if not chunk.endswith(('.', '!', '?')):
-            chunk += '.'
-        chunk = chunk[0].upper() + chunk[1:] if len(chunk) > 1 else chunk.upper()
-        sentences.append(chunk)
-    return sentences
-# ---- ASR ----
-def transcribe(audio_array, sample_rate=16000):
-    """ASR: English audio to text. Handles both short and long audio."""
-    if len(audio_array) < 1600:
-        return ""
-    duration_s = len(audio_array) / sample_rate
-    if sample_rate != 16000:
-        import torchaudio.functional as F_audio
-        audio_tensor = torch.from_numpy(audio_array).float()
-        audio_tensor = F_audio.resample(audio_tensor, sample_rate, 16000)
-        audio_array = audio_tensor.numpy()
-        sample_rate = 16000
-    if duration_s <= 28:
-        result = asr_pipe(
-            {"raw": audio_array, "sampling_rate": sample_rate},
-            return_timestamps=False,
-        )
-        return result["text"].strip()
-    # Long-form: native Whisper generate
-    model = asr_pipe.model
-    processor = asr_pipe.feature_extractor
-    tokenizer = asr_pipe.tokenizer
-    inputs = processor(
-        audio_array, sampling_rate=16000, return_tensors="pt",
-        truncation=False, padding="longest", return_attention_mask=True,
-    )
-    input_features = inputs.input_features.to(DEVICE, dtype=TORCH_DTYPE)
-    attention_mask = inputs.attention_mask.to(DEVICE) if "attention_mask" in inputs else None
-    generate_kwargs = {"return_timestamps": True, "language": "en", "task": "transcribe"}
-    if attention_mask is not None:
-        generate_kwargs["attention_mask"] = attention_mask
-    with torch.no_grad():
-        predicted_ids = model.generate(input_features, **generate_kwargs)
-    transcription = tokenizer.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    return transcription.strip()
-# ---- MT ----
-def translate_sentence(text, target_nllb_code, fast=True, max_length=256):
-    """Translate a single sentence from English to target language."""
-    inputs = mt_tokenizer(text, return_tensors="pt", truncation=True).to(DEVICE)
-    tgt_lang_id = mt_tokenizer.convert_tokens_to_ids(target_nllb_code)
-    generate_kwargs = {
-        "forced_bos_token_id": tgt_lang_id,
-        "repetition_penalty": 1.5,
-        "no_repeat_ngram_size": 3,
-    }
-    if fast:
-        generate_kwargs.update({"max_length": 128, "num_beams": 1, "do_sample": False})
-    else:
-        generate_kwargs.update({"max_length": max_length, "num_beams": 4, "early_stopping": True})
-    with torch.no_grad():
-        output_ids = mt_model.generate(**inputs, **generate_kwargs)
-    return mt_tokenizer.decode(output_ids[0], skip_special_tokens=True)
-def translate_text(text, target_nllb_code, fast=True):
-    """Split and translate full text sentence-by-sentence."""
-    sentences = split_into_sentences(text)
-    if not sentences:
-        return "", [], []
-    translations = []
-    for s in sentences:
-        yo = translate_sentence(s, target_nllb_code, fast=fast)
-        translations.append(yo)
-    return ' '.join(translations), sentences, translations
-# ---- Video Processing ----
-def extract_audio_from_video(video_path, output_path, target_sr=16000):
-    """Extract audio track from video as 16kHz mono WAV."""
-    cmd = [
-        "ffmpeg", "-y", "-i", video_path,
-        "-vn", "-acodec", "pcm_s16le", "-ar", str(target_sr), "-ac", "1",
-        output_path,
-    ]
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    if result.returncode != 0:
-        raise RuntimeError(f"ffmpeg extraction failed: {result.stderr[:200]}")
-    return output_path
-def get_media_duration(path):
-    """Get duration in seconds."""
-    cmd = [
-        "ffprobe", "-v", "error",
-        "-show_entries", "format=duration",
-        "-of", "default=noprint_wrappers=1:nokey=1", path,
-    ]
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    if result.returncode != 0:
-        raise RuntimeError(f"ffprobe failed: {result.stderr[:200]}")
-    return float(result.stdout.strip())
-def stretch_audio_to_duration(input_path, output_path, target_duration_s):
-    """Stretch/compress audio to match target duration."""
-    current_duration = get_media_duration(input_path)
-    if current_duration <= 0:
-        raise RuntimeError("Invalid audio duration")
-    ratio = current_duration / target_duration_s
-    filters = []
-    remaining = ratio
-    while remaining > 2.0:
-        filters.append("atempo=2.0")
-        remaining /= 2.0
-    while remaining < 0.5:
-        filters.append("atempo=0.5")
-        remaining /= 0.5
-    filters.append(f"atempo={remaining:.4f}")
-    cmd = ["ffmpeg", "-y", "-i", input_path, "-filter:a", ",".join(filters), output_path]
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    if result.returncode != 0:
-        raise RuntimeError(f"ffmpeg tempo failed: {result.stderr[:200]}")
-    return output_path
-def mux_video_audio(video_path, audio_path, output_path, extend_video=False, target_duration=None):
-    """Combine video with new audio. Optionally extend video by freezing last frame."""
-    if extend_video and target_duration:
-        cmd = [
-            "ffmpeg", "-y", "-i", video_path, "-i", audio_path,
-            "-filter_complex", f"[0:v]tpad=stop_mode=clone:stop_duration={target_duration}[v]",
-            "-map", "[v]", "-map", "1:a:0",
-            "-c:v", "libx264", "-preset", "fast", "-c:a", "aac",
-            "-t", str(target_duration), output_path,
-        ]
-    else:
-        cmd = [
-            "ffmpeg", "-y", "-i", video_path, "-i", audio_path,
-            "-c:v", "copy", "-c:a", "aac",
-            "-map", "0:v:0", "-map", "1:a:0", "-shortest", output_path,
-        ]
-    result = subprocess.run(cmd, capture_output=True, text=True)
-    if result.returncode != 0:
-        raise RuntimeError(f"ffmpeg mux failed: {result.stderr[:200]}")
-    return output_path