"""Gradio app for Mongolian speech recognition with a LoRA-tuned Whisper model.

At import time the base Whisper checkpoint is loaded on the CPU in float32,
the LoRA adapter is applied and merged into it, and the processor is loaded.
Each request then moves the merged model to the GPU in half precision,
transcribes the audio in fixed-length chunks, and moves the model back.
"""

import os
import traceback

import torch
import spaces
import gradio as gr
import librosa
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from peft import PeftModel
import peft.tuners.tuners_utils as _peft_utils

# Override PEFT's module-level distributed-support flag before any adapter
# is loaded below.
_peft_utils._torch_supports_distributed = False

# --- Config ---
ADAPTER_ID = "Tsedee/whisper-large-v2-mn-commercial-v1"
BASE_ID = "Tsedee/whisper-large-v2-mn-monsub"
SAMPLE_RATE = 16000
MAX_MIC_SEC = 5 * 60    # cap for microphone recordings, in seconds
MAX_FILE_SEC = 25 * 60  # cap for uploaded files, in seconds
CHUNK_SEC = 29          # length of each slice passed to the model, in seconds
HF_TOKEN = os.getenv("HF_TOKEN")

# --- Load on CPU ---
print("[1/4] Loading base model on CPU (float32)...")
base_model = WhisperForConditionalGeneration.from_pretrained(
    BASE_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)

print("[2/4] Applying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, ADAPTER_ID, token=HF_TOKEN)

print("[3/4] Merging LoRA weights...")
model = model.merge_and_unload()
model.eval()
model.generation_config.suppress_tokens = None  # Fix: suppress_tokens=[] → IndexError

print("[4/4] Loading processor...")
processor = WhisperProcessor.from_pretrained(BASE_ID)
print("Model ready on CPU!")


def run_inference(waveform):
    """Transcribe a mono waveform sampled at ``SAMPLE_RATE``.

    The waveform is cut into consecutive ``CHUNK_SEC``-second slices, each
    slice is transcribed independently, and the non-empty results are joined
    with single spaces.

    Args:
        waveform: Sequence of audio samples supporting ``len()`` and slicing.

    Returns:
        The joined transcript, or a fixed "could not recognise" message when
        no slice produced any text (including an empty waveform).
    """
    chunk_samples = CHUNK_SEC * SAMPLE_RATE
    texts = []
    for start in range(0, len(waveform), chunk_samples):
        chunk = waveform[start:start + chunk_samples]
        inputs = processor(chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt")
        # Follow the model's current device/dtype rather than hard-coding
        # CUDA + fp16, so the features always match the weights.
        feats = inputs.input_features.to(device=model.device, dtype=model.dtype)
        with torch.no_grad():
            ids = model.generate(feats, max_new_tokens=444)
        text = processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
        if text:
            texts.append(text)
    return " ".join(texts) if texts else "Текст таних боломжгүй байна."


def _release_gpu():
    """Move the model back to CPU float32 and free cached CUDA memory."""
    # NOTE: the fp16 cast performed before inference is in place, so this
    # restores the dtype only, not the precision discarded by ``half()``.
    model.to("cpu")
    model.float()
    torch.cuda.empty_cache()


def _transcribe(audio, max_seconds, missing_message):
    """Shared implementation behind the microphone and file handlers.

    Args:
        audio: Path to the audio file supplied by Gradio, or ``None``.
        max_seconds: Only this many leading seconds of audio are loaded.
        missing_message: Text returned when ``audio`` is ``None``.

    Returns:
        The transcript followed by the processed duration as ``m:ss``, or an
        error message containing the exception and its traceback.
    """
    if audio is None:
        return missing_message
    try:
        # Decode first: an unreadable file then fails before the model is
        # moved to the GPU at all.
        waveform, _ = librosa.load(
            audio, sr=SAMPLE_RATE, mono=True, duration=max_seconds
        )
        duration = len(waveform) / SAMPLE_RATE
        try:
            model.half()
            model.to("cuda")
            text = run_inference(waveform)
        finally:
            # Runs exactly once on success and on failure alike.
            _release_gpu()
        minutes, seconds = divmod(int(duration), 60)
        return f"{text}\n\n⏱ {minutes}:{seconds:02d}"
    except Exception as e:
        # Top-level UI boundary: report the failure in the output box.
        return f"Алдаа: {type(e).__name__}: {e}\n\n{traceback.format_exc()}"


@spaces.GPU(duration=120)
def transcribe_mic(audio):
    """Transcribe a microphone recording, reading at most ``MAX_MIC_SEC`` seconds."""
    return _transcribe(audio, MAX_MIC_SEC, "Аудио оруулаагүй байна.")


@spaces.GPU(duration=300)
def transcribe_file(audio):
    """Transcribe an uploaded file, reading at most ``MAX_FILE_SEC`` seconds."""
    return _transcribe(audio, MAX_FILE_SEC, "Файл оруулаагүй байна.")


# --- UI ---
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
* { font-family: 'Inter', sans-serif !important; }
.gradio-container { max-width: 860px !important; margin: 0 auto !important; }
.hero-banner { background: linear-gradient(135deg, #1e3a5f 0%, #0f4c8a 50%, #1a6b3c 100%); border-radius: 20px; padding: 36px 24px 28px; text-align: center; margin-bottom: 16px; box-shadow: 0 8px 32px rgba(15,76,138,0.3); }
.hero-banner h1 { font-size: 2rem !important; font-weight: 700 !important; color: #fff !important; margin: 0 0 8px 0 !important; }
.hero-banner p { color: #94c6f7 !important; font-size: 0.9rem !important; margin: 0 !important; }
.badge-row { display: flex; justify-content: center; gap: 10px; margin-top: 14px; flex-wrap: wrap; }
.badge { background: rgba(255,255,255,0.12); border: 1px solid rgba(255,255,255,0.2); border-radius: 20px; padding: 4px 14px; font-size: 0.78rem; color: #e2f0ff; }
.badge-green { background: rgba(22,163,74,0.25); border-color: rgba(22,163,74,0.5); color: #86efac; }
.badge-gpu { background: rgba(168,85,247,0.25); border-color: rgba(168,85,247,0.5); color: #d8b4fe; }
.section-label { font-size: 0.72rem !important; font-weight: 600 !important; color: #64748b !important; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 6px !important; }
.transcribe-btn { background: linear-gradient(135deg, #3b82f6, #1d4ed8) !important; border: none !important; border-radius: 12px !important; font-size: 1rem !important; font-weight: 600 !important; color: white !important; box-shadow: 0 4px 14px rgba(59,130,246,0.4) !important; width: 100% !important; margin-top: 8px !important; }
.footer-note { text-align: center; font-size: 0.78rem; color: #64748b; margin-top: 20px; padding-top: 16px; border-top: 1px solid #e2e8f0; }
"""

# NOTE(review): the gr.HTML() strings below contain only plain text, while the
# stylesheet defines .hero-banner, .badge*, .section-label and .footer-note.
# The wrapping markup appears to have been lost; the visible text is kept
# unchanged here. Restore the elements/classes from the original file.
with gr.Blocks(css=CSS, title="Монгол Яриа Таних", theme=gr.themes.Soft()) as demo:
    gr.HTML("""

🎙️ Монгол Яриа Таних

Whisper LoRA загварт суурилсан Монгол хэлний автомат яриа таних систем

⚡ Whisper Large-v2 ✅ WER 6% 🇲🇳 Монгол хэл 🚀 A10G GPU 📁 25 мин хүртэл
""")

    with gr.Tabs():
        with gr.Tab("🎤 Микрофон"):
            gr.HTML("\n\nДуу бичих — дээд тал 5 минут\n\n")
            mic_input = gr.Audio(
                label="Микрофон",
                sources=["microphone"],
                type="filepath",
                show_download_button=False,
            )
            mic_btn = gr.Button(
                "🔍 Таних", elem_classes="transcribe-btn", variant="primary"
            )
            gr.HTML("\n\nТаниулсан текст\n\n")
            mic_out = gr.Textbox(
                label="",
                lines=6,
                show_copy_button=True,
                placeholder="Энд таниулсан текст гарна...",
            )
            mic_btn.click(fn=transcribe_mic, inputs=mic_input, outputs=mic_out)

        with gr.Tab("📂 Файл Upload"):
            gr.HTML(
                "\n\nАудио файл — дээд тал 25 минут · WAV / MP3 / M4A / FLAC\n\n"
            )
            file_input = gr.Audio(
                label="Аудио файл",
                sources=["upload"],
                type="filepath",
                show_download_button=False,
            )
            file_btn = gr.Button(
                "🔍 Таних", elem_classes="transcribe-btn", variant="primary"
            )
            gr.HTML("\n\nТаниулсан текст\n\n")
            file_out = gr.Textbox(
                label="",
                lines=8,
                show_copy_button=True,
                placeholder="Энд таниулсан текст гарна...",
            )
            file_btn.click(fn=transcribe_file, inputs=file_input, outputs=file_out)

    gr.HTML(""" """)

if __name__ == "__main__":
    demo.queue(max_size=5).launch()