Spaces:
Running on Zero
Running on Zero
| import os | |
| import torch | |
| import spaces | |
| import gradio as gr | |
| import librosa | |
| import traceback | |
| from transformers import WhisperForConditionalGeneration, WhisperProcessor | |
| from peft import PeftModel | |
| import peft.tuners.tuners_utils as _peft_utils | |
# NOTE(review): presumably this makes PEFT skip its torch.distributed code
# paths -- confirm against the installed peft version.
_peft_utils._torch_supports_distributed = False

# --- Config ---
ADAPTER_ID = "Tsedee/whisper-large-v2-mn-commercial-v1"  # LoRA adapter repo
BASE_ID = "Tsedee/whisper-large-v2-mn-monsub"  # base checkpoint + processor repo
SAMPLE_RATE = 16000  # Hz; audio is resampled to this rate when loaded
MAX_MIC_SEC = 5 * 60  # cap on microphone recordings, in seconds
MAX_FILE_SEC = 25 * 60  # cap on uploaded files, in seconds
CHUNK_SEC = 29  # length of each window handed to the model, in seconds
HF_TOKEN = os.environ.get("HF_TOKEN")  # None when the variable is unset

# --- Load on CPU ---
print("[1/4] Loading base model on CPU (float32)...")
base_model = WhisperForConditionalGeneration.from_pretrained(
    BASE_ID,
    torch_dtype=torch.float32,
    low_cpu_mem_usage=True,
)

print("[2/4] Applying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, ADAPTER_ID, token=HF_TOKEN)

print("[3/4] Merging LoRA weights...")
model = model.merge_and_unload()
model.eval()
# Author's note carried over: an empty suppress_tokens list raised an
# IndexError during generation, so the setting is cleared instead.
model.generation_config.suppress_tokens = None

print("[4/4] Loading processor...")
processor = WhisperProcessor.from_pretrained(BASE_ID)
print("Model ready on CPU!")
def run_inference(waveform):
    """Transcribe a waveform by decoding it in consecutive fixed-size windows.

    The waveform is cut into windows of ``CHUNK_SEC * SAMPLE_RATE`` samples
    (the last window may be shorter). Each window is converted to input
    features by ``processor``, decoded with ``model.generate`` and the
    non-empty transcripts are joined with single spaces.

    Args:
        waveform: Sliceable sequence of audio samples supporting ``len()``;
            the callers in this file pass the array returned by
            ``librosa.load`` at ``SAMPLE_RATE``.

    Returns:
        The joined transcript, or a fixed fallback message when no window
        produced any text (including an empty waveform).
    """
    chunk_samples = CHUNK_SEC * SAMPLE_RATE
    # Follow wherever the caller has placed the model instead of assuming
    # CUDA/float16, so the features always match the model's weights.
    device = model.device
    dtype = model.dtype

    texts = []
    for start in range(0, len(waveform), chunk_samples):
        chunk = waveform[start:start + chunk_samples]
        inputs = processor(chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt")
        feats = inputs.input_features.to(device=device, dtype=dtype)
        with torch.no_grad():
            # NOTE(review): 444 presumably leaves room for prompt tokens
            # under Whisper's decoder length limit -- confirm.
            ids = model.generate(feats, max_new_tokens=444)
        # One window is decoded per call, so only the first entry exists.
        text = processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
        if text:
            texts.append(text)

    return " ".join(texts) if texts else "Текст таних боломжгүй байна."
def _restore_model_to_cpu():
    """Return the shared model to CPU/float32 and free cached CUDA memory."""
    model.to("cpu")
    model.float()
    torch.cuda.empty_cache()


def _transcribe_path(audio, max_seconds):
    """Transcribe the audio file at ``audio``, truncated to ``max_seconds``.

    Args:
        audio: Path of the audio file to decode.
        max_seconds: Maximum duration, in seconds, read from the file.

    Returns:
        The transcript followed by the processed duration as ``m:ss``, or
        an error message containing the exception and its traceback. This
        function does not raise for ``Exception`` subclasses; failures are
        rendered as text because the result is shown directly in the UI.
    """
    # NOTE(review): `spaces` is imported by this file but no `@spaces.GPU`
    # decorator is visible on the handlers -- confirm whether the deployment
    # target needs one around the CUDA section below.
    try:
        # Decode the audio first so the model is not moved to the GPU for a
        # file that cannot be read.
        waveform, _ = librosa.load(
            audio, sr=SAMPLE_RATE, mono=True, duration=max_seconds
        )
        try:
            model.half()
            model.to("cuda")
            text = run_inference(waveform)
        finally:
            # Runs on success and on failure, so the model is restored
            # exactly once and never left in float16 or on the GPU.
            _restore_model_to_cpu()
    except Exception as e:
        return f"Алдаа: {type(e).__name__}: {e}\n\n{traceback.format_exc()}"

    duration = len(waveform) / SAMPLE_RATE
    m, s = divmod(int(duration), 60)
    return f"{text}\n\n⏱ {m}:{s:02d}"


def transcribe_mic(audio):
    """Transcribe a microphone recording, limited to ``MAX_MIC_SEC`` seconds.

    Args:
        audio: Path of the recorded file, or ``None`` when nothing was
            recorded.

    Returns:
        The transcript with its duration, a notice when ``audio`` is
        ``None``, or an error message when transcription fails.
    """
    if audio is None:
        return "Аудио оруулаагүй байна."
    return _transcribe_path(audio, MAX_MIC_SEC)


def transcribe_file(audio):
    """Transcribe an uploaded audio file, limited to ``MAX_FILE_SEC`` seconds.

    Args:
        audio: Path of the uploaded file, or ``None`` when nothing was
            uploaded.

    Returns:
        The transcript with its duration, a notice when ``audio`` is
        ``None``, or an error message when transcription fails.
    """
    if audio is None:
        return "Файл оруулаагүй байна."
    return _transcribe_path(audio, MAX_FILE_SEC)
# --- UI ---
# Stylesheet passed to gr.Blocks; class names here are referenced by the
# HTML snippets and by elem_classes on the buttons below.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
* { font-family: 'Inter', sans-serif !important; }
.gradio-container { max-width: 860px !important; margin: 0 auto !important; }
.hero-banner {
background: linear-gradient(135deg, #1e3a5f 0%, #0f4c8a 50%, #1a6b3c 100%);
border-radius: 20px; padding: 36px 24px 28px;
text-align: center; margin-bottom: 16px;
box-shadow: 0 8px 32px rgba(15,76,138,0.3);
}
.hero-banner h1 { font-size: 2rem !important; font-weight: 700 !important; color: #fff !important; margin: 0 0 8px 0 !important; }
.hero-banner p { color: #94c6f7 !important; font-size: 0.9rem !important; margin: 0 !important; }
.badge-row { display: flex; justify-content: center; gap: 10px; margin-top: 14px; flex-wrap: wrap; }
.badge { background: rgba(255,255,255,0.12); border: 1px solid rgba(255,255,255,0.2); border-radius: 20px; padding: 4px 14px; font-size: 0.78rem; color: #e2f0ff; }
.badge-green { background: rgba(22,163,74,0.25); border-color: rgba(22,163,74,0.5); color: #86efac; }
.badge-gpu { background: rgba(168,85,247,0.25); border-color: rgba(168,85,247,0.5); color: #d8b4fe; }
.section-label { font-size: 0.72rem !important; font-weight: 600 !important; color: #64748b !important; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 6px !important; }
.transcribe-btn { background: linear-gradient(135deg, #3b82f6, #1d4ed8) !important; border: none !important; border-radius: 12px !important; font-size: 1rem !important; font-weight: 600 !important; color: white !important; box-shadow: 0 4px 14px rgba(59,130,246,0.4) !important; width: 100% !important; margin-top: 8px !important; }
.footer-note { text-align: center; font-size: 0.78rem; color: #64748b; margin-top: 20px; padding-top: 16px; border-top: 1px solid #e2e8f0; }
"""

# Banner shown above the tabs.
_HERO_HTML = """
<div class="hero-banner">
<h1>🎙️ Монгол Яриа Таних</h1>
<p>Whisper LoRA загварт суурилсан Монгол хэлний автомат яриа таних систем</p>
<div class="badge-row">
<span class="badge">⚡ Whisper Large-v2</span>
<span class="badge badge-green">✅ WER 6%</span>
<span class="badge">🇲🇳 Монгол хэл</span>
<span class="badge badge-gpu">🚀 A10G GPU</span>
<span class="badge">📁 25 мин хүртэл</span>
</div>
</div>
"""

# Note shown below the tabs.
_FOOTER_HTML = """
<div class="footer-note">
Зөвхөн Монгол хэл дэмжинэ ·
Загвар: <b>Tsedee/whisper-large-v2-mn-commercial-v1</b> ·
A10G GPU · float16
</div>
"""

# Heading and placeholder shared by the two result text boxes.
_RESULT_LABEL_HTML = "<p class='section-label' style='margin-top:14px'>Таниулсан текст</p>"
_RESULT_PLACEHOLDER = "Энд таниулсан текст гарна..."

with gr.Blocks(
    css=CSS,
    title="Монгол Яриа Таних",
    theme=gr.themes.Soft(),
) as demo:
    gr.HTML(_HERO_HTML)

    with gr.Tabs():
        # Tab 1: record from the microphone and transcribe.
        with gr.Tab("🎤 Микрофон"):
            gr.HTML("<p class='section-label'>Дуу бичих — дээд тал 5 минут</p>")
            mic_input = gr.Audio(
                label="Микрофон",
                sources=["microphone"],
                type="filepath",
                show_download_button=False,
            )
            mic_btn = gr.Button(
                "🔍 Таних",
                elem_classes="transcribe-btn",
                variant="primary",
            )
            gr.HTML(_RESULT_LABEL_HTML)
            mic_out = gr.Textbox(
                label="",
                lines=6,
                show_copy_button=True,
                placeholder=_RESULT_PLACEHOLDER,
            )
            mic_btn.click(fn=transcribe_mic, inputs=mic_input, outputs=mic_out)

        # Tab 2: upload an audio file and transcribe.
        with gr.Tab("📂 Файл Upload"):
            gr.HTML("<p class='section-label'>Аудио файл — дээд тал 25 минут · WAV / MP3 / M4A / FLAC</p>")
            file_input = gr.Audio(
                label="Аудио файл",
                sources=["upload"],
                type="filepath",
                show_download_button=False,
            )
            file_btn = gr.Button(
                "🔍 Таних",
                elem_classes="transcribe-btn",
                variant="primary",
            )
            gr.HTML(_RESULT_LABEL_HTML)
            file_out = gr.Textbox(
                label="",
                lines=8,
                show_copy_button=True,
                placeholder=_RESULT_PLACEHOLDER,
            )
            file_btn.click(fn=transcribe_file, inputs=file_input, outputs=file_out)

    gr.HTML(_FOOTER_HTML)

if __name__ == "__main__":
    # Queue holds at most five pending requests.
    queued_demo = demo.queue(max_size=5)
    queued_demo.launch()