# Tsedee's picture
# Restore full ASR model - all fixes applied
# 6c09762 verified
import os
import torch
import spaces
import gradio as gr
import librosa
import traceback
from transformers import WhisperForConditionalGeneration, WhisperProcessor
from peft import PeftModel
import peft.tuners.tuners_utils as _peft_utils
# Override PEFT's private `_torch_supports_distributed` flag before any adapter is loaded.
# NOTE(review): presumably a workaround for the hosting environment — the reason is not
# visible in this file; confirm before removing.
_peft_utils._torch_supports_distributed = False
# --- Config ---
# Hub repo of the LoRA adapter that is applied on top of the base model.
ADAPTER_ID = "Tsedee/whisper-large-v2-mn-commercial-v1"
# Hub repo of the base Whisper checkpoint; the processor is loaded from here as well.
BASE_ID = "Tsedee/whisper-large-v2-mn-monsub"
# Sample rate (Hz) audio is resampled to on load and that is reported to the processor.
SAMPLE_RATE = 16000
# Maximum number of seconds read from a microphone recording.
MAX_MIC_SEC = 5 * 60
# Maximum number of seconds read from an uploaded file.
MAX_FILE_SEC = 25 * 60
# Length, in seconds, of each chunk that is decoded separately.
CHUNK_SEC = 29
# Access token read from the environment (None when unset); passed when loading the adapter.
HF_TOKEN = os.getenv("HF_TOKEN")
# --- Load on CPU ---
print("[1/4] Loading base model on CPU (float32)...")
base_model = WhisperForConditionalGeneration.from_pretrained(
BASE_ID, torch_dtype=torch.float32, low_cpu_mem_usage=True,
)
print("[2/4] Applying LoRA adapter...")
model = PeftModel.from_pretrained(base_model, ADAPTER_ID, token=HF_TOKEN)
print("[3/4] Merging LoRA weights...")
# Fold the adapter into the base weights and drop the PEFT wrapper; `model` is rebound
# to the merged result.
model = model.merge_and_unload()
model.eval()
model.generation_config.suppress_tokens = None # Fix: suppress_tokens=[] → IndexError
print("[4/4] Loading processor...")
processor = WhisperProcessor.from_pretrained(BASE_ID)
print("Model ready on CPU!")
def run_inference(waveform):
    """Transcribe a waveform by decoding it in consecutive fixed-length chunks.

    The audio is cut into back-to-back windows of ``CHUNK_SEC`` seconds (no
    overlap). Each window is turned into input features by the module-level
    ``processor`` and decoded by the module-level ``model``.

    Args:
        waveform: Sequence of audio samples at ``SAMPLE_RATE`` Hz. The callers
            in this file pass the array returned by ``librosa.load``.

    Returns:
        The non-empty chunk transcripts joined by single spaces, or a fixed
        fallback message when no chunk produced any text (which includes an
        empty ``waveform``).
    """
    chunk_samples = CHUNK_SEC * SAMPLE_RATE
    # Follow wherever the model currently lives instead of hard-coding
    # cuda/float16, so the features always match the weights the caller set up.
    device, dtype = model.device, model.dtype
    texts = []
    with torch.no_grad():
        for start in range(0, len(waveform), chunk_samples):
            chunk = waveform[start : start + chunk_samples]
            inputs = processor(chunk, sampling_rate=SAMPLE_RATE, return_tensors="pt")
            feats = inputs.input_features.to(device=device, dtype=dtype)
            ids = model.generate(feats, max_new_tokens=444)
            text = processor.batch_decode(ids, skip_special_tokens=True)[0].strip()
            if text:
                texts.append(text)
    return " ".join(texts) if texts else "Текст таних боломжгүй байна."
def _release_gpu():
    """Move the shared model back to CPU/float32 and drop cached CUDA memory."""
    model.to("cpu")
    model.float()
    torch.cuda.empty_cache()


def _transcribe(audio, max_seconds, missing_message):
    """Shared implementation behind ``transcribe_mic`` and ``transcribe_file``.

    Args:
        audio: Path of the audio file to transcribe, or ``None``.
        max_seconds: Upper bound on how many seconds are read from the file;
            anything beyond it is not loaded.
        missing_message: Text returned when ``audio`` is ``None``.

    Returns:
        The transcript followed by the loaded duration as ``m:ss``; or
        ``missing_message``; or, on failure, an error message that includes
        the traceback.
    """
    if audio is None:
        # Nothing was moved to the GPU yet, so there is nothing to undo.
        return missing_message
    try:
        model.half()
        model.to("cuda")
        waveform, _ = librosa.load(audio, sr=SAMPLE_RATE, mono=True, duration=max_seconds)
        duration = len(waveform) / SAMPLE_RATE
        text = run_inference(waveform)
        m, s = divmod(int(duration), 60)
        return f"{text}\n\n⏱ {m}:{s:02d}"
    except Exception as e:
        return f"Алдаа: {type(e).__name__}: {e}\n\n{traceback.format_exc()}"
    finally:
        # Single cleanup point: runs after success, after a handled error, and
        # also when something other than `Exception` unwinds through here.
        _release_gpu()


# NOTE(review): `spaces.GPU(duration=...)` presumably requests a GPU for up to
# that many seconds per call — confirm against the `spaces` package docs.
@spaces.GPU(duration=120)
def transcribe_mic(audio):
    """Transcribe a microphone recording, reading at most ``MAX_MIC_SEC`` seconds.

    Args:
        audio: File path of the recording, or ``None`` when nothing was recorded.

    Returns:
        Transcript plus duration, a "no audio" notice, or an error message.
    """
    return _transcribe(audio, MAX_MIC_SEC, "Аудио оруулаагүй байна.")


@spaces.GPU(duration=300)
def transcribe_file(audio):
    """Transcribe an uploaded audio file, reading at most ``MAX_FILE_SEC`` seconds.

    Args:
        audio: File path of the upload, or ``None`` when nothing was uploaded.

    Returns:
        Transcript plus duration, a "no file" notice, or an error message.
    """
    return _transcribe(audio, MAX_FILE_SEC, "Файл оруулаагүй байна.")
# --- UI ---
# Stylesheet handed to gr.Blocks; class names here are referenced by the HTML
# fragments and `elem_classes` below.
CSS = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
* { font-family: 'Inter', sans-serif !important; }
.gradio-container { max-width: 860px !important; margin: 0 auto !important; }
.hero-banner {
background: linear-gradient(135deg, #1e3a5f 0%, #0f4c8a 50%, #1a6b3c 100%);
border-radius: 20px; padding: 36px 24px 28px;
text-align: center; margin-bottom: 16px;
box-shadow: 0 8px 32px rgba(15,76,138,0.3);
}
.hero-banner h1 { font-size: 2rem !important; font-weight: 700 !important; color: #fff !important; margin: 0 0 8px 0 !important; }
.hero-banner p { color: #94c6f7 !important; font-size: 0.9rem !important; margin: 0 !important; }
.badge-row { display: flex; justify-content: center; gap: 10px; margin-top: 14px; flex-wrap: wrap; }
.badge { background: rgba(255,255,255,0.12); border: 1px solid rgba(255,255,255,0.2); border-radius: 20px; padding: 4px 14px; font-size: 0.78rem; color: #e2f0ff; }
.badge-green { background: rgba(22,163,74,0.25); border-color: rgba(22,163,74,0.5); color: #86efac; }
.badge-gpu { background: rgba(168,85,247,0.25); border-color: rgba(168,85,247,0.5); color: #d8b4fe; }
.section-label { font-size: 0.72rem !important; font-weight: 600 !important; color: #64748b !important; text-transform: uppercase; letter-spacing: 1px; margin-bottom: 6px !important; }
.transcribe-btn { background: linear-gradient(135deg, #3b82f6, #1d4ed8) !important; border: none !important; border-radius: 12px !important; font-size: 1rem !important; font-weight: 600 !important; color: white !important; box-shadow: 0 4px 14px rgba(59,130,246,0.4) !important; width: 100% !important; margin-top: 8px !important; }
.footer-note { text-align: center; font-size: 0.78rem; color: #64748b; margin-top: 20px; padding-top: 16px; border-top: 1px solid #e2e8f0; }
"""

# Static HTML fragments, kept apart from the layout code that places them.
_HERO_HTML = """
<div class="hero-banner">
<h1>🎙️ Монгол Яриа Таних</h1>
<p>Whisper LoRA загварт суурилсан Монгол хэлний автомат яриа таних систем</p>
<div class="badge-row">
<span class="badge">⚡ Whisper Large-v2</span>
<span class="badge badge-green">✅ WER 6%</span>
<span class="badge">🇲🇳 Монгол хэл</span>
<span class="badge badge-gpu">🚀 A10G GPU</span>
<span class="badge">📁 25 мин хүртэл</span>
</div>
</div>
"""
_FOOTER_HTML = """
<div class="footer-note">
Зөвхөн Монгол хэл дэмжинэ &nbsp;·&nbsp;
Загвар: <b>Tsedee/whisper-large-v2-mn-commercial-v1</b> &nbsp;·&nbsp;
A10G GPU · float16
</div>
"""
_RESULT_LABEL_HTML = "<p class='section-label' style='margin-top:14px'>Таниулсан текст</p>"
_RESULT_PLACEHOLDER = "Энд таниулсан текст гарна..."


def _build_tab(tab_title, hint, audio_label, source, result_lines, handler):
    """Create one transcription tab and wire its button to ``handler``.

    Must be called while a ``gr.Tabs()`` context is open. Components are
    created in the order: hint label, audio input, button, result label,
    result textbox.

    Returns:
        The ``(audio, button, textbox)`` components of the new tab.
    """
    with gr.Tab(tab_title):
        gr.HTML(f"<p class='section-label'>{hint}</p>")
        audio = gr.Audio(
            label=audio_label,
            sources=[source],
            type="filepath",
            show_download_button=False,
        )
        button = gr.Button("🔍 Таних", elem_classes="transcribe-btn", variant="primary")
        gr.HTML(_RESULT_LABEL_HTML)
        result = gr.Textbox(
            label="",
            lines=result_lines,
            show_copy_button=True,
            placeholder=_RESULT_PLACEHOLDER,
        )
        button.click(fn=handler, inputs=audio, outputs=result)
    return audio, button, result


with gr.Blocks(css=CSS, title="Монгол Яриа Таних", theme=gr.themes.Soft()) as demo:
    gr.HTML(_HERO_HTML)
    with gr.Tabs():
        mic_input, mic_btn, mic_out = _build_tab(
            "🎤 Микрофон",
            "Дуу бичих — дээд тал 5 минут",
            "Микрофон",
            "microphone",
            6,
            transcribe_mic,
        )
        file_input, file_btn, file_out = _build_tab(
            "📂 Файл Upload",
            "Аудио файл — дээд тал 25 минут · WAV / MP3 / M4A / FLAC",
            "Аудио файл",
            "upload",
            8,
            transcribe_file,
        )
    gr.HTML(_FOOTER_HTML)
# Script entry point: enable the request queue (at most 5 waiting), then serve.
if __name__ == "__main__":
    demo.queue(max_size=5)
    demo.launch()