# whisper_asr / app.py
import os, re, tempfile, traceback
import numpy as np
import pandas as pd
import torch
import soundfile as sf
import gradio as gr
from faster_whisper import WhisperModel
# =========================
# General settings
# =========================
FORCE_WHISPER_NAME = "large-v3"
FORCE_COMPUTE_TYPE = "int8"
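# Decoding options for WhisperModel.transcribe(): word-level timestamps (needed for
# the confidence table), VAD-based silence filtering, and beam search (beam_size=5)
# at temperature 0.0 for reproducible output.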
ASR_OPTS = dict(
word_timestamps=True,
vad_filter=True,
vad_parameters={"min_silence_duration_ms": 200},
beam_size=5,
best_of=5,
temperature=0.0,
)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[INIT] Using device: {DEVICE}", flush=True)
# =========================
# Load the Whisper model
# =========================
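# Construct the model lazily, once, and reuse the same instance for every request.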
_WHISPER = None
def load_whisper_model():
global _WHISPER
if _WHISPER is None:
_WHISPER = WhisperModel(FORCE_WHISPER_NAME,
device=DEVICE,
compute_type=FORCE_COMPUTE_TYPE)
print(f"[LOAD] Whisper model: {FORCE_WHISPER_NAME} ({FORCE_COMPUTE_TYPE})", flush=True)
return _WHISPER
whisper = load_whisper_model()
# =========================
# Helper functions
# =========================
def ensure_audio_path(audio):
"""يتأكد إنو الإدخال الصوتي محفوظ مؤقتًا كملف wav"""
if isinstance(audio, str):
if not os.path.exists(audio):
raise FileNotFoundError(f"Audio file not found: {audio}")
return audio
    if isinstance(audio, tuple) and len(audio) == 2:
        # Gradio's numpy audio format is (sample_rate, data)
        sr, data = audio
        if isinstance(data, np.ndarray):
            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            sf.write(tmp.name, data, sr)
            return tmp.name
raise ValueError("Unsupported audio input format")
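# Remove Arabic diacritics and tatweel plus punctuation, then collapse whitespace.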
def normalize_ar_orth(text: str) -> str:
text = re.sub(r"[ًٌٍَُِّْـ]", "", text)
text = re.sub(r"[“”\"',:؛؟.!()\[\]{}،\-–—_]", " ", text)
text = re.sub(r"\s+", " ", text).strip()
return text
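# Trim leading/trailing non-word, non-Arabic characters from a token, then normalize it.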
def clean_ar_token(t: str) -> str:
t = t.strip()
t = re.sub(r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', t)
t = normalize_ar_orth(t)
return t
def extract_word_conf_table(segments):
"""يبني جدول الكلمات مع الزمن والثقة"""
rows = []
for seg in segments:
for w in (seg.words or []):
rows.append({
"seg_start": float(seg.start),
"seg_end": float(seg.end),
"word_start": float(w.start),
"word_end": float(w.end),
"word": clean_ar_token(w.word),
"prob": float(w.probability),
})
return pd.DataFrame(rows)
# =========================
# Main transcription function
# =========================
def transcribe_audio(audio):
    try:
        audio_path = ensure_audio_path(audio)
        segments, info = whisper.transcribe(audio_path, **ASR_OPTS)
        # faster-whisper returns segments as a generator; materialize it so it can be
        # consumed once for the full text and again for the word table.
        segments = list(segments)
        full_text = " ".join(seg.text.strip() for seg in segments)
        df_words = extract_word_conf_table(segments)
        if df_words.empty:
            # Return an empty table instead of text so the Dataframe output does not break.
            df_words = pd.DataFrame(columns=["seg_start", "seg_end", "word_start", "word_end", "word", "prob"])
        return full_text, df_words
except Exception as e:
return f"❌ حدث خطأ أثناء المعالجة:\n{str(e)}\n\n{traceback.format_exc()}", pd.DataFrame()
# =========================
# Gradio interface
# =========================
demo = gr.Interface(
fn=transcribe_audio,
inputs=gr.Audio(sources=["upload", "microphone"], type="filepath", label="🎙️ ملف الصوت أو التسجيل"),
outputs=[
gr.Textbox(label="النص الناتج"),
gr.Dataframe(label="تفاصيل الكلمات (زمن البداية، النهاية، الثقة)")
],
title="Arabic Whisper ASR",
description="حوّل الكلام العربي إلى نص باستخدام Faster-Whisper (large-v3).",
theme="soft",
allow_flagging="never"
)
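# share=True requests a temporary public link when the app is launched locally;
# a hosted Space already serves the interface without it.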
if __name__ == "__main__":
demo.launch(share=True, show_error=True)