# Hugging Face Space app.
# (The "Spaces: Sleeping" status lines here were residue scraped from the
# Spaces web UI, not part of the program; preserved as this comment.)
import os
import re
import tempfile
import traceback

import gradio as gr
import numpy as np
import pandas as pd
import soundfile as sf
import torch
from faster_whisper import WhisperModel
# =========================
# General settings
# =========================
FORCE_WHISPER_NAME = "large-v3"  # faster-whisper checkpoint name
FORCE_COMPUTE_TYPE = "int8"      # quantized inference to cut memory use

# Options forwarded verbatim to WhisperModel.transcribe().
ASR_OPTS = dict(
    word_timestamps=True,  # required to build the per-word confidence table
    vad_filter=True,       # drop non-speech via voice-activity detection
    vad_parameters={"min_silence_duration_ms": 200},
    beam_size=5,
    best_of=5,
    temperature=0.0,       # deterministic decoding (no sampling)
)

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"[INIT] Using device: {DEVICE}", flush=True)
# =========================
# Load the Whisper model
# =========================
_WHISPER = None


def load_whisper_model():
    """Lazily build the module-wide WhisperModel singleton and return it."""
    global _WHISPER
    if _WHISPER is not None:
        return _WHISPER
    _WHISPER = WhisperModel(
        FORCE_WHISPER_NAME,
        device=DEVICE,
        compute_type=FORCE_COMPUTE_TYPE,
    )
    print(f"[LOAD] Whisper model: {FORCE_WHISPER_NAME} ({FORCE_COMPUTE_TYPE})", flush=True)
    return _WHISPER


whisper = load_whisper_model()
# =========================
# Helper functions
# =========================
def ensure_audio_path(audio):
    """Return a filesystem path for the given audio input.

    Accepts either a path string (validated for existence) or a 2-tuple of
    numpy samples and sample rate — in either ``(data, sr)`` or Gradio's
    ``(sr, data)`` order — which is written to a temporary .wav file.

    Raises:
        FileNotFoundError: path string that does not exist.
        ValueError: any other input shape.
    """
    if isinstance(audio, str):
        if not os.path.exists(audio):
            raise FileNotFoundError(f"Audio file not found: {audio}")
        return audio
    if isinstance(audio, tuple) and len(audio) == 2:
        data, sr = audio
        # Gradio's type="numpy" convention is (sample_rate, data); accept
        # that order too by swapping when the first element is the rate.
        if isinstance(sr, np.ndarray) and isinstance(data, (int, np.integer)):
            data, sr = sr, data
        if isinstance(data, np.ndarray):
            # Close the handle before soundfile writes: avoids leaking the
            # fd, and a second open of a still-open temp file fails on
            # Windows. delete=False keeps the file for the caller; the OS
            # temp dir handles eventual cleanup.
            tmp = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
            tmp.close()
            sf.write(tmp.name, data, sr)
            return tmp.name
    raise ValueError("Unsupported audio input format")
def normalize_ar_orth(text: str) -> str:
    """Normalize Arabic orthography: strip diacritics/tatweel, replace
    punctuation with spaces, and collapse whitespace runs."""
    without_diacritics = re.sub(r"[ًٌٍَُِّْـ]", "", text)
    spaced = re.sub(r"[“”\"',:؛؟.!()\[\]{}،\-–—_]", " ", without_diacritics)
    return re.sub(r"\s+", " ", spaced).strip()
def clean_ar_token(t: str) -> str:
    """Trim a token: drop surrounding whitespace and any leading/trailing
    characters that are neither word characters nor Arabic letters, then
    apply normalize_ar_orth to the remainder."""
    trimmed = re.sub(r'^[^\w\u0600-\u06FF]+|[^\w\u0600-\u06FF]+$', '', t.strip())
    return normalize_ar_orth(trimmed)
def extract_word_conf_table(segments):
    """Build a table of per-word timing and confidence.

    One row per word, carrying the enclosing segment's start/end, the
    word's start/end, the cleaned token text, and the model's word
    probability.
    """
    records = [
        {
            "seg_start": float(segment.start),
            "seg_end": float(segment.end),
            "word_start": float(word.start),
            "word_end": float(word.end),
            "word": clean_ar_token(word.word),
            "prob": float(word.probability),
        }
        for segment in segments
        for word in (segment.words or [])
    ]
    return pd.DataFrame(records)
# =========================
# Main entry point
# =========================
def transcribe_audio(audio):
    """Transcribe an audio input and return (full_text, word_table).

    On any failure, returns an error-message string plus an empty
    DataFrame instead of raising, so the Gradio UI never crashes.
    """
    try:
        audio_path = ensure_audio_path(audio)
        segments, info = whisper.transcribe(audio_path, **ASR_OPTS)
        # faster-whisper returns a lazy generator; materialize it once so it
        # can be iterated for both the text and the word table. (Previously
        # the join below exhausted the generator, so the word table was
        # always empty.)
        segments = list(segments)
        # Segment texts typically carry a leading space; strip before
        # joining to avoid doubled spaces.
        full_text = " ".join(seg.text.strip() for seg in segments)
        df_words = extract_word_conf_table(segments)
        if df_words.empty:
            # Return an empty, well-typed table rather than raising.
            df_words = pd.DataFrame(
                columns=["seg_start", "seg_end", "word_start", "word_end", "word", "prob"]
            )
        return full_text, df_words
    except Exception as e:
        return f"❌ حدث خطأ أثناء المعالجة:\n{str(e)}\n\n{traceback.format_exc()}", pd.DataFrame()
# =========================
# Gradio interface
# =========================
# NOTE(review): `allow_flagging` is the pre-4.x Gradio keyword; newer
# releases rename it to `flagging_mode` — confirm against the pinned version.
demo = gr.Interface(
    fn=transcribe_audio,
    # type="filepath" means transcribe_audio receives a path string.
    inputs=gr.Audio(sources=["upload", "microphone"], type="filepath", label="🎙️ ملف الصوت أو التسجيل"),
    outputs=[
        gr.Textbox(label="النص الناتج"),
        gr.Dataframe(label="تفاصيل الكلمات (زمن البداية، النهاية، الثقة)")
    ],
    title="Arabic Whisper ASR",
    description="حوّل الكلام العربي إلى نص باستخدام Faster-Whisper (large-v3).",
    theme="soft",
    allow_flagging="never"
)

if __name__ == "__main__":
    # share=True requests a public tunnel URL; show_error surfaces
    # tracebacks in the browser UI.
    demo.launch(share=True, show_error=True)