| # from fastapi import FastAPI, File, UploadFile | |
| # from fastapi.responses import JSONResponse | |
| # import uvicorn | |
| # import tempfile | |
| # import nemo.collections.asr as nemo_asr | |
| # import re | |
| # import os | |
| # import librosa | |
| # import soundfile as sf | |
| # # ===== Arabic number mapping (expanded) ===== | |
| # arabic_numbers = { | |
| # "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0", | |
| # "واحد": "1", "واحدة": "1", "١": "1", | |
| # "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2", | |
| # "تلاتة": "3", "ثلاثة": "3", "٣": "3","ثلاث": "3","تلات": "3", | |
| # "اربعة": "4", "أربعة": "4", "٤": "4", | |
| # "خمسة": "5", "٥": "5","خمسه": "5", | |
| # "ستة": "6", "٦": "6", | |
| # "سبعة": "7", "٧": "7","سبعه": "7", | |
| # "تمانية": "8", "ثمانية": "8", "٨": "8", | |
| # "تسعة": "9", "٩": "9", | |
| # "عشرة": "10", "١٠": "10","عشره": "10", | |
| # "حداشر": "11", "احد عشر": "11", "احداشر": "11", | |
| # "اتناشر": "12", "اثنا عشر": "12", | |
| # "تلتاشر": "13", "ثلاثة عشر": "13", | |
| # "اربعتاشر": "14", "أربعة عشر": "14", | |
| # "خمستاشر": "15", "خمسة عشر": "15", | |
| # "ستاشر": "16", "ستة عشر": "16", | |
| # "سبعتاشر": "17", "سبعة عشر": "17", | |
| # "طمنتاشر": "18", "ثمانية عشر": "18", | |
| # "تسعتاشر": "19", "تسعة عشر": "19", | |
| # "عشرين": "20", "٢٠": "20", | |
| # "تلاتين": "30", "ثلاثين": "30", "٣٠": "30", | |
| # "اربعين": "40", "أربعين": "40", "٤٠": "40", | |
| # "خمسين": "50", "٥٠": "50", | |
| # "ستين": "60", "٦٠": "60", | |
| # "سبعين": "70", "٧٠": "70", | |
| # "تمانين": "80", "ثمانين": "80", "٨٠": "80", | |
| # "تسعين": "90", "٩٠": "90", | |
| # "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100", | |
| # "ميتين": "200", "مائتين": "200", | |
| # "تلاتمية": "300", "ثلاثمائة": "300", | |
| # "اربعمية": "400", "أربعمائة": "400", | |
| # "خمسمية": "500", "خمسمائة": "500", | |
| # "ستمية": "600", "ستمائة": "600", | |
| # "سبعمية": "700", "سبعمائة": "700", | |
| # "تمانمية": "800", "ثمانمائة": "800", | |
| # "تسعمية": "900", "تسعمائة": "900", | |
| # "ألف": "1000", "الف": "1000", "١٠٠٠": "1000", | |
| # "ألفين": "2000", "الفين": "2000", | |
| # "تلات تلاف": "3000", "ثلاثة آلاف": "3000", | |
| # "اربعة آلاف": "4000", "أربعة آلاف": "4000", | |
| # "خمسة آلاف": "5000", | |
| # "ستة آلاف": "6000", | |
| # "سبعة آلاف": "7000", | |
| # "تمانية آلاف": "8000", "ثمانية آلاف": "8000", | |
| # "تسعة آلاف": "9000", | |
| # "عشرة آلاف": "10000", | |
| # "مية ألف": "100000", "مائة ألف": "100000", | |
| # "مليون": "1000000", "ملايين": "1000000", | |
| # "مليار": "1000000000" | |
| # } | |
| # # ===== Helpers ===== | |
| # def normalize_arabic(text: str) -> str: | |
| # diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]') | |
| # text = re.sub(diacritics, '', text) | |
| # text = re.sub(r'[إأآا]', 'ا', text) | |
| # text = re.sub(r'ى', 'ي', text) | |
| # text = re.sub(r'ؤ', 'و', text) | |
| # text = re.sub(r'ئ', 'ي', text) | |
| # text = re.sub(r'ة', 'ه', text) | |
| # return text | |
| # def replace_arabic_numbers(text: str) -> str: | |
| # for word, digit in arabic_numbers.items(): | |
| # text = re.sub(fr"(?:^|\s){word}(?:$|\s)", f" {digit} ", text) | |
| # return " ".join(text.split()) | |
| # def join_digit_sequences(text: str) -> str: | |
| # tokens = text.split() | |
| # out, buffer = [], [] | |
| # for tok in tokens: | |
| # if tok.isdigit() and len(tok) == 1: | |
| # buffer.append(tok) | |
| # else: | |
| # if buffer: | |
| # out.append("".join(buffer)) | |
| # buffer = [] | |
| # out.append(tok) | |
| # if buffer: | |
| # out.append("".join(buffer)) | |
| # return " ".join(out) | |
| # def ensure_16k_wav(input_path, output_path): | |
| # y, sr = librosa.load(input_path, sr=16000, mono=True) | |
| # sf.write(output_path, y, 16000) | |
| # # ===== FastAPI app ===== | |
| # app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion") | |
| # @app.on_event("startup") | |
| # def load_model(): | |
| # global asr_model | |
| # model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/asr-egyptian-nemo-v2.0.nemo" | |
| # asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path) | |
| # @app.post("/transcribe") | |
| # async def transcribe_audio(file: UploadFile = File(...)): | |
| # with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp: | |
| # tmp.write(await file.read()) | |
| # tmp_path = tmp.name | |
| # # Resample to 16kHz | |
| # resampled_path = tmp_path.replace(".wav", "_16k.wav") | |
| # ensure_16k_wav(tmp_path, resampled_path) | |
| # try: | |
| # result = asr_model.transcribe([resampled_path]) | |
| # raw_text = result[0].text | |
| # raw_text = normalize_arabic(raw_text) | |
| # cleaned_text = replace_arabic_numbers(raw_text) | |
| # cleaned_text = join_digit_sequences(cleaned_text) | |
| # return JSONResponse(content={"transcription": cleaned_text}) | |
| # finally: | |
| # os.remove(tmp_path) | |
| # if os.path.exists(resampled_path): | |
| # os.remove(resampled_path) | |
| # @app.post("/transcribe-bytes") | |
| # async def transcribe_audio_bytes(audio_bytes: bytes = File(...)): | |
| # with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp: | |
| # tmp.write(audio_bytes) | |
| # tmp_path = tmp.name | |
| # resampled_path = tmp_path.replace(".wav", "_16k.wav") | |
| # ensure_16k_wav(tmp_path, resampled_path) | |
| # try: | |
| # result = asr_model.transcribe([resampled_path]) | |
| # raw_text = result[0].text | |
| # raw_text = normalize_arabic(raw_text) | |
| # cleaned_text = replace_arabic_numbers(raw_text) | |
| # cleaned_text = join_digit_sequences(cleaned_text) | |
| # return JSONResponse(content={"transcription": cleaned_text}) | |
| # finally: | |
| # os.remove(tmp_path) | |
| # if os.path.exists(resampled_path): | |
| # os.remove(resampled_path) | |
| # if __name__ == "__main__": | |
| # uvicorn.run(app, host="0.0.0.0", port=8000, reload=True) | |
| from fastapi import FastAPI, File, UploadFile | |
| from fastapi.responses import JSONResponse | |
| import uvicorn | |
| import tempfile | |
| import nemo.collections.asr as nemo_asr | |
| import re | |
| import os | |
| import librosa | |
| import soundfile as sf | |
| from omegaconf import OmegaConf | |
# ===== Spoken Arabic digit mapping (0-9) =====
# Maps spoken digit words (standard and colloquial spellings) and the
# Arabic-Indic digit characters to ASCII digit strings.
# normalize_arabic() rewrites "ة" to "ه" and "أ" to "ا" before lookup, so each
# word is listed both in its raw spelling and in its normalized spelling;
# without the normalized spelling a word is never matched after normalization.
arabic_numbers = {
    "صفر": "0", "زيرو": "0", "زيو": "0", "زير": "0", "٠": "0",
    "واحد": "1", "واحدة": "1", "واحده": "1", "١": "1",
    "اثنين": "2", "اثنان": "2", "اتنين": "2", "٢": "2",
    "ثلاثة": "3", "ثلاث": "3", "تلاتة": "3", "تلات": "3",
    "ثلاثه": "3", "تلاته": "3", "٣": "3",
    "أربعة": "4", "اربعة": "4", "أربعه": "4", "اربعه": "4", "٤": "4",
    "خمسة": "5", "خمسه": "5", "٥": "5",
    "ستة": "6", "ست": "6", "سته": "6", "٦": "6",
    "سبعة": "7", "سبعه": "7", "٧": "7",
    "ثمانية": "8", "تمانية": "8", "تمنية": "8", "ثمان": "8",
    "ثمانيه": "8", "تمانيه": "8", "تمنيه": "8", "٨": "8",
    "تسعة": "9", "تسعه": "9", "٩": "9",
}
| # ===== Helpers ===== | |
def normalize_arabic(text: str) -> str:
    """Return *text* with diacritics removed and letter variants unified.

    The substitutions are applied in order:

    1. delete the marks in U+0617-U+061A and U+064B-U+0652;
    2. fold the alef variants onto bare alef;
    3. alef maqsura -> yeh;
    4. waw with hamza -> waw;
    5. yeh with hamza -> yeh;
    6. teh marbuta -> heh.
    """
    substitutions = (
        (r'[\u0617-\u061A\u064B-\u0652]', ''),
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'ؤ', 'و'),
        (r'ئ', 'ي'),
        (r'ة', 'ه'),
    )
    normalized = text
    for pattern, replacement in substitutions:
        normalized = re.sub(pattern, replacement, normalized)
    return normalized
def replace_arabic_numbers(text: str) -> str:
    """Replace each whole-word key of ``arabic_numbers`` in *text* with its digit.

    Every key is matched between ``\\b`` word boundaries, so a key that is
    only part of a longer word is left alone. Whitespace and all other text
    are returned unchanged.
    """
    converted = text
    for spoken, digit in arabic_numbers.items():
        # re.escape keeps the key literal even if it ever contains regex syntax.
        whole_word = re.compile(r'\b' + re.escape(spoken) + r'\b')
        converted = whole_word.sub(digit, converted)
    return converted
def join_digit_sequences(text: str) -> str:
    """Glue runs of adjacent all-digit tokens together into one token.

    *text* is split on whitespace; any token for which ``str.isdigit()`` is
    true is appended to the preceding token when that one was all-digit too.
    Tokens are re-joined with single spaces, so runs of whitespace collapse.
    """
    merged = []
    previous_was_digits = False
    for word in text.split():
        is_digits = word.isdigit()
        if is_digits and previous_was_digits:
            # Continue the current run instead of starting a new token.
            merged[-1] += word
        else:
            merged.append(word)
        previous_was_digits = is_digits
    return " ".join(merged)
def ensure_16k_wav(input_path, output_path):
    """Load the audio at *input_path* as 16 kHz mono and write it to *output_path*.

    Args:
        input_path: Path of the audio file to read with ``librosa.load``.
        output_path: Path the resampled audio is written to with ``sf.write``.
    """
    target_rate = 16000
    # librosa also returns the sample rate; it equals target_rate, so it is dropped.
    samples, _ = librosa.load(input_path, sr=target_rate, mono=True)
    sf.write(output_path, samples, target_rate)
| # ===== FastAPI app ===== | |
# Application object that uvicorn serves (see the __main__ guard at the bottom).
app = FastAPI(
    title="Arabic ASR API",
    description="ASR API with NeMo and Arabic/English digit conversion",
)
@app.on_event("startup")
def load_model():
    """Restore the fine-tuned checkpoint into the global ``asr_model`` and set decoding options.

    Registered as a FastAPI startup hook: nothing else in this file calls the
    function, so without the registration ``asr_model`` would never be defined.

    Steps:
        1. Restore ``EncDecHybridRNNTCTCModel`` from ``model_path``.
        2. Print the greedy-decoding options stored in the model config.
        3. Override decoding options on ``asr_model.cfg.decoding`` and apply
           them with ``change_decoding_strategy``.
    """
    global asr_model
    model_path = "output_finetuned/finetuned_model_best.nemo"
    asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)

    # Print the greedy-decoding options stored in the checkpoint's config.
    print("Available greedy parameters:")
    print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))

    # ===== STEP 3: Configure for LITERAL transcription =====
    print("🔍 Configuring greedy decoding for literal output...")

    # Struct mode is switched off so keys can be assigned below, then restored.
    OmegaConf.set_struct(asr_model.cfg.decoding, False)
    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)
    decoding_cfg = asr_model.cfg.decoding

    # NOTE(review): the messages in this function talk about greedy decoding,
    # but the strategy selected here is "maes" - confirm which one is intended.
    decoding_cfg.strategy = "maes"

    # Best-effort override: a failure here is reported and startup continues.
    try:
        decoding_cfg.greedy.max_symbols_per_step = 300
        print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
        print("⚠ Could not set max_symbols_per_step")

    decoding_cfg.greedy.max_symbols = 500
    decoding_cfg.greedy.loop_labels = True
    decoding_cfg.greedy.preserve_alignments = True
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
    decoding_cfg.temperature = 1.3
    decoding_cfg.beam.beam_size = 64
    decoding_cfg.beam.softmax_temperature = 1.3
    decoding_cfg.beam.search_type = "beam"
    print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
    print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
    print(f"✓ temperature: {decoding_cfg.temperature}")

    # Re-enable struct mode
    OmegaConf.set_struct(asr_model.cfg.decoding, True)
    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)

    # Apply configuration
    asr_model.change_decoding_strategy(decoding_cfg)
def _extract_transcript(result) -> str:
    """Return the first transcript contained in a ``transcribe`` result.

    Accepts the shapes the two endpoints used to special-case separately:
    a tuple or list (possibly nested) whose first element leads to the
    transcript, a plain string, or an object exposing a ``text`` attribute.
    An empty container yields an empty string; anything else falls back to
    ``str()``.
    """
    item = result
    while isinstance(item, (list, tuple)):
        if not item:
            return ""
        item = item[0]
    if isinstance(item, str):
        return item
    text = getattr(item, "text", None)
    return text if isinstance(text, str) else str(item)


def _transcribe_upload(audio_bytes: bytes) -> str:
    """Transcribe raw audio bytes and return the digit-normalized text."""
    # Fallback for the case where the startup hook has not populated the model.
    if "asr_model" not in globals():
        load_model()

    # Both files live in a private directory that is removed on exit, even
    # when resampling or transcription raises.
    with tempfile.TemporaryDirectory() as workdir:
        upload_path = os.path.join(workdir, "upload.wav")
        resampled_path = os.path.join(workdir, "upload_16k.wav")
        with open(upload_path, "wb") as upload_file:
            upload_file.write(audio_bytes)
        ensure_16k_wav(upload_path, resampled_path)
        result = asr_model.transcribe([resampled_path])

    raw_text = normalize_arabic(_extract_transcript(result))
    return join_digit_sequences(replace_arabic_numbers(raw_text))


# NOTE(review): transcription runs synchronously inside these async handlers;
# confirm that serving one request at a time is acceptable.
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    Args:
        file: The uploaded audio file (multipart form field ``file``).

    Returns:
        ``JSONResponse`` with a single key, ``"transcription"``.
    """
    cleaned_text = _transcribe_upload(await file.read())
    print("📝 Cleaned Transcription:", cleaned_text)  # for debug
    return JSONResponse(content={"transcription": cleaned_text})


@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe audio received as raw bytes.

    Args:
        audio_bytes: The audio content (multipart form field ``audio_bytes``).

    Returns:
        ``JSONResponse`` with a single key, ``"transcription"``.
    """
    cleaned_text = _transcribe_upload(audio_bytes)
    return JSONResponse(content={"transcription": cleaned_text})
if __name__ == "__main__":
    # reload=True is only honoured when the app is passed as an import string
    # ("module:app"); given an app object uvicorn logs a warning and exits,
    # so the flag is omitted here.
    # NOTE(review): host 0.0.0.0 listens on every interface - confirm intended.
    uvicorn.run(app, host="0.0.0.0", port=8000)