"""Arabic ASR HTTP API.

Transcribes uploaded audio with a NeMo ASR model, then post-processes the
transcript: Arabic letter variants are normalised, spoken Arabic/English
number words are replaced by digits, and runs of single digits are merged
(e.g. "خمسه خمسه" -> "55").
"""

import logging
import os
import re
import tempfile

import librosa
import nemo.collections.asr as nemo_asr
import soundfile as sf
import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.responses import JSONResponse

logger = logging.getLogger(__name__)

# Default checkpoint location; override with the ASR_MODEL_PATH environment
# variable so the service is not tied to one developer machine.
DEFAULT_MODEL_PATH = (
    "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/"
    "NP_Detection_Nvidia_conformer/stt_ar_fastconformer_hybrid_large_pc_v1.0.nemo"
)

# Sample rate every upload is resampled to before transcription.
TARGET_SAMPLE_RATE = 16000

# Populated by load_model() on application startup.
asr_model = None

# ===== Arabic + English number mapping (expanded) =====
# Spoken form -> digit string. Keys may be written in their natural spelling;
# the lookup built below also indexes the normalize_arabic() spelling of each
# key, because transcripts are normalised before number replacement.
arabic_numbers = {
    # Zero .. ten
    "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0", "زينوا": "0",
    "واحد": "1", "واحدة": "1", "واحده": "1", "١": "1",
    "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
    "تلاتة": "3", "تلاته": "3", "ثلاثة": "3", "٣": "3", "ثلاث": "3",
    "تلات": "3",
    "اربعة": "4", "اربعه": "4", "أربعة": "4", "٤": "4",
    "خمسة": "5", "٥": "5", "خمسه": "5",
    "ستة": "6", "سته": "6", "ست": "6", "٦": "6",
    "سبعة": "7", "٧": "7", "سبعه": "7",
    "تمانية": "8", "تمانيه": "8", "تمنية": "8", "ثمانية": "8", "ثماني": "8",
    "ثمان": "8", "ثمانيّة": "8", "٨": "8",
    "تسعة": "9", "تسعه": "9", "٩": "9",
    "عشرة": "10", "١٠": "10", "عشره": "10",
    # Teens
    "حداشر": "11", "احد عشر": "11", "أحد عشر": "11", "احداشر": "11",
    "اتناشر": "12", "اثنا عشر": "12",
    "تلتاشر": "13", "تلاتاشر": "13", "ثلاثة عشر": "13", "تلاته عشر": "13",
    "اربعتاشر": "14", "أربعة عشر": "14", "اربعه عشر": "14",
    "خمستاشر": "15", "خمسطاشر": "15", "خمسة عشر": "15", "خمسه عشر": "15",
    "ستاشر": "16", "ستة عشر": "16", "سته عشر": "16",
    "سبعتاشر": "17", "سبعة عشر": "17", "سبعه عشر": "17",
    "طمنتاشر": "18", "ثمانية عشر": "18", "ثمانيه عشر": "18",
    "تسعتاشر": "19", "تسعة عشر": "19", "تسعه عشر": "19",
    # Tens
    "عشرين": "20", "٢٠": "20",
    "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
    "اربعين": "40", "أربعين": "40", "٤٠": "40",
    "خمسين": "50", "٥٠": "50",
    "ستين": "60", "٦٠": "60",
    "سبعين": "70", "٧٠": "70",
    "تمانين": "80", "ثمانين": "80", "ثامنين": "80", "٨٠": "80",
    "تسعين": "90", "٩٠": "90",
    # Hundreds
    "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
    "ميتين": "200", "مائتين": "200", "مئتين": "200",
    "تلاتمية": "300", "تلتمية": "300", "ثلاثمية": "300", "ثلاثمائة": "300",
    "اربعمية": "400", "أربعمية": "400", "أربعمائة": "400",
    "خمسمية": "500", "خمسمائة": "500",
    "ستمية": "600", "ستمائة": "600",
    "سبعمية": "700", "سبعمائة": "700",
    "تمانمية": "800", "ثمانمائة": "800",
    "تسعمية": "900", "تسعمائة": "900",
    # Thousands and above
    "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
    "ألفين": "2000", "الفين": "2000",
    "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
    "اربعة آلاف": "4000", "أربعة آلاف": "4000",
    "خمسة آلاف": "5000",
    "ستة آلاف": "6000",
    "سبعة آلاف": "7000",
    "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
    "تسعة آلاف": "9000",
    "عشرة آلاف": "10000",
    "مية ألف": "100000", "مائة ألف": "100000",
    "مليون": "1000000", "ملايين": "1000000",
    "مليار": "1000000000",
    # Repeated digits
    "واحد واحد": "11",
    "اثنين اثنين": "22", "اتنين اتنين": "22",
    "ثلاثة ثلاثة": "33", "تلاتة تلاتة": "33",
    "أربعة أربعة": "44", "اربعة اربعة": "44",
    "خمسة خمسة": "55",
    "ستة ستة": "66",
    "سبعة سبعة": "77",
    "ثمانية ثمانية": "88", "تمانية تمانية": "88",
    "تسعة تسعة": "99",
    # Digit-by-digit dictation
    "خمسة صفر": "50",
    "ثلاثة صفر صفر": "300", "تلاتة صفر صفر": "300",
    "واحد صفر": "10", "واحد صفر واحد": "101",
    "واحد اثنين": "12", "واحد اتنين": "12",
    "واحد ثلاثة": "13", "واحد تلاتة": "13",
    "واحد خمسة": "15",
    "عشرة عشرة": "1010",
    # English
    "zero": "0", "oh": "0", "one": "1",
    "two": "2", "to": "2", "too": "2",
    "three": "3", "four": "4", "for": "4",
    "five": "5", "fiv": "5", "six": "6", "seven": "7", "eight": "8",
    "nine": "9", "nite": "9", "ten": "10",
    "eleven": "11", "twelve": "12", "thirteen": "13", "fourteen": "14",
    "fifteen": "15", "sixteen": "16", "seventeen": "17", "eighteen": "18",
    "nineteen": "19",
    "twenty": "20", "thirty": "30", "forty": "40", "fifty": "50",
    "sixty": "60", "seventy": "70", "eighty": "80", "ninety": "90",
    "hundred": "100", "thousand": "1000", "million": "1000000",
    "double zero": "00", "double one": "11", "double two": "22",
    "double three": "33", "double four": "44", "double five": "55",
    "double six": "66", "double seven": "77", "double eight": "88",
    "double nine": "99",
}

# ===== Helpers =====

# Single-pass translation table equivalent to the normalisation rules:
# strip diacritics (U+0617-U+061A, U+064B-U+0652) and fold letter variants.
_ARABIC_TRANSLATION = {
    **dict.fromkeys(range(0x0617, 0x061B)),
    **dict.fromkeys(range(0x064B, 0x0653)),
    0x0625: "\u0627",  # alef with hamza below -> alef
    0x0623: "\u0627",  # alef with hamza above -> alef
    0x0622: "\u0627",  # alef with madda -> alef
    0x0649: "\u064A",  # alef maksura -> yeh
    0x0624: "\u0648",  # waw with hamza -> waw
    0x0626: "\u064A",  # yeh with hamza -> yeh
    0x0629: "\u0647",  # teh marbuta -> heh
}


def normalize_arabic(text: str) -> str:
    """Return *text* with diacritics removed and letter variants folded.

    Hamza-carrying alefs become plain alef, alef maksura and yeh-hamza become
    yeh, waw-hamza becomes waw, and teh marbuta becomes heh.
    """
    return text.translate(_ARABIC_TRANSLATION)


def _canonical_key(phrase: str) -> str:
    """Lower-case *phrase* and collapse its whitespace to single spaces."""
    return " ".join(phrase.lower().split())


def _build_number_lookup(mapping):
    """Index *mapping* by both raw and normalize_arabic() spellings.

    Raw spellings are inserted first so they win if a normalised spelling of
    one key happens to collide with the raw spelling of another.
    """
    lookup = {}
    for phrase, digits in mapping.items():
        lookup[_canonical_key(phrase)] = digits
    for phrase, digits in mapping.items():
        lookup.setdefault(_canonical_key(normalize_arabic(phrase)), digits)
    return lookup


def _is_digit_run(phrase: str, lookup) -> bool:
    """Tell whether *phrase* is just single-digit words said in a row.

    Such phrases (e.g. "واحد واحد" -> "11") yield the same result through
    per-word replacement followed by join_digit_sequences(), and matching
    them as a unit would stop longer runs from merging ("11 1", not "111").
    """
    words = phrase.split()
    if len(words) < 2:
        return False
    digits = [lookup.get(word, "") for word in words]
    return all(len(d) == 1 for d in digits) and "".join(digits) == lookup[phrase]


def _compile_number_pattern(lookup):
    """Compile one alternation over every phrase in *lookup*, longest first.

    Longest-first ordering makes multi-word numbers ("خمسه عشر") win over
    their first word ("خمسه"). Lookarounds are used for the word boundaries
    so the separating whitespace is not consumed and adjacent number words
    are all matched.
    """
    phrases = sorted(
        (p for p in lookup if not _is_digit_run(p, lookup)),
        key=len,
        reverse=True,
    )
    alternatives = "|".join(
        r"\s+".join(re.escape(word) for word in phrase.split())
        for phrase in phrases
    )
    return re.compile(rf"(?<!\S)(?:{alternatives})(?!\S)", re.IGNORECASE)


_NUMBER_LOOKUP = _build_number_lookup(arabic_numbers)
_NUMBER_PATTERN = _compile_number_pattern(_NUMBER_LOOKUP)


def replace_arabic_numbers(text: str) -> str:
    """Replace spoken Arabic/English number words in *text* with digits.

    Matching is case-insensitive and only applies to whole
    whitespace-delimited words or phrases. Whitespace in the result is
    collapsed to single spaces.
    """

    def _to_digits(match):
        spoken = _canonical_key(match.group(0))
        # Fall back to the matched text if case folding in the regex engine
        # matched something str.lower() does not map onto a known key.
        return _NUMBER_LOOKUP.get(spoken, match.group(0))

    return " ".join(_NUMBER_PATTERN.sub(_to_digits, text).split())


def join_digit_sequences(text: str) -> str:
    """Merge consecutive single-digit tokens into one number.

    "8 5 abc 1" becomes "85 abc 1". Multi-digit tokens are left untouched
    and end the current run.
    """
    merged, run = [], []
    for token in text.split():
        if token.isdigit() and len(token) == 1:
            run.append(token)
            continue
        if run:
            merged.append("".join(run))
            run = []
        merged.append(token)
    if run:
        merged.append("".join(run))
    return " ".join(merged)


def ensure_16k_wav(input_path, output_path):
    """Load *input_path* as mono 16 kHz audio and write it to *output_path*."""
    samples, _ = librosa.load(input_path, sr=TARGET_SAMPLE_RATE, mono=True)
    sf.write(output_path, samples, TARGET_SAMPLE_RATE)


def _hypothesis_text(result) -> str:
    """Extract the first transcript string from a transcribe() result.

    NOTE(review): the return shape of transcribe() appears to differ between
    NeMo releases and model families (plain strings, objects with a ``.text``
    attribute, or a tuple of hypothesis lists) — confirm against the
    installed version.
    """
    first = result[0]
    if isinstance(first, (list, tuple)):
        first = first[0]
    return first if isinstance(first, str) else first.text


def _transcribe_bytes(audio_bytes: bytes) -> str:
    """Transcribe raw audio bytes and return the digit-normalised text.

    Raises:
        HTTPException: with status 400 when *audio_bytes* is empty.
    """
    if not audio_bytes:
        raise HTTPException(status_code=400, detail="Empty audio upload")

    # Everything lives in one temporary directory so both the upload and the
    # resampled copy are removed even when decoding or inference fails.
    with tempfile.TemporaryDirectory() as workdir:
        upload_path = os.path.join(workdir, "upload.wav")
        resampled_path = os.path.join(workdir, "upload_16k.wav")
        with open(upload_path, "wb") as upload_file:
            upload_file.write(audio_bytes)
        ensure_16k_wav(upload_path, resampled_path)
        result = asr_model.transcribe([resampled_path])

    raw_text = _hypothesis_text(result)
    logger.debug("Raw transcription: %s", raw_text)
    normalized = normalize_arabic(raw_text)
    return join_digit_sequences(replace_arabic_numbers(normalized))


# ===== FastAPI app =====
app = FastAPI(
    title="Arabic ASR API",
    description="ASR API with NeMo and Arabic/English digit conversion",
)


@app.on_event("startup")
def load_model():
    """Restore the ASR model once, when the application starts."""
    global asr_model
    model_path = os.environ.get("ASR_MODEL_PATH", DEFAULT_MODEL_PATH)
    # NOTE(review): the default checkpoint name suggests a hybrid model while
    # it is restored through EncDecCTCModel — confirm this is intended.
    asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)


# NOTE(review): resampling and inference are synchronous calls made from
# async endpoints, so the event loop is blocked while a request is processed.
# Consider moving them to a worker thread once the model's thread-safety has
# been confirmed.
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file (multipart field ``file``)."""
    cleaned_text = _transcribe_bytes(await file.read())
    return JSONResponse(content={"transcription": cleaned_text})


@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe audio sent as raw bytes (multipart field ``audio_bytes``)."""
    cleaned_text = _transcribe_bytes(audio_bytes)
    return JSONResponse(content={"transcription": cleaned_text})


if __name__ == "__main__":
    # reload=True is only honoured when the app is given as an import string;
    # with an app object uvicorn warns and exits, so it is not passed here.
    uvicorn.run(app, host="0.0.0.0", port=8000)