"""Arabic speech-to-text HTTP API.

Audio is transcribed with a NeMo ASR checkpoint.  Two endpoints are exposed:

* ``POST /transcribe`` -- multipart upload; returns the raw transcript plus
  an English translation and the numbers parsed out of that translation.
* ``POST /transcribe-bytes`` -- raw bytes; returns the transcript with
  Arabic number words/digits rewritten as ASCII digits.
"""

import logging
import os
import re
import tempfile
from typing import List, Optional

import nemo.collections.asr as nemo_asr
import torch
import uvicorn
from deep_translator import GoogleTranslator
from fastapi import FastAPI, File, Form, UploadFile
from fastapi.responses import JSONResponse
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from word2number import w2n

logger = logging.getLogger(__name__)

# ===== Arabic number mapping (expanded) =====
# Maps spoken/written Arabic number forms (MSA, Egyptian dialect spellings,
# Arabic-Indic digits, and mixed "digit + word" forms) to ASCII digit strings.
# Entries whose key equals its value are no-ops; they are kept so the table
# stays identical to the original one.
arabic_numbers = {
    # Basic digits
    "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0", "زر": "0",
    "زروا": "0", "زرا": "0", "زيره ": "0", "زرو ": "0",
    "واحد": "1", "واحدة": "1", "١": "1",
    "اتنين": "2", "اثنين": "2", "إثنين": "2", "اثنان": "2", "إثنان": "2", "٢": "2",
    "تلاتة": "3", "ثلاثة": "3", "٣": "3",
    "اربعة": "4", "أربعة": "4", "٤": "4",
    "خمسة": "5", "٥": "5",
    "ستة": "6", "٦": "6",
    "سبعة": "7", "٧": "7",
    "تمانية": "8", "ثمانية": "8", "٨": "8",
    "تسعة": "9", "٩": "9",

    # Teens
    "عشرة": "10", "١٠": "10",
    # 11
    "احد عشر": "11", "واحد عشر": "11", "حداشر": "11",
    "١ عشر": "11", "1 عشر": "11", "١عشر": "11", "1عشر": "11", "١١": "11", "11": "11",
    # 12
    "اثنا عشر": "12", "اثني عشر": "12", "اتناشر": "12",
    "٢ عشر": "12", "2 عشر": "12", "٢عشر": "12", "2عشر": "12", "١٢": "12", "12": "12",
    # 13
    "ثلاثة عشر": "13", "تلاتة عشر": "13", "تلتاشر": "13",
    "٣ عشر": "13", "3 عشر": "13", "٣عشر": "13", "3عشر": "13", "١٣": "13", "13": "13",
    # 14
    "أربعة عشر": "14", "اربعة عشر": "14", "اربعتاشر": "14",
    "٤ عشر": "14", "4 عشر": "14", "٤عشر": "14", "4عشر": "14", "١٤": "14", "14": "14",
    # 15
    "خمسة عشر": "15", "خمسه عشر": "15", "خمستاشر": "15",
    "٥ عشر": "15", "5 عشر": "15", "٥عشر": "15", "5عشر": "15", "١٥": "15", "15": "15",
    # 16
    "ستة عشر": "16", "سته عشر": "16", "ستاشر": "16",
    "٦ عشر": "16", "6 عشر": "16", "٦عشر": "16", "6عشر": "16", "١٦": "16", "16": "16",
    # 17
    "سبعة عشر": "17", "سبعه عشر": "17", "سبعتاشر": "17",
    "٧ عشر": "17", "7 عشر": "17", "٧عشر": "17", "7عشر": "17", "١٧": "17", "17": "17",
    # 18
    "ثمانية عشر": "18", "تمانية عشر": "18", "طمنتاشر": "18",
    "٨ عشر": "18", "8 عشر": "18", "٨عشر": "18", "8عشر": "18", "١٨": "18", "18": "18",
    # 19
    "تسعة عشر": "19", "تسعه عشر": "19", "تسعتاشر": "19",
    "٩ عشر": "19", "9 عشر": "19", "٩عشر": "19", "9عشر": "19", "١٩": "19", "19": "19",

    # Tens
    "عشرين": "20", "٢٠": "20",
    "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
    "اربعين": "40", "أربعين": "40", "٤٠": "40",
    "خمسين": "50", "٥٠": "50",
    "ستين": "60", "٦٠": "60",
    "سبعين": "70", "٧٠": "70",
    "تمانين": "80", "ثمانين": "80", "٨٠": "80", "تمانون": "80", "ثمانون": "80",
    "تسعين": "90", "٩٠": "90",

    # Hundreds
    "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
    "ميتين": "200", "مائتين": "200",
    "تلاتمية": "300", "ثلاثمائة": "300",
    "اربعمية": "400", "أربعمائة": "400",
    "خمسمية": "500", "خمسمائة": "500",
    "ستمية": "600", "ستمائة": "600",
    "سبعمية": "700", "سبعمائة": "700",
    "تمانمية": "800", "ثمانمائة": "800",
    "تسعمية": "900", "تسعمائة": "900",

    # Thousands
    "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
    "ألفين": "2000", "الفين": "2000",
    "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
    "اربعة آلاف": "4000", "أربعة آلاف": "4000",
    "خمسة آلاف": "5000",
    "ستة آلاف": "6000",
    "سبعة آلاف": "7000",
    "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
    "تسعة آلاف": "9000",

    # Large numbers
    "عشرة آلاف": "10000",
    "مية ألف": "100000", "مائة ألف": "100000",
    "مليون": "1000000", "١٠٠٠٠٠٠": "1000000", "ملايين": "1000000",
    "مليار": "1000000000", "١٠٠٠٠٠٠٠٠٠": "1000000000",

    # ===== Compound tens (Arabic + digit forms) =====
    "واحد وعشرون": "21", "1 وعشرون": "21",
    "اثنان وعشرون": "22", "٢ وعشرون": "22",
    "ثلاثة وعشرون": "23", "٣ وعشرون": "23",
    "اربعة وعشرون": "24", "٤ وعشرون": "24",
    "خمسة وعشرون": "25", "٥ وعشرون": "25",
    "ستة وعشرون": "26", "٦ وعشرون": "26",
    "سبعة وعشرون": "27", "٧ وعشرون": "27",
    "تمانية وعشرون": "28", "ثمانية وعشرون": "28", "٨ وعشرون": "28",
    "تسعة وعشرون": "29", "٩ وعشرون": "29",
    "ثمانية وثمانون": "88", "8 وثمانون": "88",
    "اثنان وثمانون": "82", "٢ وثمانون": "82",
    "خمسة وستون": "65", "5 وستون": "65",
    "ثلاثة وثلاثون": "33", "٣٣": "33", "33": "33",
    "أربعة وأربعون": "44", "٤٤": "44", "44": "44",
    "خمسة وخمسون": "55", "٥٥": "55", "55": "55",
    "ستة وستون": "66", "٦٦": "66", "66": "66",
    "سبعة وسبعون": "77", "٧٧": "77", "77": "77",
    "٨٨": "88", "88": "88",
    "تسعة وتسعون": "99", "٩٩": "99", "99": "99",
}


def _build_number_matcher(mapping):
    """Return ``(compiled_pattern, lookup)`` for a phrase -> digits mapping.

    Keys are stripped (a few table entries carry a trailing space that would
    otherwise swallow the separator after the match), identity entries are
    skipped, and alternatives are ordered longest-first so a multi-word
    phrase wins over the shorter words it contains.
    """
    lookup = {}
    for phrase, digits in mapping.items():
        phrase = phrase.strip()
        if phrase and phrase != digits:
            lookup.setdefault(phrase, digits)
    ordered = sorted(lookup, key=len, reverse=True)
    pattern = re.compile(r"\b(?:" + "|".join(re.escape(p) for p in ordered) + r")\b")
    return pattern, lookup


_NUMBER_PATTERN, _NUMBER_LOOKUP = _build_number_matcher(arabic_numbers)


def replace_arabic_numbers(text: str) -> str:
    """Rewrite Arabic number words and Arabic-Indic digits as ASCII digits.

    Parameters:
    - text (str): Text to normalise (typically an ASR transcript).

    Returns:
    - str: ``text`` with every phrase found in ``arabic_numbers`` replaced by
      its digit string.  Text without a known phrase is returned unchanged.
    """
    previous = None
    # Repeat until stable so that forms produced by one pass (for example a
    # digit followed by "عشر") can still collapse in the next one.  The loop
    # terminates: every replacement swaps a phrase containing at least one
    # non-ASCII-digit character for a pure ASCII digit string.
    while text != previous:
        previous = text
        text = _NUMBER_PATTERN.sub(lambda m: _NUMBER_LOOKUP[m.group(0)], text)
    return text


# ===== FastAPI app =====
app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion")

# Default checkpoint location; override with the NEMO_MODEL_PATH environment
# variable instead of editing this file.
DEFAULT_NEMO_MODEL_PATH = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/stt_ar_fastconformer_hybrid_large_pc_v1.0.nemo"

# Populated once by load_model() at application startup.
asr_model = None
model = None
tokenizer = None
device = None


# Load model once on startup
@app.on_event("startup")
def load_model():
    """Load the ASR checkpoint and the translation model into module globals.

    Sets ``asr_model``, ``tokenizer``, ``model`` and ``device``.  The seq2seq
    model is moved to CUDA when available, otherwise it stays on the CPU.
    """
    global asr_model
    global model
    global tokenizer
    global device

    model_path = os.getenv("NEMO_MODEL_PATH", DEFAULT_NEMO_MODEL_PATH)
    asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)

    # NOTE(review): the checkpoint name suggests dialect -> Arabic, while
    # translate_egyptian_to_english() forces English output; a commented-out
    # line in the original named "alaasayed_ai/Egyptian_Arabic_to_English".
    # Confirm which checkpoint is intended.
    model_translator_name = "ukaAi/Egyptian_dialect_to_arabic"
    tokenizer = AutoTokenizer.from_pretrained(model_translator_name)
    model = AutoModelForSeq2SeqLM.from_pretrained(model_translator_name)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)


def translate_egyptian_to_english(text: str) -> str:
    """
    Translates Egyptian Arabic text to English with the loaded seq2seq model.

    The source language is set to ``arz_Arab`` and decoding is forced to start
    with the ``eng_Latn`` language token.  Input is truncated to 512 tokens.
    Requires ``load_model()`` to have run.

    Parameters:
    - text (str): The input Egyptian Arabic text

    Returns:
    - str: The translated English text
    """
    tokenizer.src_lang = "arz_Arab"
    forced_bos_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Tensors must live on the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    translated = model.generate(
        **inputs,
        forced_bos_token_id=forced_bos_token_id,
        max_length=512,
        num_beams=4,
        early_stopping=True
    )

    return tokenizer.decode(translated[0], skip_special_tokens=True)


# Cardinal and tens
WORD_TO_NUM = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4,
    "five": 5, "six": 6, "seven": 7, "eight": 8, "nine": 9,
    "ten": 10, "eleven": 11, "twelve": 12, "thirteen": 13,
    "fourteen": 14, "fifteen": 15, "sixteen": 16, "seventeen": 17,
    "eighteen": 18, "nineteen": 19,
    "twenty": 20, "thirty": 30, "forty": 40, "fifty": 50,
    "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90
}

# Ordinals
ORDINAL_TO_NUM = {
    "first": 1, "second": 2, "third": 3, "fourth": 4, "fifth": 5,
    "sixth": 6, "seventh": 7, "eighth": 8, "ninth": 9, "tenth": 10,
    "eleventh": 11, "twelfth": 12, "thirteenth": 13, "fourteenth": 14,
    "fifteenth": 15, "sixteenth": 16, "seventeenth": 17,
    "eighteenth": 18, "nineteenth": 19,
    "twentieth": 20, "thirtieth": 30, "fortieth": 40, "fiftieth": 50,
    "sixtieth": 60, "seventieth": 70, "eightieth": 80, "ninetieth": 90
}

# Parts accepted inside a hyphenated compound; ordinals are included so that
# "twenty-first" yields 21 instead of silently dropping "first".
_COMPOUND_PARTS = {**WORD_TO_NUM, **ORDINAL_TO_NUM}


def normalize_token(token: str) -> Optional[int]:
    """Convert a single token or hyphenated token into a number if possible.

    Returns the integer value, or ``None`` when the token is not a known
    cardinal, ordinal, or hyphenated combination of them.
    """
    token = token.lower()

    # Handle ordinals
    if token in ORDINAL_TO_NUM:
        return ORDINAL_TO_NUM[token]

    # Handle hyphenated compounds like 'thirty-nine'
    if "-" in token:
        values = [_COMPOUND_PARTS[part] for part in token.split("-") if part in _COMPOUND_PARTS]
        if values:
            return sum(values)

    # Handle normal cardinals
    return WORD_TO_NUM.get(token)


def words_to_numbers(phrase: str) -> List[int]:
    """Turn the number words of ``phrase`` into a list of integers.

    Tokens that are not number words are ignored.  Exactly two or three
    number tokens are merged into a single value by the heuristics below;
    any other count is returned token by token.
    """
    candidates = (normalize_token(tok) for tok in phrase.lower().strip().split())
    nums = [value for value in candidates if value is not None]

    if not nums:
        return []

    # Three tokens: concatenate the first two as digits, then add the third,
    # e.g. "two one ninety" -> 21 + 90 = 111.
    # NOTE(review): the original comment claimed 91 for this example; the
    # arithmetic is left untouched -- confirm the intended heuristic.
    if len(nums) == 3:
        return [int(f"{nums[0]}{nums[1]}") + nums[2]]

    # Two tokens: units-before-tens order such as "five thirty" -> 35;
    # otherwise the two values are concatenated as digits ("one two" -> 12).
    # NOTE(review): "twenty five" therefore yields 205 -- confirm intended.
    if len(nums) == 2:
        if nums[1] >= 20:
            return [nums[0] + nums[1]]
        return [int("".join(str(n) for n in nums))]

    # Otherwise, return each token separately
    return nums


def parse_numbers(text: str) -> str:
    """Extract numbers from English ``text`` as a space-separated string.

    The text is split on ``,``, ``.`` and ``;`` and each chunk is handed to
    ``words_to_numbers``.  Returns ``""`` when no number word is found.
    """
    result = []
    for chunk in re.split(r"[,\.;]", text):
        result.extend(words_to_numbers(chunk))
    return " ".join(str(n) for n in result)


def _transcribe_wav_bytes(payload: bytes) -> str:
    """Write ``payload`` to a temporary ``.wav`` file and return its transcript.

    The file is created with ``delete=False`` and closed before the model
    reads it by path; it is always removed afterwards.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(payload)
        tmp_path = tmp.name

    try:
        result = asr_model.transcribe([tmp_path])
        return result[0].text
    finally:
        os.remove(tmp_path)


@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    Returns JSON with ``transcription`` (raw ASR text), ``translation``
    (English translation of the transcript) and ``numbers`` (numbers parsed
    from the translation, space separated).
    """
    raw_text = _transcribe_wav_bytes(await file.read())
    logger.info("ASR transcript: %s", raw_text)

    translation = translate_egyptian_to_english(raw_text)
    numbers = parse_numbers(translation)
    logger.info("English translation: %s", translation)
    logger.info("Parsed numbers: %s", numbers)

    # "transcription" is unchanged; the other keys expose values that were
    # previously computed and then thrown away.
    return JSONResponse(
        content={
            "transcription": raw_text,
            "translation": translation,
            "numbers": numbers,
        }
    )


@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe raw audio bytes and normalise Arabic numbers to digits.

    Returns JSON with ``transcription`` holding the cleaned transcript.
    """
    raw_text = _transcribe_wav_bytes(audio_bytes)
    cleaned_text = replace_arabic_numbers(raw_text)
    return JSONResponse(content={"transcription": cleaned_text})


if __name__ == "__main__":
    # reload=True only works when the app is given as an import string
    # ("module:app"); with an app object uvicorn refuses to start, so the
    # flag is dropped here.
    uvicorn.run(app, host="0.0.0.0", port=8000)