# alaatiger989's picture
# Upload folder using huggingface_hub
# 9a199b4 verified
from fastapi import FastAPI, File, UploadFile, Form
from fastapi.responses import JSONResponse
import uvicorn
import tempfile
import nemo.collections.asr as nemo_asr
import re
import os
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
from word2number import w2n
from deep_translator import GoogleTranslator
# ===== Arabic number mapping (expanded) =====
# Maps several spelling variants of Arabic number words, and Arabic-Indic
# digit strings, to ASCII digit strings.
#
# ORDER MATTERS: replace_arabic_numbers() walks this dict in insertion order
# and rewrites the text after every key, so later keys are matched against the
# output of earlier ones. Single words come first; the multi-word phrases below
# are therefore usually reached through their already-converted form
# (e.g. "ثلاثة عشر" becomes "3 عشر" first, which the "3 عشر" key then turns
# into "13"). Do not reorder entries without re-checking that cascade.
#
# Keys that are ASCII digit strings mapping to themselves ("11": "11", ...)
# are no-ops when replaced.
arabic_numbers = {
# Basic digits
# NOTE: the last two keys on the next line ("زيره " and "زرو ") end with a
# trailing space inside the string literal.
"صفر": "0", "زيرو": "0", "٠": "0","زيو": "0","زير": "0","زر": "0","زروا": "0","زرا": "0","زيره ": "0","زرو ": "0",
"واحد": "1", "واحدة": "1", "١": "1",
"اتنين": "2", "اثنين": "2", "إثنين": "2", "اثنان": "2", "إثنان": "2", "٢": "2",
"تلاتة": "3", "ثلاثة": "3", "٣": "3",
"اربعة": "4", "أربعة": "4", "٤": "4",
"خمسة": "5", "٥": "5",
"ستة": "6", "٦": "6",
"سبعة": "7", "٧": "7",
"تمانية": "8", "ثمانية": "8", "٨": "8",
"تسعة": "9", "٩": "9",
# Ten and the teens (10-19)
"عشرة": "10", "١٠": "10",
# 11
"احد عشر": "11", "واحد عشر": "11", "حداشر": "11",
"١ عشر": "11", "1 عشر": "11", "١عشر": "11", "1عشر": "11",
"١١": "11", "11": "11",
# 12
"اثنا عشر": "12", "اثني عشر": "12", "اتناشر": "12",
"٢ عشر": "12", "2 عشر": "12", "٢عشر": "12", "2عشر": "12",
"١٢": "12", "12": "12",
# 13
"ثلاثة عشر": "13", "تلاتة عشر": "13", "تلتاشر": "13",
"٣ عشر": "13", "3 عشر": "13", "٣عشر": "13", "3عشر": "13",
"١٣": "13", "13": "13",
# 14
"أربعة عشر": "14", "اربعة عشر": "14", "اربعتاشر": "14",
"٤ عشر": "14", "4 عشر": "14", "٤عشر": "14", "4عشر": "14",
"١٤": "14", "14": "14",
# 15
"خمسة عشر": "15", "خمسه عشر": "15", "خمستاشر": "15",
"٥ عشر": "15", "5 عشر": "15", "٥عشر": "15", "5عشر": "15",
"١٥": "15", "15": "15",
# 16
"ستة عشر": "16", "سته عشر": "16", "ستاشر": "16",
"٦ عشر": "16", "6 عشر": "16", "٦عشر": "16", "6عشر": "16",
"١٦": "16", "16": "16",
# 17
"سبعة عشر": "17", "سبعه عشر": "17", "سبعتاشر": "17",
"٧ عشر": "17", "7 عشر": "17", "٧عشر": "17", "7عشر": "17",
"١٧": "17", "17": "17",
# 18
"ثمانية عشر": "18", "تمانية عشر": "18", "طمنتاشر": "18",
"٨ عشر": "18", "8 عشر": "18", "٨عشر": "18", "8عشر": "18",
"١٨": "18", "18": "18",
# 19
"تسعة عشر": "19", "تسعه عشر": "19", "تسعتاشر": "19",
"٩ عشر": "19", "9 عشر": "19", "٩عشر": "19", "9عشر": "19",
"١٩": "19", "19": "19",
# Tens
"عشرين": "20", "٢٠": "20",
"تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
"اربعين": "40", "أربعين": "40", "٤٠": "40",
"خمسين": "50", "٥٠": "50",
"ستين": "60", "٦٠": "60",
"سبعين": "70", "٧٠": "70",
"تمانين": "80", "ثمانين": "80", "٨٠": "80","تمانون": "80","ثمانون": "80",
"تسعين": "90", "٩٠": "90",
# Hundreds
"مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
"ميتين": "200", "مائتين": "200",
"تلاتمية": "300", "ثلاثمائة": "300",
"اربعمية": "400", "أربعمائة": "400",
"خمسمية": "500", "خمسمائة": "500",
"ستمية": "600", "ستمائة": "600",
"سبعمية": "700", "سبعمائة": "700",
"تمانمية": "800", "ثمانمائة": "800",
"تسعمية": "900", "تسعمائة": "900",
# Thousands
"ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
"ألفين": "2000", "الفين": "2000",
"تلات تلاف": "3000", "ثلاثة آلاف": "3000",
"اربعة آلاف": "4000", "أربعة آلاف": "4000",
"خمسة آلاف": "5000",
"ستة آلاف": "6000",
"سبعة آلاف": "7000",
"تمانية آلاف": "8000", "ثمانية آلاف": "8000",
"تسعة آلاف": "9000",
# Large numbers
"عشرة آلاف": "10000",
"مية ألف": "100000", "مائة ألف": "100000",
# Both "مليون" and "ملايين" map to the same value.
"مليون": "1000000", "١٠٠٠٠٠٠": "1000000",
"ملايين": "1000000",
"مليار": "1000000000", "١٠٠٠٠٠٠٠٠٠": "1000000000",
# ===== Compound tens (Arabic + digit forms) =====
# NOTE(review): by the time iteration reaches this section the single-word
# and single-digit keys above have already been rewritten to ASCII digits, so
# keys that pair an Arabic-Indic digit with a word (e.g. "٢ وعشرون") look
# unreachable through replace_arabic_numbers(); only the ASCII-digit forms
# ("1 وعشرون", "8 وثمانون", "5 وستون") appear to fire. Verify and add the
# missing ASCII-digit variants if 22-29 are expected to convert.
"واحد وعشرون": "21", "1 وعشرون": "21",
"اثنان وعشرون": "22", "٢ وعشرون": "22",
"ثلاثة وعشرون": "23", "٣ وعشرون": "23",
"اربعة وعشرون": "24", "٤ وعشرون": "24",
"خمسة وعشرون": "25", "٥ وعشرون": "25",
"ستة وعشرون": "26", "٦ وعشرون": "26",
"سبعة وعشرون": "27", "٧ وعشرون": "27",
"تمانية وعشرون": "28", "ثمانية وعشرون": "28", "٨ وعشرون": "28",
"تسعة وعشرون": "29", "٩ وعشرون": "29",
"ثمانية وثمانون": "88", "8 وثمانون": "88",
"اثنان وثمانون": "82", "٢ وثمانون": "82",
"خمسة وستون": "65", "5 وستون": "65",
# The next two lines repeat keys already defined in the 16 and 12 groups
# above, with the same values; in a dict literal the repeat is harmless.
"ستة عشر": "16", "٦ عشر": "16",
"اثنا عشر": "12", "١٢": "12",
# Repeated-digit values (33, 44, ... 99) in word, Arabic-Indic and ASCII form.
"ثلاثة وثلاثون": "33", "٣٣": "33", "33": "33",
"أربعة وأربعون": "44", "٤٤": "44", "44": "44",
"خمسة وخمسون": "55", "٥٥": "55", "55": "55",
"ستة وستون": "66", "٦٦": "66", "66": "66",
"سبعة وسبعون": "77", "٧٧": "77", "77": "77",
# "ثمانية وثمانون" below repeats a key defined earlier in this section.
"ثمانية وثمانون": "88", "٨٨": "88", "88": "88",
"تسعة وتسعون": "99", "٩٩": "99", "99": "99",
}
def replace_arabic_numbers(text: str) -> str:
    """Replace Arabic number words and digits in *text* with ASCII digits.

    Every key of ``arabic_numbers`` is applied in the dict's insertion order
    as a whole-word (``\\b``-delimited) substitution. Because ``text`` is
    rewritten after each key, later keys are matched against the output of
    earlier replacements, so the order of the mapping affects the result.

    Parameters:
    - text (str): The text to normalise.
    Returns:
    - str: The text with every matched key replaced by its digit string.
    """
    for word, digit in arabic_numbers.items():
        # Escape the key so it is always matched literally, even if an entry
        # containing a regex metacharacter is ever added to the mapping.
        text = re.sub(rf"\b{re.escape(word)}\b", digit, text)
    return text
# ===== FastAPI app =====
# Single application instance; the route and startup decorators below attach to it.
app = FastAPI(
    title="Arabic ASR API",
    description="ASR API with NeMo and Arabic digit conversion",
)
# Load the ASR and translation models once on startup
@app.on_event("startup")
def load_model():
    """Load the NeMo ASR model and the seq2seq translator into module globals.

    Sets the globals ``asr_model``, ``tokenizer``, ``model`` and ``device``
    that the request handlers and ``translate_egyptian_to_english`` read.

    The ASR checkpoint path is taken from the ``NEMO_MODEL_PATH`` environment
    variable when it is set and non-empty; otherwise the original hard-coded
    path is used, so existing deployments behave as before.
    """
    global asr_model
    global model
    global tokenizer
    global device

    default_model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/stt_ar_fastconformer_hybrid_large_pc_v1.0.nemo"
    # `or` (rather than a getenv default) also covers a variable set to "".
    model_path = os.environ.get("NEMO_MODEL_PATH") or default_model_path
    # NOTE(review): the checkpoint file name says "hybrid" while the class
    # used to restore it is EncDecCTCModel; confirm this is the intended class.
    asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)

    model_translator_name = "ukaAi/Egyptian_dialect_to_arabic"
    tokenizer = AutoTokenizer.from_pretrained(model_translator_name)
    # Prefer the GPU when one is available; inputs are moved to the same device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = AutoModelForSeq2SeqLM.from_pretrained(model_translator_name).to(device)
def translate_egyptian_to_english(text: str) -> str:
    """
    Translate *text* with the globally loaded seq2seq model.

    The tokenizer source language is set to "arz_Arab" and generation is
    forced to begin with the "eng_Latn" token, using a beam search of width 4.
    Input and output are both capped at 512 tokens.

    NOTE(review): load_model() loads "ukaAi/Egyptian_dialect_to_arabic";
    confirm that checkpoint really produces English for the "eng_Latn" target.

    Parameters:
    - text (str): The input Egyptian Arabic text
    Returns:
    - str: The decoded text of the first generated sequence
    """
    tokenizer.src_lang = "arz_Arab"
    target_token_id = tokenizer.convert_tokens_to_ids("eng_Latn")

    encoded = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=512,
    )
    # Tensors must live on the same device as the model.
    model_inputs = {name: tensor.to(device) for name, tensor in encoded.items()}

    generation_options = {
        "forced_bos_token_id": target_token_id,
        "max_length": 512,
        "num_beams": 4,
        "early_stopping": True,
    }
    generated = model.generate(**model_inputs, **generation_options)

    first_sequence = generated[0]
    return tokenizer.decode(first_sequence, skip_special_tokens=True)
# Cardinal number words: zero through nineteen, plus the tens up to ninety.
WORD_TO_NUM = {
    "zero": 0, "one": 1, "two": 2, "three": 3, "four": 4, "five": 5,
    "six": 6, "seven": 7, "eight": 8, "nine": 9, "ten": 10,
    "eleven": 11, "twelve": 12, "thirteen": 13, "fourteen": 14,
    "fifteen": 15, "sixteen": 16, "seventeen": 17, "eighteen": 18,
    "nineteen": 19, "twenty": 20, "thirty": 30, "forty": 40,
    "fifty": 50, "sixty": 60, "seventy": 70, "eighty": 80, "ninety": 90,
}

# Ordinal number words, mapped to the same integers as their cardinals.
ORDINAL_TO_NUM = {
    "first": 1, "second": 2, "third": 3, "fourth": 4, "fifth": 5,
    "sixth": 6, "seventh": 7, "eighth": 8, "ninth": 9, "tenth": 10,
    "eleventh": 11, "twelfth": 12, "thirteenth": 13, "fourteenth": 14,
    "fifteenth": 15, "sixteenth": 16, "seventeenth": 17, "eighteenth": 18,
    "nineteenth": 19, "twentieth": 20, "thirtieth": 30, "fortieth": 40,
    "fiftieth": 50, "sixtieth": 60, "seventieth": 70, "eightieth": 80,
    "ninetieth": 90,
}


def normalize_token(token: str):
    """Convert a single token or hyphenated token into a number if possible.

    Lookup is case-insensitive and proceeds in this order:

    1. A whole-token ordinal ("third" -> 3).
    2. A hyphenated compound ordinal of the form tens-unit
       ("twenty-first" -> 21).
    3. Any other hyphenated token: the sum of the parts that are cardinal
       words ("thirty-nine" -> 39); parts that are not cardinals are ignored.
    4. A plain cardinal ("ninety" -> 90).

    Returns:
        The integer value, or ``None`` when the token is not a number word.
    """
    token = token.lower()

    # Handle ordinals
    if token in ORDINAL_TO_NUM:
        return ORDINAL_TO_NUM[token]

    # Handle hyphenated compounds like 'thirty-nine'
    if "-" in token:
        parts = token.split("-")

        # Compound ordinals ("twenty-first"): previously the ordinal part was
        # dropped and only the tens were returned. Restricted to tens + a
        # unit ordinal so tokens such as "one-third" or "first-class" keep
        # their earlier result.
        if len(parts) == 2:
            tens = WORD_TO_NUM.get(parts[0], 0)
            unit = ORDINAL_TO_NUM.get(parts[1], 10)
            if tens >= 20 and unit < 10:
                return tens + unit

        values = [WORD_TO_NUM[part] for part in parts if part in WORD_TO_NUM]
        if values:
            return sum(values)

    # Handle normal cardinals
    return WORD_TO_NUM.get(token)
def words_to_numbers(phrase: str):
    """Turn the number words found in *phrase* into a list of integers.

    The phrase is lower-cased and split on whitespace; tokens that
    ``normalize_token`` does not recognise are dropped. The recognised values
    are then combined according to how many there are:

    - none: an empty list is returned;
    - two values: they are added when the second is 20 or more
      ("five thirty" -> 35), otherwise their digits are concatenated
      ("five three" -> 53);
    - three values: the first two are concatenated as digits and the third is
      added ("two one ninety" -> 21 + 90 = 111);
    - any other count: the values are returned unchanged, one per token.

    NOTE(review): an earlier comment gave 91 as the result for
    "two one ninety", which is not what this arithmetic produces; confirm the
    intended rule for three-value phrases.
    """
    values = []
    for word in phrase.lower().strip().split():
        value = normalize_token(word)
        if value is not None:
            values.append(value)

    count = len(values)

    if count == 3:
        first, second, third = values
        return [int(f"{first}{second}") + third]

    if count == 2:
        left, right = values
        if right >= 20:
            return [left + right]
        return [int(f"{left}{right}")]

    # Zero, one, or four-plus values: hand them back as they are.
    return values
def parse_numbers(text: str):
    """Extract the numbers spelled out in *text* as a space-separated string.

    The text is cut into chunks at every comma, full stop or semicolon; each
    chunk is converted independently by ``words_to_numbers`` and the resulting
    integers are joined, in order, with single spaces. An empty string is
    returned when no number words are found.
    """
    numbers = [
        str(value)
        for segment in re.split(r"[,\.;]", text)
        for value in words_to_numbers(segment)
    ]
    return " ".join(numbers)
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return the raw ASR text.

    The upload is written to a temporary ``.wav`` file, transcribed with the
    global ``asr_model``, and the raw transcription is returned as
    ``{"transcription": <text>}``. The English translation and the numbers
    parsed from it are only printed for diagnostics; they are not part of the
    response.

    The temporary file is removed in all cases, including when writing it or
    running the model raises.
    """
    # Read the upload first so a failed read never leaves a temp file behind.
    audio_bytes = await file.read()

    tmp_path = None
    try:
        # delete=False: the file must outlive the `with` block, because the
        # model is given the path and opens the file itself.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            # Record the path before writing so `finally` can clean up even
            # if the write fails.
            tmp_path = tmp.name
            tmp.write(audio_bytes)

        # Run transcription
        result = asr_model.transcribe([tmp_path])
        print(result)
        # Assumes each transcription result exposes its text as `.text`.
        raw_text = result[0].text
        print(raw_text)

        translation = translate_egyptian_to_english(raw_text)
        print("\n=== English Translation ===\n")
        print(translation)
        print(parse_numbers(translation))

        return JSONResponse(content={"transcription": raw_text})
    finally:
        if tmp_path is not None:
            os.remove(tmp_path)
@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe raw audio bytes and return the text with digits normalised.

    The bytes are written to a temporary ``.wav`` file, transcribed with the
    global ``asr_model``, passed through ``replace_arabic_numbers`` and
    returned as ``{"transcription": <text>}``.

    The temporary file is removed in all cases, including when writing it or
    running the model raises.
    """
    tmp_path = None
    try:
        # delete=False: the file must outlive the `with` block, because the
        # model is given the path and opens the file itself.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            # Record the path before writing so `finally` can clean up even
            # if the write fails.
            tmp_path = tmp.name
            tmp.write(audio_bytes)

        result = asr_model.transcribe([tmp_path])
        # Assumes each transcription result exposes its text as `.text`.
        raw_text = result[0].text
        cleaned_text = replace_arabic_numbers(raw_text)
        return JSONResponse(content={"transcription": cleaned_text})
    finally:
        if tmp_path is not None:
            os.remove(tmp_path)
if __name__ == "__main__":
    # Serve on all interfaces, port 8000.
    # reload=True is intentionally not passed: uvicorn only supports
    # auto-reload when the application is given as an import string
    # ("module:app"), and refuses to start when handed the app object with
    # reload enabled. For auto-reload during development, launch with the
    # CLI instead: `uvicorn <module>:app --reload`.
    uvicorn.run(app, host="0.0.0.0", port=8000)