File size: 17,440 Bytes

9a199b4

# from fastapi import FastAPI, File, UploadFile
# from fastapi.responses import JSONResponse
# import uvicorn
# import tempfile
# import nemo.collections.asr as nemo_asr
# import re
# import os
# import librosa
# import soundfile as sf

# # ===== Arabic number mapping (expanded) =====
# arabic_numbers = {
#     "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0",
#     "واحد": "1", "واحدة": "1", "١": "1",
#     "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
#     "تلاتة": "3", "ثلاثة": "3", "٣": "3","ثلاث": "3","تلات": "3",
#     "اربعة": "4", "أربعة": "4", "٤": "4",
#     "خمسة": "5", "٥": "5","خمسه": "5",
#     "ستة": "6", "٦": "6",
#     "سبعة": "7", "٧": "7","سبعه": "7",
#     "تمانية": "8", "ثمانية": "8", "٨": "8",
#     "تسعة": "9", "٩": "9",
#     "عشرة": "10", "١٠": "10","عشره": "10",
#     "حداشر": "11", "احد عشر": "11", "احداشر": "11",
#     "اتناشر": "12", "اثنا عشر": "12",
#     "تلتاشر": "13", "ثلاثة عشر": "13",
#     "اربعتاشر": "14", "أربعة عشر": "14",
#     "خمستاشر": "15", "خمسة عشر": "15",
#     "ستاشر": "16", "ستة عشر": "16",
#     "سبعتاشر": "17", "سبعة عشر": "17",
#     "طمنتاشر": "18", "ثمانية عشر": "18",
#     "تسعتاشر": "19", "تسعة عشر": "19",
#     "عشرين": "20", "٢٠": "20",
#     "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
#     "اربعين": "40", "أربعين": "40", "٤٠": "40",
#     "خمسين": "50", "٥٠": "50",
#     "ستين": "60", "٦٠": "60",
#     "سبعين": "70", "٧٠": "70",
#     "تمانين": "80", "ثمانين": "80", "٨٠": "80",
#     "تسعين": "90", "٩٠": "90",
#     "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
#     "ميتين": "200", "مائتين": "200",
#     "تلاتمية": "300", "ثلاثمائة": "300",
#     "اربعمية": "400", "أربعمائة": "400",
#     "خمسمية": "500", "خمسمائة": "500",
#     "ستمية": "600", "ستمائة": "600",
#     "سبعمية": "700", "سبعمائة": "700",
#     "تمانمية": "800", "ثمانمائة": "800",
#     "تسعمية": "900", "تسعمائة": "900",
#     "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
#     "ألفين": "2000", "الفين": "2000",
#     "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
#     "اربعة آلاف": "4000", "أربعة آلاف": "4000",
#     "خمسة آلاف": "5000",
#     "ستة آلاف": "6000",
#     "سبعة آلاف": "7000",
#     "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
#     "تسعة آلاف": "9000",
#     "عشرة آلاف": "10000",
#     "مية ألف": "100000", "مائة ألف": "100000",
#     "مليون": "1000000", "ملايين": "1000000",
#     "مليار": "1000000000"
# }

# # ===== Helpers =====
# def normalize_arabic(text: str) -> str:
#     diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
#     text = re.sub(diacritics, '', text)
#     text = re.sub(r'[إأآا]', 'ا', text)
#     text = re.sub(r'ى', 'ي', text)
#     text = re.sub(r'ؤ', 'و', text)
#     text = re.sub(r'ئ', 'ي', text)
#     text = re.sub(r'ة', 'ه', text)
#     return text

# def replace_arabic_numbers(text: str) -> str:
#     for word, digit in arabic_numbers.items():
#         text = re.sub(fr"(?:^|\s){word}(?:$|\s)", f" {digit} ", text)
#     return " ".join(text.split())

# def join_digit_sequences(text: str) -> str:
#     tokens = text.split()
#     out, buffer = [], []
#     for tok in tokens:
#         if tok.isdigit() and len(tok) == 1:
#             buffer.append(tok)
#         else:
#             if buffer:
#                 out.append("".join(buffer))
#                 buffer = []
#             out.append(tok)
#     if buffer:
#         out.append("".join(buffer))
#     return " ".join(out)

# def ensure_16k_wav(input_path, output_path):
#     y, sr = librosa.load(input_path, sr=16000, mono=True)
#     sf.write(output_path, y, 16000)

# # ===== FastAPI app =====
# app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion")

# @app.on_event("startup")
# def load_model():
#     global asr_model
#     model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/asr-egyptian-nemo-v2.0.nemo"
#     asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)

# @app.post("/transcribe")
# async def transcribe_audio(file: UploadFile = File(...)):
#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
#         tmp.write(await file.read())
#         tmp_path = tmp.name

#     # Resample to 16kHz
#     resampled_path = tmp_path.replace(".wav", "_16k.wav")
#     ensure_16k_wav(tmp_path, resampled_path)

#     try:
#         result = asr_model.transcribe([resampled_path])
#         raw_text = result[0].text

#         raw_text = normalize_arabic(raw_text)
#         cleaned_text = replace_arabic_numbers(raw_text)
#         cleaned_text = join_digit_sequences(cleaned_text)

#         return JSONResponse(content={"transcription": cleaned_text})

#     finally:
#         os.remove(tmp_path)
#         if os.path.exists(resampled_path):
#             os.remove(resampled_path)

# @app.post("/transcribe-bytes")
# async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
#     with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
#         tmp.write(audio_bytes)
#         tmp_path = tmp.name

#     resampled_path = tmp_path.replace(".wav", "_16k.wav")
#     ensure_16k_wav(tmp_path, resampled_path)

#     try:
#         result = asr_model.transcribe([resampled_path])
#         raw_text = result[0].text

#         raw_text = normalize_arabic(raw_text)
#         cleaned_text = replace_arabic_numbers(raw_text)
#         cleaned_text = join_digit_sequences(cleaned_text)

#         return JSONResponse(content={"transcription": cleaned_text})

#     finally:
#         os.remove(tmp_path)
#         if os.path.exists(resampled_path):
#             os.remove(resampled_path)

# if __name__ == "__main__":
#     uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import uvicorn
import tempfile
import nemo.collections.asr as nemo_asr
import re
import os
import librosa
import soundfile as sf

# ===== Arabic + English number mapping (expanded) =====
# Maps a spoken or written number form (Arabic, Egyptian colloquial, English,
# or an Arabic-Indic numeral) to the ASCII digit string that replaces it.
# Every key is listed exactly once, in order of first appearance; repeating a
# key in a dict literal silently keeps only the last value.
arabic_numbers = {
    # Arabic digits, teens, tens, hundreds, thousands
    "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0", "زينوا": "0",
    "واحد": "1", "واحدة": "1", "١": "1",
    "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
    "تلاتة": "3", "ثلاثة": "3", "٣": "3", "ثلاث": "3", "تلات": "3",
    "اربعة": "4", "أربعة": "4", "٤": "4",
    "خمسة": "5", "٥": "5", "خمسه": "5",
    "ستة": "6", "٦": "6",
    "سبعة": "7", "٧": "7", "سبعه": "7",
    "تمانية": "8", "ثمانية": "8", "٨": "8",
    "تسعة": "9", "٩": "9",
    "عشرة": "10", "١٠": "10", "عشره": "10",
    "حداشر": "11", "احد عشر": "11", "احداشر": "11",
    "اتناشر": "12", "اثنا عشر": "12",
    "تلتاشر": "13", "ثلاثة عشر": "13",
    "اربعتاشر": "14", "أربعة عشر": "14",
    "خمستاشر": "15", "خمسة عشر": "15",
    "ستاشر": "16", "ستة عشر": "16",
    "سبعتاشر": "17", "سبعة عشر": "17",
    "طمنتاشر": "18", "ثمانية عشر": "18",
    "تسعتاشر": "19", "تسعة عشر": "19",
    "عشرين": "20", "٢٠": "20",
    "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
    "اربعين": "40", "أربعين": "40", "٤٠": "40",
    "خمسين": "50", "٥٠": "50",
    "ستين": "60", "٦٠": "60",
    "سبعين": "70", "٧٠": "70",
    "تمانين": "80", "ثمانين": "80", "٨٠": "80",
    "تسعين": "90", "٩٠": "90",
    "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
    "ميتين": "200", "مائتين": "200",
    "تلاتمية": "300", "ثلاثمائة": "300",
    "اربعمية": "400", "أربعمائة": "400",
    "خمسمية": "500", "خمسمائة": "500",
    "ستمية": "600", "ستمائة": "600",
    "سبعمية": "700", "سبعمائة": "700",
    "تمانمية": "800", "ثمانمائة": "800",
    "تسعمية": "900", "تسعمائة": "900",
    "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
    "ألفين": "2000", "الفين": "2000",
    "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
    "اربعة آلاف": "4000", "أربعة آلاف": "4000",
    "خمسة آلاف": "5000",
    "ستة آلاف": "6000",
    "سبعة آلاف": "7000",
    "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
    "تسعة آلاف": "9000",
    "عشرة آلاف": "10000",
    "مية ألف": "100000", "مائة ألف": "100000",
    "مليون": "1000000", "ملايين": "1000000",
    "مليار": "1000000000",

    # English digits ("to"/"too"/"for" are treated as two/four)
    "zero": "0", "one": "1", "two": "2", "to": "2", "too": "2", "three": "3",
    "four": "4", "for": "4", "five": "5", "six": "6", "seven": "7",
    "eight": "8", "nine": "9", "ten": "10",
    "eleven": "11", "twelve": "12", "thirteen": "13", "fourteen": "14",
    "fifteen": "15", "sixteen": "16", "seventeen": "17",
    "eighteen": "18", "nineteen": "19", "twenty": "20",
    "thirty": "30", "forty": "40", "fifty": "50",
    "sixty": "60", "seventy": "70", "eighty": "80", "ninety": "90",
    "hundred": "100", "thousand": "1000", "million": "1000000",

    # Arabic variants
    "تلاته": "3", "اربعه": "4", "سته": "6",
    # "ثماني" used to be listed as both "80" and "8"; the later "8" entry was
    # the one that took effect, so that is the value kept here.
    "تمانيه": "8", "ثماني": "8", "تسعه": "9",
    "واحده": "1",
    "تلاته عشر": "13", "اربعه عشر": "14",
    "خمسه عشر": "15", "سته عشر": "16",
    "سبعه عشر": "17", "ثمانيه عشر": "18",
    "تسعه عشر": "19",

    # English tricky forms
    "oh": "0",
    "double zero": "00",
    "double one": "11",
    "double two": "22",
    "double three": "33",
    "double four": "44",
    "double five": "55",
    "double six": "66",
    "double seven": "77",
    "double eight": "88",
    "double nine": "99",
    "nite": "9", "fiv": "5",

    # Repeated digits
    "واحد واحد": "11",
    "اثنين اثنين": "22",
    "اتنين اتنين": "22",
    "ثلاثة ثلاثة": "33",
    "تلاتة تلاتة": "33",
    "أربعة أربعة": "44",
    "اربعة اربعة": "44",
    "خمسة خمسة": "55",
    "ستة ستة": "66",
    "سبعة سبعة": "77",
    "ثمانية ثمانية": "88",
    "تمانية تمانية": "88",
    "تسعة تسعة": "99",

    # Additional hundreds spellings
    "مئتين": "200",
    "ثلاثمية": "300",
    "تلتمية": "300",
    "أربعمية": "400",

    # Additional teens spellings
    "أحد عشر": "11",
    "تلاتاشر": "13",
    "خمسطاشر": "15",

    # Additional tens spellings
    "ثامنين": "80",

    # Digit-by-digit phrases
    "خمسة صفر": "50",
    "ثلاثة صفر صفر": "300",
    "تلاتة صفر صفر": "300",
    "واحد صفر": "10",
    "واحد صفر واحد": "101",
    "واحد اثنين": "12",
    "واحد اتنين": "12",
    "واحد ثلاثة": "13",
    "واحد تلاتة": "13",
    "واحد خمسة": "15",
    "عشرة عشرة": "1010",

    # Additional single-digit spellings
    "ست": "6",
    "تمنية": "8",
    "ثمان": "8",
    "ثمانيّة": "8",
}

# ===== Helpers =====
def normalize_arabic(text: str) -> str:
    """Return ``text`` with Arabic spelling variants folded to a single form.

    The substitutions are applied in this order:
      1. remove code points U+0617-U+061A and U+064B-U+0652 (diacritics),
      2. alef variants (إ, أ, آ) become ا,
      3. ى becomes ي,
      4. ؤ becomes و,
      5. ئ becomes ي,
      6. ة becomes ه.

    Characters outside these sets are left untouched.
    """
    substitutions = (
        (r'[\u0617-\u061A\u064B-\u0652]', ''),
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'ؤ', 'و'),
        (r'ئ', 'ي'),
        (r'ة', 'ه'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text

def replace_arabic_numbers(text: str) -> str:
    """Replace spelled-out numbers in ``text`` with their digit strings.

    Every key of ``arabic_numbers`` that occurs as a whole,
    whitespace-delimited phrase is replaced by its mapped value. Matching is
    case-insensitive, and longer phrases are tried before shorter ones so a
    multi-word entry such as "double zero" is not pre-empted by "zero".

    Keys are matched both as written and in the spelling that
    ``normalize_arabic`` produces: the endpoints in this file normalize the
    transcript before calling this function, after which letters such as
    ة and أ no longer occur in the text.

    Args:
        text: Transcript to process.

    Returns:
        The text with number phrases replaced and every run of whitespace
        collapsed to a single space (no leading or trailing whitespace).
    """
    # Index each phrase by its normalized spelling first, then by its literal
    # spelling, so a literal entry always wins should the two ever collide.
    digits_by_phrase = {}
    for phrase, digits in arabic_numbers.items():
        digits_by_phrase.setdefault(normalize_arabic(phrase).casefold(), digits)
    for phrase, digits in arabic_numbers.items():
        digits_by_phrase[phrase.casefold()] = digits
    digits_by_phrase.pop("", None)  # an empty key would match at every gap

    # Collapsing first lets multi-word keys (written with single spaces)
    # match regardless of how the words were separated in the input.
    collapsed = " ".join(text.split())
    if not digits_by_phrase:
        return collapsed

    # Regex alternation takes the first branch that matches, not the longest,
    # so the branches are ordered longest-first.
    alternation = "|".join(
        re.escape(phrase)
        for phrase in sorted(digits_by_phrase, key=len, reverse=True)
    )
    # Lookarounds assert the token boundaries without consuming the separating
    # space; consuming it made the second of two adjacent number words
    # ("five five") impossible to match in the same pass.
    pattern = re.compile(rf"(?<!\S)(?:{alternation})(?!\S)", re.IGNORECASE)

    def to_digits(match):
        found = match.group(0)
        # Fall back to the matched text for the few characters IGNORECASE
        # accepts but casefold() maps to something else.
        return digits_by_phrase.get(found.casefold(), found)

    return pattern.sub(to_digits, collapsed)

def join_digit_sequences(text: str) -> str:
    """Glue runs of adjacent single-digit tokens into one token.

    ``text`` is split on whitespace. Consecutive tokens that are each exactly
    one digit character are concatenated (``"8 5"`` -> ``"85"``); every other
    token, including multi-digit ones, is kept as-is and ends the current run.
    Tokens are returned joined by single spaces.
    """
    merged = []
    previous_was_digit = False
    for token in text.split():
        is_single_digit = len(token) == 1 and token.isdigit()
        if is_single_digit and previous_was_digit:
            # Extend the run that the previous token started or continued.
            merged[-1] += token
        else:
            merged.append(token)
        previous_was_digit = is_single_digit
    return " ".join(merged)

def ensure_16k_wav(input_path, output_path):
    """Re-encode the audio at ``input_path`` to ``output_path`` at 16 kHz.

    The file is loaded through librosa with ``sr=16000`` and ``mono=True``,
    and the resulting samples are written with soundfile using a sample rate
    of 16000.
    """
    target_rate = 16000
    # librosa.load returns (samples, rate); the rate is the one requested.
    samples, _ = librosa.load(input_path, sr=target_rate, mono=True)
    sf.write(output_path, samples, target_rate)

# ===== FastAPI app =====
# Application object that the route and startup decorators below attach to.
app = FastAPI(
    title="Arabic ASR API",
    description="ASR API with NeMo and Arabic/English digit conversion",
)

@app.on_event("startup")
def load_model():
    """Load the NeMo ASR checkpoint into the module-level ``asr_model``.

    Registered as a FastAPI startup handler. The checkpoint location is read
    from the ``ASR_MODEL_PATH`` environment variable when it is set, so the
    service can run on machines other than the one it was developed on;
    otherwise the original hard-coded path is used.
    """
    global asr_model
    default_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/stt_ar_fastconformer_hybrid_large_pc_v1.0.nemo"
    model_path = os.environ.get("ASR_MODEL_PATH", default_path)
    # NOTE(review): the checkpoint file name suggests a hybrid model while it
    # is restored through the CTC model class - confirm this is intended.
    asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)

def _recognize_speech(audio_bytes: bytes) -> str:
    """Transcribe raw audio bytes and return the model's unprocessed text.

    The bytes are written into a scratch directory, converted with
    ``ensure_16k_wav`` and passed to ``asr_model.transcribe``; the ``text``
    attribute of the first result is returned. The scratch directory and both
    files in it are removed on every exit path, including when conversion or
    transcription raises.
    """
    # NOTE(review): this runs synchronously inside the calling coroutine and
    # presumably blocks the event loop for the duration of inference - confirm
    # whether that is acceptable for the expected load.
    with tempfile.TemporaryDirectory() as workdir:
        upload_path = os.path.join(workdir, "upload.wav")
        resampled_path = os.path.join(workdir, "upload_16k.wav")
        with open(upload_path, "wb") as upload:
            upload.write(audio_bytes)
        ensure_16k_wav(upload_path, resampled_path)
        result = asr_model.transcribe([resampled_path])
        return result[0].text


def _digits_from_transcript(raw_text: str) -> str:
    """Normalize a raw transcript and convert its number words to digits."""
    normalized = normalize_arabic(raw_text)
    with_digits = replace_arabic_numbers(normalized)
    return join_digit_sequences(with_digits)


@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    Returns a JSON body ``{"transcription": <text>}`` where number words have
    been replaced by digits. The raw model output is also printed to stdout.
    """
    raw_text = _recognize_speech(await file.read())
    print(raw_text)
    cleaned_text = _digits_from_transcript(raw_text)
    return JSONResponse(content={"transcription": cleaned_text})


@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe audio supplied directly as bytes.

    Returns a JSON body ``{"transcription": <text>}`` where number words have
    been replaced by digits.
    """
    raw_text = _recognize_speech(audio_bytes)
    cleaned_text = _digits_from_transcript(raw_text)
    return JSONResponse(content={"transcription": cleaned_text})

if __name__ == "__main__":
    # uvicorn only supports reload=True when the application is given as an
    # import string; combined with an app object it logs a warning and exits,
    # so the server never started. Serve the object directly without reload.
    uvicorn.run(app, host="0.0.0.0", port=8000)