|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from fastapi import FastAPI, File, UploadFile
|
|
|
from fastapi.responses import JSONResponse
|
|
|
import uvicorn
|
|
|
import tempfile
|
|
|
import nemo.collections.asr as nemo_asr
|
|
|
import re
|
|
|
import os
|
|
|
import librosa
|
|
|
import soundfile as sf
|
|
|
|
|
|
|
|
|
# Spoken-number vocabulary: maps Arabic (MSA and colloquial spellings),
# Arabic-Indic digits, English number words and common ASR mis-hearings to
# ASCII digit strings.  Every key is listed exactly once, in the order it was
# first introduced, with the value that was effective at runtime (where a key
# used to be defined twice with different values, the later definition won).
arabic_numbers = {
    # 0-10: words and Arabic-Indic digits.
    "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0", "زينوا": "0",
    "واحد": "1", "واحدة": "1", "١": "1",
    "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
    "تلاتة": "3", "ثلاثة": "3", "٣": "3", "ثلاث": "3", "تلات": "3",
    "اربعة": "4", "أربعة": "4", "٤": "4",
    "خمسة": "5", "٥": "5", "خمسه": "5",
    "ستة": "6", "٦": "6",
    "سبعة": "7", "٧": "7", "سبعه": "7",
    "تمانية": "8", "ثمانية": "8", "٨": "8",
    "تسعة": "9", "٩": "9",
    "عشرة": "10", "١٠": "10", "عشره": "10",

    # 11-19.
    "حداشر": "11", "احد عشر": "11", "احداشر": "11",
    "اتناشر": "12", "اثنا عشر": "12",
    "تلتاشر": "13", "ثلاثة عشر": "13",
    "اربعتاشر": "14", "أربعة عشر": "14",
    "خمستاشر": "15", "خمسة عشر": "15",
    "ستاشر": "16", "ستة عشر": "16",
    "سبعتاشر": "17", "سبعة عشر": "17",
    "طمنتاشر": "18", "ثمانية عشر": "18",
    "تسعتاشر": "19", "تسعة عشر": "19",

    # Tens.
    "عشرين": "20", "٢٠": "20",
    "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
    "اربعين": "40", "أربعين": "40", "٤٠": "40",
    "خمسين": "50", "٥٠": "50",
    "ستين": "60", "٦٠": "60",
    "سبعين": "70", "٧٠": "70",
    "تمانين": "80", "ثمانين": "80", "٨٠": "80",
    "تسعين": "90", "٩٠": "90",

    # Hundreds.
    "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
    "ميتين": "200", "مائتين": "200",
    "تلاتمية": "300", "ثلاثمائة": "300",
    "اربعمية": "400", "أربعمائة": "400",
    "خمسمية": "500", "خمسمائة": "500",
    "ستمية": "600", "ستمائة": "600",
    "سبعمية": "700", "سبعمائة": "700",
    "تمانمية": "800", "ثمانمائة": "800",
    "تسعمية": "900", "تسعمائة": "900",

    # Thousands and above.
    "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
    "ألفين": "2000", "الفين": "2000",
    "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
    "اربعة آلاف": "4000", "أربعة آلاف": "4000",
    "خمسة آلاف": "5000",
    "ستة آلاف": "6000",
    "سبعة آلاف": "7000",
    "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
    "تسعة آلاف": "9000",
    "عشرة آلاف": "10000",
    "مية ألف": "100000", "مائة ألف": "100000",
    "مليون": "1000000", "ملايين": "1000000",
    "مليار": "1000000000",

    # English number words ("to"/"too"/"for" are homophones of two/four).
    "zero": "0", "one": "1", "two": "2", "to": "2", "too": "2", "three": "3",
    "four": "4", "for": "4", "five": "5", "six": "6", "seven": "7",
    "eight": "8", "nine": "9", "ten": "10",
    "eleven": "11", "twelve": "12", "thirteen": "13", "fourteen": "14",
    "fifteen": "15", "sixteen": "16", "seventeen": "17",
    "eighteen": "18", "nineteen": "19", "twenty": "20",
    "thirty": "30", "forty": "40", "fifty": "50",
    "sixty": "60", "seventy": "70", "eighty": "80", "ninety": "90",
    "hundred": "100", "thousand": "1000", "million": "1000000",

    # Spellings with a final heh instead of teh marbuta.
    "تلاته": "3", "اربعه": "4", "سته": "6",
    # "ثماني" was also listed as "80"; the later "8" entry always won.
    "تمانيه": "8", "ثماني": "8", "تسعه": "9",
    "واحده": "1",
    "تلاته عشر": "13", "اربعه عشر": "14",
    "خمسه عشر": "15", "سته عشر": "16",
    "سبعه عشر": "17", "ثمانيه عشر": "18",
    "تسعه عشر": "19",

    # English digit-dictation forms and truncated words.
    "oh": "0",
    "double zero": "00",
    "double one": "11",
    "double two": "22",
    "double three": "33",
    "double four": "44",
    "double five": "55",
    "double six": "66",
    "double seven": "77",
    "double eight": "88",
    "double nine": "99",
    "nite": "9", "fiv": "5",

    # Repeated Arabic digits.
    "واحد واحد": "11",
    "اثنين اثنين": "22",
    "اتنين اتنين": "22",
    "ثلاثة ثلاثة": "33",
    "تلاتة تلاتة": "33",
    "أربعة أربعة": "44",
    "اربعة اربعة": "44",
    "خمسة خمسة": "55",
    "ستة ستة": "66",
    "سبعة سبعة": "77",
    "ثمانية ثمانية": "88",
    "تمانية تمانية": "88",
    "تسعة تسعة": "99",

    # Additional spellings of hundreds, teens and tens.
    "مئتين": "200",
    "ثلاثمية": "300",
    "تلتمية": "300",
    "أربعمية": "400",
    "أحد عشر": "11",
    "تلاتاشر": "13",
    "خمسطاشر": "15",
    "ثامنين": "80",

    # Digit-by-digit dictation.
    "خمسة صفر": "50",
    "ثلاثة صفر صفر": "300",
    "تلاتة صفر صفر": "300",
    "واحد صفر": "10",
    "واحد صفر واحد": "101",
    "واحد اثنين": "12",
    "واحد اتنين": "12",
    "واحد ثلاثة": "13",
    "واحد تلاتة": "13",
    "واحد خمسة": "15",
    "عشرة عشرة": "1010",

    # Further short forms.
    "ست": "6",
    "تمنية": "8",
    "ثمان": "8",
    "ثمانيّة": "8",
}

# The transcription handlers pass text through normalize_arabic() before the
# lookup, which strips diacritics and folds hamza-alef forms, alef maksura,
# hamza carriers and teh marbuta.  Keys written with those letters could
# therefore never match normalised text, so the normalised spelling of every
# key is registered as well.  setdefault keeps any explicitly listed entry.
_KEY_DIACRITICS = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
_KEY_FOLDS = str.maketrans({
    "إ": "ا", "أ": "ا", "آ": "ا",
    "ى": "ي", "ئ": "ي",
    "ؤ": "و",
    "ة": "ه",
})

for _word, _digit in list(arabic_numbers.items()):
    arabic_numbers.setdefault(
        _KEY_DIACRITICS.sub("", _word).translate(_KEY_FOLDS), _digit
    )
del _word, _digit
|
|
|
|
|
|
|
|
|
def normalize_arabic(text: str) -> str:
    """Return *text* with Arabic diacritics removed and letter variants folded.

    The substitutions are applied in order: characters in the ranges
    U+0617-U+061A and U+064B-U+0652 are deleted, the hamza/madda alef forms
    become bare alef, alef maksura and hamza-on-yeh become yeh, hamza-on-waw
    becomes waw, and teh marbuta becomes heh.
    """
    substitutions = (
        (r'[\u0617-\u061A\u064B-\u0652]', ''),
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'ؤ', 'و'),
        (r'ئ', 'ي'),
        (r'ة', 'ه'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
|
|
|
|
|
|
def replace_arabic_numbers(text: str, mapping=None) -> str:
    """Replace whole number words/phrases in *text* with their digit strings.

    Args:
        text: Input text; runs of whitespace are collapsed to single spaces.
        mapping: Optional ``{phrase: digits}`` table.  Defaults to the
            module-level ``arabic_numbers`` dictionary.

    Returns:
        The text with every whitespace-delimited occurrence of a mapping key
        replaced (case-insensitively) by its value, words separated by single
        spaces and no leading/trailing whitespace.
    """
    if mapping is None:
        mapping = arabic_numbers

    # Collapse whitespace first so multi-word keys (written with single
    # spaces) still match when the input has irregular spacing.
    text = " ".join(text.split())

    # Longest keys first: otherwise a phrase such as "double one" can never
    # match because its component word "one" has already been rewritten.
    for word in sorted(mapping, key=len, reverse=True):
        digit = mapping[word]
        # Lookarounds assert the word boundary without consuming it.  The
        # previous pattern swallowed the separating whitespace, so in a run
        # like "one one one" every second word was skipped.  re.escape keeps
        # keys containing regex metacharacters literal.
        pattern = rf"(?<!\S){re.escape(word)}(?!\S)"
        text = re.sub(
            pattern, lambda _match, d=digit: d, text, flags=re.IGNORECASE
        )

    return " ".join(text.split())
|
|
|
|
|
|
def join_digit_sequences(text: str) -> str:
    """Merge runs of adjacent single-digit tokens into one token.

    *text* is split on whitespace; consecutive tokens that are exactly one
    digit character long are concatenated (``"0 1 2"`` -> ``"012"``).  All
    other tokens, including multi-digit ones, are kept as they are and end
    the current run.  Tokens are re-joined with single spaces.
    """
    merged = []
    previous_was_digit = False
    for word in text.split():
        is_single_digit = len(word) == 1 and word.isdigit()
        if is_single_digit and previous_was_digit:
            # Extend the run that the previous token started or continued.
            merged[-1] += word
        else:
            merged.append(word)
        previous_was_digit = is_single_digit
    return " ".join(merged)
|
|
|
|
|
|
def ensure_16k_wav(input_path, output_path):
    """Load *input_path* as 16 kHz mono audio and write it to *output_path*.

    The audio is read with ``librosa.load(..., sr=16000, mono=True)`` and the
    resulting samples are written with ``soundfile.write`` at the same rate.
    """
    target_rate = 16000
    # librosa also returns the sample rate; it equals target_rate here.
    samples, _ = librosa.load(input_path, sr=target_rate, mono=True)
    sf.write(output_path, samples, target_rate)
|
|
|
|
|
|
|
|
|
# Application object; the startup hook and the route handlers below are
# registered on it through its decorators.
app = FastAPI(
    title="Arabic ASR API",
    description="ASR API with NeMo and Arabic/English digit conversion",
)
|
|
|
|
|
|
@app.on_event("startup")
def load_model():
    """Restore the NeMo ASR checkpoint once, when the application starts.

    The restored model is stored in the module-level ``asr_model`` global,
    which the transcription handlers read.  The checkpoint location defaults
    to the original hard-coded path and can be overridden with the
    ``ASR_MODEL_PATH`` environment variable, so the service is not tied to
    one developer machine.
    """
    global asr_model

    model_path = os.environ.get(
        "ASR_MODEL_PATH",
        "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/stt_ar_fastconformer_hybrid_large_pc_v1.0.nemo",
    )

    # NOTE(review): the checkpoint file name says "hybrid" while it is
    # restored through EncDecCTCModel -- confirm this is the intended class.
    asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
|
|
|
|
|
|
@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and convert number words to digits.

    The upload is written to a temporary file, converted with
    ``ensure_16k_wav``, transcribed with the global ``asr_model`` and then
    passed through ``normalize_arabic``, ``replace_arabic_numbers`` and
    ``join_digit_sequences``.

    Returns:
        JSONResponse whose body is ``{"transcription": <processed text>}``.
    """
    # Read the upload before creating the temp file so a failed read cannot
    # leave an orphaned file behind.
    payload = await file.read()
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(payload)
        tmp_path = tmp.name

    # Swap only the trailing suffix: str.replace would also rewrite a ".wav"
    # that happens to occur earlier in the path (e.g. in a directory name).
    resampled_path = tmp_path[:-len(".wav")] + "_16k.wav"

    try:
        # Resampling is inside the try block so that an undecodable upload
        # still has its temporary file removed by the finally clause.
        ensure_16k_wav(tmp_path, resampled_path)

        result = asr_model.transcribe([resampled_path])
        raw_text = result[0].text
        print(raw_text)

        raw_text = normalize_arabic(raw_text)
        cleaned_text = replace_arabic_numbers(raw_text)
        cleaned_text = join_digit_sequences(cleaned_text)

        return JSONResponse(content={"transcription": cleaned_text})
    finally:
        # The resampled file may not exist if conversion failed part-way.
        for path in (tmp_path, resampled_path):
            if os.path.exists(path):
                os.remove(path)
|
|
|
|
|
|
@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe raw audio bytes and convert number words to digits.

    The bytes are written to a temporary file, converted with
    ``ensure_16k_wav``, transcribed with the global ``asr_model`` and then
    passed through ``normalize_arabic``, ``replace_arabic_numbers`` and
    ``join_digit_sequences``.

    Returns:
        JSONResponse whose body is ``{"transcription": <processed text>}``.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        tmp.write(audio_bytes)
        tmp_path = tmp.name

    # Swap only the trailing suffix: str.replace would also rewrite a ".wav"
    # that happens to occur earlier in the path (e.g. in a directory name).
    resampled_path = tmp_path[:-len(".wav")] + "_16k.wav"

    try:
        # Resampling is inside the try block so that undecodable input still
        # has its temporary file removed by the finally clause.
        ensure_16k_wav(tmp_path, resampled_path)

        result = asr_model.transcribe([resampled_path])
        raw_text = result[0].text

        raw_text = normalize_arabic(raw_text)
        cleaned_text = replace_arabic_numbers(raw_text)
        cleaned_text = join_digit_sequences(cleaned_text)

        return JSONResponse(content={"transcription": cleaned_text})
    finally:
        # The resampled file may not exist if conversion failed part-way.
        for path in (tmp_path, resampled_path):
            if os.path.exists(path):
                os.remove(path)
|
|
|
|
|
|
if __name__ == "__main__":
    # uvicorn supports reload=True only when the application is passed as an
    # import string ("module:app"); given the app object it logs a warning
    # and exits instead of serving.  Run without reload here, and use
    # `uvicorn <module>:app --reload` from the command line for auto-reload.
    uvicorn.run(app, host="0.0.0.0", port=8000)
|
|
|
|