# alaatiger989's picture
# Add files using upload-large-folder tool
# b5e57ee verified
# from fastapi import FastAPI, File, UploadFile
# from fastapi.responses import JSONResponse
# import uvicorn
# import tempfile
# import nemo.collections.asr as nemo_asr
# import re
# import os
# import librosa
# import soundfile as sf
# # ===== Arabic number mapping (expanded) =====
# arabic_numbers = {
# "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0",
# "واحد": "1", "واحدة": "1", "١": "1",
# "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
# "تلاتة": "3", "ثلاثة": "3", "٣": "3","ثلاث": "3","تلات": "3",
# "اربعة": "4", "أربعة": "4", "٤": "4",
# "خمسة": "5", "٥": "5","خمسه": "5",
# "ستة": "6", "٦": "6",
# "سبعة": "7", "٧": "7","سبعه": "7",
# "تمانية": "8", "ثمانية": "8", "٨": "8",
# "تسعة": "9", "٩": "9",
# "عشرة": "10", "١٠": "10","عشره": "10",
# "حداشر": "11", "احد عشر": "11", "احداشر": "11",
# "اتناشر": "12", "اثنا عشر": "12",
# "تلتاشر": "13", "ثلاثة عشر": "13",
# "اربعتاشر": "14", "أربعة عشر": "14",
# "خمستاشر": "15", "خمسة عشر": "15",
# "ستاشر": "16", "ستة عشر": "16",
# "سبعتاشر": "17", "سبعة عشر": "17",
# "طمنتاشر": "18", "ثمانية عشر": "18",
# "تسعتاشر": "19", "تسعة عشر": "19",
# "عشرين": "20", "٢٠": "20",
# "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
# "اربعين": "40", "أربعين": "40", "٤٠": "40",
# "خمسين": "50", "٥٠": "50",
# "ستين": "60", "٦٠": "60",
# "سبعين": "70", "٧٠": "70",
# "تمانين": "80", "ثمانين": "80", "٨٠": "80",
# "تسعين": "90", "٩٠": "90",
# "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
# "ميتين": "200", "مائتين": "200",
# "تلاتمية": "300", "ثلاثمائة": "300",
# "اربعمية": "400", "أربعمائة": "400",
# "خمسمية": "500", "خمسمائة": "500",
# "ستمية": "600", "ستمائة": "600",
# "سبعمية": "700", "سبعمائة": "700",
# "تمانمية": "800", "ثمانمائة": "800",
# "تسعمية": "900", "تسعمائة": "900",
# "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
# "ألفين": "2000", "الفين": "2000",
# "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
# "اربعة آلاف": "4000", "أربعة آلاف": "4000",
# "خمسة آلاف": "5000",
# "ستة آلاف": "6000",
# "سبعة آلاف": "7000",
# "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
# "تسعة آلاف": "9000",
# "عشرة آلاف": "10000",
# "مية ألف": "100000", "مائة ألف": "100000",
# "مليون": "1000000", "ملايين": "1000000",
# "مليار": "1000000000"
# }
# # ===== Helpers =====
# def normalize_arabic(text: str) -> str:
# diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
# text = re.sub(diacritics, '', text)
# text = re.sub(r'[إأآا]', 'ا', text)
# text = re.sub(r'ى', 'ي', text)
# text = re.sub(r'ؤ', 'و', text)
# text = re.sub(r'ئ', 'ي', text)
# text = re.sub(r'ة', 'ه', text)
# return text
# def replace_arabic_numbers(text: str) -> str:
# for word, digit in arabic_numbers.items():
# text = re.sub(fr"(?:^|\s){word}(?:$|\s)", f" {digit} ", text)
# return " ".join(text.split())
# def join_digit_sequences(text: str) -> str:
# tokens = text.split()
# out, buffer = [], []
# for tok in tokens:
# if tok.isdigit() and len(tok) == 1:
# buffer.append(tok)
# else:
# if buffer:
# out.append("".join(buffer))
# buffer = []
# out.append(tok)
# if buffer:
# out.append("".join(buffer))
# return " ".join(out)
# def ensure_16k_wav(input_path, output_path):
# y, sr = librosa.load(input_path, sr=16000, mono=True)
# sf.write(output_path, y, 16000)
# # ===== FastAPI app =====
# app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion")
# @app.on_event("startup")
# def load_model():
# global asr_model
# model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/asr-egyptian-nemo-v2.0.nemo"
# asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)
# @app.post("/transcribe")
# async def transcribe_audio(file: UploadFile = File(...)):
# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
# tmp.write(await file.read())
# tmp_path = tmp.name
# # Resample to 16kHz
# resampled_path = tmp_path.replace(".wav", "_16k.wav")
# ensure_16k_wav(tmp_path, resampled_path)
# try:
# result = asr_model.transcribe([resampled_path])
# raw_text = result[0].text
# raw_text = normalize_arabic(raw_text)
# cleaned_text = replace_arabic_numbers(raw_text)
# cleaned_text = join_digit_sequences(cleaned_text)
# return JSONResponse(content={"transcription": cleaned_text})
# finally:
# os.remove(tmp_path)
# if os.path.exists(resampled_path):
# os.remove(resampled_path)
# @app.post("/transcribe-bytes")
# async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
# tmp.write(audio_bytes)
# tmp_path = tmp.name
# resampled_path = tmp_path.replace(".wav", "_16k.wav")
# ensure_16k_wav(tmp_path, resampled_path)
# try:
# result = asr_model.transcribe([resampled_path])
# raw_text = result[0].text
# raw_text = normalize_arabic(raw_text)
# cleaned_text = replace_arabic_numbers(raw_text)
# cleaned_text = join_digit_sequences(cleaned_text)
# return JSONResponse(content={"transcription": cleaned_text})
# finally:
# os.remove(tmp_path)
# if os.path.exists(resampled_path):
# os.remove(resampled_path)
# if __name__ == "__main__":
# uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
import uvicorn
import tempfile
import nemo.collections.asr as nemo_asr
import re
import os
import librosa
import soundfile as sf
from omegaconf import OmegaConf
# ===== Arabic digit words / Arabic-Indic numerals -> ASCII digits (0-9) =====
# Lookup table from spoken Arabic digit words (formal and colloquial
# spellings) and Arabic-Indic numerals to ASCII digits "0"-"9".
#
# The transcription pipeline runs normalize_arabic() before consulting this
# table, which rewrites "ة" to "ه" and "أ" to "ا". Every spelling therefore
# also needs its normalised form listed, otherwise it can never match.
arabic_numbers = {
    "صفر": "0", "زيرو": "0", "زيو": "0", "زير": "0", "٠": "0",
    "واحد": "1", "واحدة": "1", "واحده": "1", "١": "1",
    "اثنين": "2", "اثنان": "2", "اتنين": "2", "٢": "2",
    "ثلاثة": "3", "ثلاث": "3", "تلاتة": "3", "تلات": "3", "ثلاثه": "3", "تلاته": "3",
    "٣": "3",  # was missing: the only Arabic-Indic numeral not covered
    "أربعة": "4", "اربعة": "4", "٤": "4", "أربعه": "4", "اربعه": "4",
    "خمسة": "5", "خمسه": "5", "٥": "5",
    "ستة": "6", "ست": "6", "٦": "6", "سته": "6",
    "سبعة": "7", "سبعه": "7", "٧": "7",
    "ثمانية": "8", "تمانية": "8", "تمنية": "8", "ثمان": "8", "٨": "8", "تمانيه": "8",
    "ثمانيه": "8", "تمنيه": "8",  # normalised forms of "ثمانية" / "تمنية"
    "تسعة": "9", "تسعه": "9", "٩": "9",
}
# ===== Helpers =====
def normalize_arabic(text: str) -> str:
    """Return *text* with diacritics stripped and letter variants unified.

    The substitutions are applied in order:

    * characters in U+0617-U+061A and U+064B-U+0652 are removed;
    * the alef variants "إ", "أ", "آ" become bare "ا";
    * "ى" becomes "ي";
    * "ؤ" becomes "و";
    * "ئ" becomes "ي";
    * "ة" becomes "ه".
    """
    substitutions = (
        (r'[\u0617-\u061A\u064B-\u0652]', ''),
        (r'[إأآا]', 'ا'),
        (r'ى', 'ي'),
        (r'ؤ', 'و'),
        (r'ئ', 'ي'),
        (r'ة', 'ه'),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def replace_arabic_numbers(text: str) -> str:
    """Swap every whole-word key of ``arabic_numbers`` in *text* for its digit.

    Keys are tried one after another in the table's iteration order; each is
    escaped and wrapped in ``\\b`` word boundaries so that it is only replaced
    when it is not part of a longer word.
    """
    converted = text
    for spoken, numeral in arabic_numbers.items():
        whole_word = re.compile(rf'\b{re.escape(spoken)}\b')
        converted = whole_word.sub(numeral, converted)
    return converted
def join_digit_sequences(text: str) -> str:
    """Glue runs of adjacent all-digit tokens in *text* into single tokens.

    The text is split on whitespace; neighbouring tokens for which
    ``str.isdigit()`` is true are concatenated without a separator, and all
    resulting tokens are re-joined with single spaces.
    """
    merged = []
    previous_was_digit = False
    for token in text.split():
        is_digit = token.isdigit()
        if is_digit and previous_was_digit:
            # Extend the digit run that is already the last output token.
            merged[-1] += token
        else:
            merged.append(token)
        previous_was_digit = is_digit
    return " ".join(merged)
def ensure_16k_wav(input_path, output_path, *, sample_rate: int = 16000):
    """Re-encode the audio at *input_path* as a mono file at *sample_rate*.

    Args:
        input_path: Path of the audio file to read.
        output_path: Path the converted audio is written to.
        sample_rate: Target sampling rate in Hz. Defaults to 16000, the value
            this function previously hard-coded in two places.
    """
    # librosa returns (samples, rate); the rate equals the requested
    # sample_rate, so only the samples are needed.
    samples, _ = librosa.load(input_path, sr=sample_rate, mono=True)
    sf.write(output_path, samples, sample_rate)
# ===== FastAPI app =====
# Application object; the startup hook and both endpoints register on it.
app = FastAPI(
    title="Arabic ASR API",
    description="ASR API with NeMo and Arabic/English digit conversion",
)
@app.on_event("startup")
def load_model():
    """Restore the fine-tuned NeMo model and reconfigure its decoding.

    Runs once on application startup. The restored model is stored in the
    module-level ``asr_model`` global, which the endpoints read. After
    loading, the model's decoding config is edited in place and applied with
    ``change_decoding_strategy``.
    """
    global asr_model
    model_path = "output_finetuned/finetuned_model_best.nemo"
    asr_model = nemo_asr.models.EncDecHybridRNNTCTCModel.restore_from(model_path)

    # Dump the greedy sub-config so the keys that actually exist show up in
    # the startup log.
    print("Available greedy parameters:")
    print(OmegaConf.to_yaml(asr_model.cfg.decoding.greedy))

    # ===== STEP 3: Configure for LITERAL transcription =====
    print("🔍 Configuring greedy decoding for literal output...")

    # Turn struct mode off temporarily so keys can be assigned below.
    OmegaConf.set_struct(asr_model.cfg.decoding, False)
    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, False)

    decoding_cfg = asr_model.cfg.decoding
    # NOTE(review): the log line above says "greedy" and most keys below tune
    # the greedy sub-config, yet the selected strategy is "maes" -- confirm
    # which decoding strategy is actually intended.
    decoding_cfg.strategy = "maes"

    # Best-effort: failing to set this key must not abort startup. Only
    # ordinary errors are swallowed; a bare ``except:`` would also have
    # trapped KeyboardInterrupt / SystemExit.
    try:
        decoding_cfg.greedy.max_symbols_per_step = 300
        print(f"✓ max_symbols_per_step: {decoding_cfg.greedy.max_symbols_per_step}")
    except Exception:
        print("⚠ Could not set max_symbols_per_step")

    decoding_cfg.greedy.max_symbols = 500
    decoding_cfg.greedy.loop_labels = True
    decoding_cfg.greedy.preserve_alignments = True
    decoding_cfg.preserve_alignments = True
    decoding_cfg.compute_timestamps = True
    # NOTE(review): presumably NeMo reads these keys for the chosen strategy;
    # `temperature` and `beam.search_type` in particular should be checked
    # against the installed NeMo version -- TODO confirm.
    decoding_cfg.temperature = 1.3
    decoding_cfg.beam.beam_size = 64
    decoding_cfg.beam.softmax_temperature = 1.3
    decoding_cfg.beam.search_type = "beam"

    print(f"✓ max_symbols: {decoding_cfg.greedy.max_symbols}")
    print(f"✓ loop_labels: {decoding_cfg.greedy.loop_labels}")
    print(f"✓ temperature: {decoding_cfg.temperature}")

    # Re-enable struct mode now that the edits are done.
    OmegaConf.set_struct(asr_model.cfg.decoding, True)
    OmegaConf.set_struct(asr_model.cfg.decoding.greedy, True)

    # Apply configuration
    asr_model.change_decoding_strategy(decoding_cfg)
def _extract_transcript_text(result) -> str:
    """Pull the first transcript string out of an ``asr_model.transcribe`` result.

    Both endpoints previously unpacked the result differently; this accepts
    every shape either of them handled: a tuple (its first element is used),
    a list whose first item is a string, a nested list, or an object exposing
    a ``text`` attribute. Anything else is converted with ``str()``. An empty
    list yields an empty string instead of an ``IndexError``.
    """
    if isinstance(result, tuple):
        result = result[0]
    if not isinstance(result, list):
        return str(result)
    if not result:
        return ""
    first = result[0]
    if isinstance(first, list):
        # Nested list: descend one level, then apply the same checks below.
        first = first[0] if first else ""
    if isinstance(first, str):
        return first
    if hasattr(first, "text"):
        return first.text
    return str(first)


def _transcribe_upload(audio_bytes: bytes) -> str:
    """Transcribe raw audio bytes and return the post-processed text.

    The bytes are written to a temporary ``.wav`` file, converted with
    ``ensure_16k_wav``, transcribed with the global ``asr_model``, then passed
    through ``normalize_arabic``, ``replace_arabic_numbers`` and
    ``join_digit_sequences``. Both temporary files are removed even when any
    step (including the conversion) raises.
    """
    tmp_path = None
    resampled_path = None
    try:
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
            # Record the name first so cleanup still happens if write() fails.
            tmp_path = tmp.name
            tmp.write(audio_bytes)

        # Build the sibling path from the extension only; str.replace() would
        # rewrite every ".wav" occurring anywhere in the path.
        root, ext = os.path.splitext(tmp_path)
        resampled_path = f"{root}_16k{ext}"
        ensure_16k_wav(tmp_path, resampled_path)

        result = asr_model.transcribe([resampled_path])
        raw_text = normalize_arabic(_extract_transcript_text(result))
        return join_digit_sequences(replace_arabic_numbers(raw_text))
    finally:
        for path in (tmp_path, resampled_path):
            if path and os.path.exists(path):
                os.remove(path)


@app.post("/transcribe")
async def transcribe_audio(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file.

    Returns a JSON object ``{"transcription": <text>}`` where Arabic digit
    words have been converted to digits and adjacent digits merged.
    """
    # NOTE(review): the pipeline is synchronous and runs inside this
    # coroutine, exactly as before this refactor.
    cleaned_text = _transcribe_upload(await file.read())
    print("📝 Cleaned Transcription:", cleaned_text)  # for debug
    return JSONResponse(content={"transcription": cleaned_text})


@app.post("/transcribe-bytes")
async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
    """Transcribe audio supplied directly as bytes.

    Runs the same pipeline as ``/transcribe`` and returns
    ``{"transcription": <text>}``.
    """
    cleaned_text = _transcribe_upload(audio_bytes)
    return JSONResponse(content={"transcription": cleaned_text})
if __name__ == "__main__":
    # uvicorn only honours reload=True when the application is given as an
    # import string ("module:app"); with an app object it logs a warning and
    # exits, so the server never started. Serve the object directly here; for
    # auto-reload during development run `uvicorn <module>:app --reload`.
    uvicorn.run(app, host="0.0.0.0", port=8000)