Ambulance_App_ASR_Detection / app_api_2.py

Upload folder using huggingface_hub

9a199b4 verified 4 months ago

17.4 kB

	# from fastapi import FastAPI, File, UploadFile
	# from fastapi.responses import JSONResponse
	# import uvicorn
	# import tempfile
	# import nemo.collections.asr as nemo_asr
	# import re
	# import os
	# import librosa
	# import soundfile as sf

	# # ===== Arabic number mapping (expanded) =====
	# arabic_numbers = {
	# "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0",
	# "واحد": "1", "واحدة": "1", "١": "1",
	# "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
	# "تلاتة": "3", "ثلاثة": "3", "٣": "3","ثلاث": "3","تلات": "3",
	# "اربعة": "4", "أربعة": "4", "٤": "4",
	# "خمسة": "5", "٥": "5","خمسه": "5",
	# "ستة": "6", "٦": "6",
	# "سبعة": "7", "٧": "7","سبعه": "7",
	# "تمانية": "8", "ثمانية": "8", "٨": "8",
	# "تسعة": "9", "٩": "9",
	# "عشرة": "10", "١٠": "10","عشره": "10",
	# "حداشر": "11", "احد عشر": "11", "احداشر": "11",
	# "اتناشر": "12", "اثنا عشر": "12",
	# "تلتاشر": "13", "ثلاثة عشر": "13",
	# "اربعتاشر": "14", "أربعة عشر": "14",
	# "خمستاشر": "15", "خمسة عشر": "15",
	# "ستاشر": "16", "ستة عشر": "16",
	# "سبعتاشر": "17", "سبعة عشر": "17",
	# "طمنتاشر": "18", "ثمانية عشر": "18",
	# "تسعتاشر": "19", "تسعة عشر": "19",
	# "عشرين": "20", "٢٠": "20",
	# "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
	# "اربعين": "40", "أربعين": "40", "٤٠": "40",
	# "خمسين": "50", "٥٠": "50",
	# "ستين": "60", "٦٠": "60",
	# "سبعين": "70", "٧٠": "70",
	# "تمانين": "80", "ثمانين": "80", "٨٠": "80",
	# "تسعين": "90", "٩٠": "90",
	# "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
	# "ميتين": "200", "مائتين": "200",
	# "تلاتمية": "300", "ثلاثمائة": "300",
	# "اربعمية": "400", "أربعمائة": "400",
	# "خمسمية": "500", "خمسمائة": "500",
	# "ستمية": "600", "ستمائة": "600",
	# "سبعمية": "700", "سبعمائة": "700",
	# "تمانمية": "800", "ثمانمائة": "800",
	# "تسعمية": "900", "تسعمائة": "900",
	# "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
	# "ألفين": "2000", "الفين": "2000",
	# "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
	# "اربعة آلاف": "4000", "أربعة آلاف": "4000",
	# "خمسة آلاف": "5000",
	# "ستة آلاف": "6000",
	# "سبعة آلاف": "7000",
	# "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
	# "تسعة آلاف": "9000",
	# "عشرة آلاف": "10000",
	# "مية ألف": "100000", "مائة ألف": "100000",
	# "مليون": "1000000", "ملايين": "1000000",
	# "مليار": "1000000000"
	# }

	# # ===== Helpers =====
	# def normalize_arabic(text: str) -> str:
	# diacritics = re.compile(r'[\u0617-\u061A\u064B-\u0652]')
	# text = re.sub(diacritics, '', text)
	# text = re.sub(r'[إأآا]', 'ا', text)
	# text = re.sub(r'ى', 'ي', text)
	# text = re.sub(r'ؤ', 'و', text)
	# text = re.sub(r'ئ', 'ي', text)
	# text = re.sub(r'ة', 'ه', text)
	# return text

	# def replace_arabic_numbers(text: str) -> str:
	# for word, digit in arabic_numbers.items():
	# text = re.sub(fr"(?:^\|\s){word}(?:$\|\s)", f" {digit} ", text)
	# return " ".join(text.split())

	# def join_digit_sequences(text: str) -> str:
	# tokens = text.split()
	# out, buffer = [], []
	# for tok in tokens:
	# if tok.isdigit() and len(tok) == 1:
	# buffer.append(tok)
	# else:
	# if buffer:
	# out.append("".join(buffer))
	# buffer = []
	# out.append(tok)
	# if buffer:
	# out.append("".join(buffer))
	# return " ".join(out)

	# def ensure_16k_wav(input_path, output_path):
	# y, sr = librosa.load(input_path, sr=16000, mono=True)
	# sf.write(output_path, y, 16000)

	# # ===== FastAPI app =====
	# app = FastAPI(title="Arabic ASR API", description="ASR API with NeMo and Arabic digit conversion")

	# @app.on_event("startup")
	# def load_model():
	# global asr_model
	# model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/asr-egyptian-nemo-v2.0.nemo"
	# asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)

	# @app.post("/transcribe")
	# async def transcribe_audio(file: UploadFile = File(...)):
	# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
	# tmp.write(await file.read())
	# tmp_path = tmp.name

	# # Resample to 16kHz
	# resampled_path = tmp_path.replace(".wav", "_16k.wav")
	# ensure_16k_wav(tmp_path, resampled_path)

	# try:
	# result = asr_model.transcribe([resampled_path])
	# raw_text = result[0].text

	# raw_text = normalize_arabic(raw_text)
	# cleaned_text = replace_arabic_numbers(raw_text)
	# cleaned_text = join_digit_sequences(cleaned_text)

	# return JSONResponse(content={"transcription": cleaned_text})

	# finally:
	# os.remove(tmp_path)
	# if os.path.exists(resampled_path):
	# os.remove(resampled_path)

	# @app.post("/transcribe-bytes")
	# async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
	# with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
	# tmp.write(audio_bytes)
	# tmp_path = tmp.name

	# resampled_path = tmp_path.replace(".wav", "_16k.wav")
	# ensure_16k_wav(tmp_path, resampled_path)

	# try:
	# result = asr_model.transcribe([resampled_path])
	# raw_text = result[0].text

	# raw_text = normalize_arabic(raw_text)
	# cleaned_text = replace_arabic_numbers(raw_text)
	# cleaned_text = join_digit_sequences(cleaned_text)

	# return JSONResponse(content={"transcription": cleaned_text})

	# finally:
	# os.remove(tmp_path)
	# if os.path.exists(resampled_path):
	# os.remove(resampled_path)

	# if __name__ == "__main__":
	# uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)
	from fastapi import FastAPI, File, UploadFile
	from fastapi.responses import JSONResponse
	import uvicorn
	import tempfile
	import nemo.collections.asr as nemo_asr
	import re
	import os
	import librosa
	import soundfile as sf

	# ===== Arabic + English number mapping (expanded) =====
	# Maps spoken number words (Modern Standard Arabic, Egyptian dialect and
	# English, including common ASR mis-hearings) and Arabic-Indic digit strings
	# to ASCII digit strings.
	#
	# Every key is listed exactly once.  A dict literal silently keeps only the
	# last value of a repeated key, so repeats add nothing and hide conflicts
	# (e.g. "ثماني" used to be listed as both "80" and "8"; "8" was the value
	# that actually took effect and is the one kept here).  Keys stay in the
	# order of their first appearance so iteration order is unchanged.
	arabic_numbers = {
	    # Arabic digits
	    "صفر": "0", "زيرو": "0", "٠": "0", "زيو": "0", "زير": "0", "زينوا": "0",
	    "واحد": "1", "واحدة": "1", "١": "1",
	    "اتنين": "2", "اثنين": "2", "اثنان": "2", "٢": "2",
	    "تلاتة": "3", "ثلاثة": "3", "٣": "3", "ثلاث": "3", "تلات": "3",
	    "اربعة": "4", "أربعة": "4", "٤": "4",
	    "خمسة": "5", "٥": "5", "خمسه": "5",
	    "ستة": "6", "٦": "6",
	    "سبعة": "7", "٧": "7", "سبعه": "7",
	    "تمانية": "8", "ثمانية": "8", "٨": "8",
	    "تسعة": "9", "٩": "9",
	    "عشرة": "10", "١٠": "10", "عشره": "10",
	    "حداشر": "11", "احد عشر": "11", "احداشر": "11",
	    "اتناشر": "12", "اثنا عشر": "12",
	    "تلتاشر": "13", "ثلاثة عشر": "13",
	    "اربعتاشر": "14", "أربعة عشر": "14",
	    "خمستاشر": "15", "خمسة عشر": "15",
	    "ستاشر": "16", "ستة عشر": "16",
	    "سبعتاشر": "17", "سبعة عشر": "17",
	    "طمنتاشر": "18", "ثمانية عشر": "18",
	    "تسعتاشر": "19", "تسعة عشر": "19",
	    "عشرين": "20", "٢٠": "20",
	    "تلاتين": "30", "ثلاثين": "30", "٣٠": "30",
	    "اربعين": "40", "أربعين": "40", "٤٠": "40",
	    "خمسين": "50", "٥٠": "50",
	    "ستين": "60", "٦٠": "60",
	    "سبعين": "70", "٧٠": "70",
	    "تمانين": "80", "ثمانين": "80", "٨٠": "80",
	    "تسعين": "90", "٩٠": "90",
	    "مية": "100", "مائة": "100", "مئة": "100", "١٠٠": "100",
	    "ميتين": "200", "مائتين": "200",
	    "تلاتمية": "300", "ثلاثمائة": "300",
	    "اربعمية": "400", "أربعمائة": "400",
	    "خمسمية": "500", "خمسمائة": "500",
	    "ستمية": "600", "ستمائة": "600",
	    "سبعمية": "700", "سبعمائة": "700",
	    "تمانمية": "800", "ثمانمائة": "800",
	    "تسعمية": "900", "تسعمائة": "900",
	    "ألف": "1000", "الف": "1000", "١٠٠٠": "1000",
	    "ألفين": "2000", "الفين": "2000",
	    "تلات تلاف": "3000", "ثلاثة آلاف": "3000",
	    "اربعة آلاف": "4000", "أربعة آلاف": "4000",
	    "خمسة آلاف": "5000",
	    "ستة آلاف": "6000",
	    "سبعة آلاف": "7000",
	    "تمانية آلاف": "8000", "ثمانية آلاف": "8000",
	    "تسعة آلاف": "9000",
	    "عشرة آلاف": "10000",
	    "مية ألف": "100000", "مائة ألف": "100000",
	    "مليون": "1000000", "ملايين": "1000000",
	    "مليار": "1000000000",

	    # English digits ("to"/"too"/"for" are homophones the recogniser may emit)
	    "zero": "0", "one": "1", "two": "2", "to": "2", "too": "2", "three": "3",
	    "four": "4", "for": "4", "five": "5", "six": "6", "seven": "7",
	    "eight": "8", "nine": "9", "ten": "10",
	    "eleven": "11", "twelve": "12", "thirteen": "13", "fourteen": "14",
	    "fifteen": "15", "sixteen": "16", "seventeen": "17",
	    "eighteen": "18", "nineteen": "19", "twenty": "20",
	    "thirty": "30", "forty": "40", "fifty": "50",
	    "sixty": "60", "seventy": "70", "eighty": "80", "ninety": "90",
	    "hundred": "100", "thousand": "1000", "million": "1000000",

	    # Arabic variants (spellings ending in heh instead of teh marbuta)
	    "تلاته": "3", "اربعه": "4", "سته": "6",
	    "تمانيه": "8", "ثماني": "8", "تسعه": "9",
	    "واحده": "1",
	    "تلاته عشر": "13", "اربعه عشر": "14",
	    "خمسه عشر": "15", "سته عشر": "16",
	    "سبعه عشر": "17", "ثمانيه عشر": "18",
	    "تسعه عشر": "19",

	    # English tricky forms
	    "oh": "0",
	    "double zero": "00",
	    "double one": "11",
	    "double two": "22",
	    "double three": "33",
	    "double four": "44",
	    "double five": "55",
	    "double six": "66",
	    "double seven": "77",
	    "double eight": "88",
	    "double nine": "99",
	    "nite": "9", "fiv": "5",

	    # Repeated digits
	    "واحد واحد": "11",
	    "اثنين اثنين": "22",
	    "اتنين اتنين": "22",
	    "ثلاثة ثلاثة": "33",
	    "تلاتة تلاتة": "33",
	    "أربعة أربعة": "44",
	    "اربعة اربعة": "44",
	    "خمسة خمسة": "55",
	    "ستة ستة": "66",
	    "سبعة سبعة": "77",
	    "ثمانية ثمانية": "88",
	    "تمانية تمانية": "88",
	    "تسعة تسعة": "99",

	    # Hundreds (spellings not already listed above)
	    "مئتين": "200",
	    "ثلاثمية": "300",
	    "تلتمية": "300",
	    "أربعمية": "400",

	    # Teens (spellings not already listed above)
	    "أحد عشر": "11",
	    "تلاتاشر": "13",
	    "خمسطاشر": "15",

	    # Tens (spellings not already listed above)
	    "ثامنين": "80",

	    # Mixed word + digits
	    "خمسة صفر": "50",
	    "ثلاثة صفر صفر": "300",
	    "تلاتة صفر صفر": "300",
	    "واحد صفر": "10",
	    "واحد صفر واحد": "101",
	    "واحد اثنين": "12",
	    "واحد اتنين": "12",
	    "واحد ثلاثة": "13",
	    "واحد تلاتة": "13",
	    "واحد خمسة": "15",
	    "عشرة عشرة": "1010",

	    # Remaining single-digit variants
	    "ست": "6",
	    "تمنية": "8",
	    "ثمان": "8",
	    "ثمانيّة": "8",
	}

	# ===== Helpers =====
	def normalize_arabic(text: str) -> str:
	    """Fold common Arabic spelling variants into a single canonical form.

	    Removes the diacritic marks in U+0617-U+061A and U+064B-U+0652, then
	    rewrites alef variants to bare alef, alef maqsura and hamza-on-yeh to
	    yeh, hamza-on-waw to waw, and teh marbuta to heh.

	    Args:
	        text: Text to normalise.

	    Returns:
	        The normalised text.
	    """
	    text = re.sub(r'[\u0617-\u061A\u064B-\u0652]', '', text)
	    for pattern, canonical in (
	        (r'[إأآا]', 'ا'),
	        (r'ى', 'ي'),
	        (r'ؤ', 'و'),
	        (r'ئ', 'ي'),
	        (r'ة', 'ه'),
	    ):
	        text = re.sub(pattern, canonical, text)
	    return text

	def replace_arabic_numbers(text: str, mapping=None) -> str:
	    """Replace whole number words/phrases in *text* with digit strings.

	    A phrase only matches when it is delimited by whitespace or the string
	    boundaries, so "one" inside "someone" is left alone.  Matching is
	    case-insensitive and any run of whitespace between the words of a
	    multi-word phrase is accepted.

	    Args:
	        text: Transcript to process.
	        mapping: Optional ``{phrase: digits}`` dict.  Defaults to the
	            module-level ``arabic_numbers`` table.

	    Returns:
	        The text with matches replaced and whitespace collapsed to single
	        spaces.
	    """
	    if mapping is None:
	        mapping = arabic_numbers

	    # normalize_arabic() rewrites letters such as teh marbuta and hamza-alef,
	    # so a key spelled with them cannot match text that was normalised first.
	    # Register the normalised spelling of every key as well; setdefault keeps
	    # explicitly listed keys authoritative when both spellings exist.
	    lookup = dict(mapping)
	    for phrase, digits in mapping.items():
	        lookup.setdefault(normalize_arabic(phrase), digits)

	    # Longest phrases first: otherwise a single word would be replaced
	    # before the multi-word phrase that contains it gets a chance to match.
	    ordered = sorted(lookup.items(), key=lambda item: len(item[0]), reverse=True)
	    for phrase, digits in ordered:
	        words = phrase.split()
	        if not words:
	            # An empty/blank key would match at every word gap.
	            continue
	        # Lookarounds test the boundaries without consuming the whitespace,
	        # so two adjacent occurrences sharing one space are both replaced.
	        pattern = r"(?<!\S)" + r"\s+".join(map(re.escape, words)) + r"(?!\S)"
	        # A callable replacement keeps the digits literal (no escape parsing).
	        text = re.sub(pattern, lambda _m, d=digits: d, text, flags=re.IGNORECASE)
	    return " ".join(text.split())

	def join_digit_sequences(text: str) -> str:
	    """Glue runs of adjacent single-digit tokens together.

	    The text is split on whitespace; consecutive tokens that are exactly
	    one digit character long are concatenated (``"8 5"`` -> ``"85"``).
	    Every other token, including multi-digit numbers, is kept as-is and
	    ends the current run.  Tokens are re-joined with single spaces.
	    """
	    merged = []
	    in_digit_run = False
	    for word in text.split():
	        is_single_digit = len(word) == 1 and word.isdigit()
	        if is_single_digit and in_digit_run:
	            # Extend the run that the previous token started or continued.
	            merged[-1] += word
	        else:
	            merged.append(word)
	        in_digit_run = is_single_digit
	    return " ".join(merged)

	def ensure_16k_wav(input_path, output_path):
	    """Write a mono, 16 kHz copy of the audio at *input_path* to *output_path*.

	    The audio is loaded through ``librosa.load`` with ``sr=16000`` and
	    ``mono=True`` and written back out with ``soundfile.write``.
	    """
	    target_rate = 16000
	    samples, _ = librosa.load(input_path, sr=target_rate, mono=True)
	    sf.write(output_path, samples, target_rate)

	# ===== FastAPI app =====
	# Application object that the route and startup decorators below attach to.
	app = FastAPI(
	    title="Arabic ASR API",
	    description="ASR API with NeMo and Arabic/English digit conversion",
	)

	@app.on_event("startup")
	def load_model():
	    """Restore the NeMo ASR checkpoint into the module-level ``asr_model``.

	    The checkpoint location is taken from the ``ASR_MODEL_PATH`` environment
	    variable when it is set; otherwise the previously hard-coded path is
	    used, so existing deployments behave as before.
	    """
	    global asr_model
	    default_model_path = "C:/Users/thegh/Python_Projects/Expertflow/UnderProgress/Peter_Projects/NP_Detection_Nvidia_conformer/stt_ar_fastconformer_hybrid_large_pc_v1.0.nemo"
	    model_path = os.environ.get("ASR_MODEL_PATH", default_model_path)
	    # NOTE(review): the file name says "hybrid" while the CTC-only class is
	    # used to restore it - confirm this is the intended model class.
	    asr_model = nemo_asr.models.EncDecCTCModel.restore_from(model_path)

	def _run_asr_on_bytes(audio_bytes: bytes) -> str:
	    """Transcribe raw audio bytes and return the model's unprocessed text.

	    The bytes are spooled to a temporary ``.wav`` file, resampled to 16 kHz
	    into a sibling ``*_16k.wav`` file and passed to ``asr_model``.  Both
	    files are removed afterwards, including when resampling or inference
	    raises.
	    """
	    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
	        tmp.write(audio_bytes)
	        tmp_path = tmp.name

	    # Derive the sibling name from the extension only; a plain
	    # str.replace(".wav", ...) would also rewrite ".wav" in a directory name.
	    resampled_path = os.path.splitext(tmp_path)[0] + "_16k.wav"

	    try:
	        # Resampling sits inside the try so an undecodable upload still gets
	        # its temporary file cleaned up.
	        ensure_16k_wav(tmp_path, resampled_path)
	        result = asr_model.transcribe([resampled_path])
	        return result[0].text
	    finally:
	        for path in (tmp_path, resampled_path):
	            if os.path.exists(path):
	                os.remove(path)

	def _clean_transcript(raw_text: str) -> str:
	    """Normalise a transcript, convert number words and join digit runs."""
	    normalized = normalize_arabic(raw_text)
	    with_digits = replace_arabic_numbers(normalized)
	    return join_digit_sequences(with_digits)

	# NOTE(review): both handlers call the ASR model synchronously from an
	# ``async def``; consider offloading if concurrent requests matter.

	@app.post("/transcribe")
	async def transcribe_audio(file: UploadFile = File(...)):
	    """Transcribe an uploaded audio file.

	    Returns a JSON body ``{"transcription": <cleaned text>}``.
	    """
	    raw_text = _run_asr_on_bytes(await file.read())
	    print(raw_text)
	    return JSONResponse(content={"transcription": _clean_transcript(raw_text)})

	@app.post("/transcribe-bytes")
	async def transcribe_audio_bytes(audio_bytes: bytes = File(...)):
	    """Transcribe audio supplied directly as a bytes form field.

	    Returns a JSON body ``{"transcription": <cleaned text>}``.
	    """
	    raw_text = _run_asr_on_bytes(audio_bytes)
	    return JSONResponse(content={"transcription": _clean_transcript(raw_text)})

	if __name__ == "__main__":
	uvicorn.run(app, host="0.0.0.0", port=8000, reload=True)