sae8d committed on
Commit
36d4c40
·
verified ·
1 Parent(s): 4495edf

Upload main.py

Browse files
Files changed (1) hide show
  1. main.py +359 -343
main.py CHANGED
@@ -1,344 +1,360 @@
1
- from fastapi import FastAPI, UploadFile, File, HTTPException
2
- from fastapi.middleware.cors import CORSMiddleware
3
- from fastapi.responses import JSONResponse
4
- import torch
5
- from transformers import pipeline
6
- import json
7
- import os
8
- from difflib import SequenceMatcher
9
- from typing import Dict, Any, Optional
10
- import tempfile
11
- import subprocess
12
- import shutil
13
-
14
- app = FastAPI(
15
- title="Bayan AI بيان",
16
- description="",
17
- version="1.0.0"
18
- )
19
-
20
- app.add_middleware(
21
- CORSMiddleware,
22
- allow_origins=["*"], # Allow all origins for local development
23
- allow_credentials=True,
24
- allow_methods=["*"],
25
- allow_headers=["*"],
26
- )
27
-
28
- # CPU only on free tier
29
- device = -1
30
-
31
- # Load Whisper pipeline (model downloads/caches automatically on first run)
32
- pipe = pipeline(
33
- "automatic-speech-recognition",
34
- model="tarteel-ai/whisper-tiny-ar-quran",
35
- device=device,
36
- )
37
-
38
- # Standard Surah names (1–114)
39
- surah_names = {
40
- 1: "Al-Fatiha (الفاتحة)",
41
- 2: "Al-Baqarah (البقرة)",
42
- 3: "Aal-E-Imran (آل عمران)",
43
- 4: "An-Nisa (النساء)",
44
- 5: "Al-Maidah (المائدة)",
45
- 6: "Al-An'am (الأنعام)",
46
- 7: "Al-A'raf (الأعراف)",
47
- 8: "Al-Anfal (الأنفال)",
48
- 9: "At-Tawbah (التوبة)",
49
- 10: "Yunus (يونس)",
50
- 11: "Hud (هود)",
51
- 12: "Yusuf (يوسف)",
52
- 13: "Ar-Ra'd (الرعد)",
53
- 14: "Ibrahim (إبراهيم)",
54
- 15: "Al-Hijr (الحجر)",
55
- 16: "An-Nahl (النحل)",
56
- 17: "Al-Isra (الإسراء)",
57
- 18: "Al-Kahf (الكهف)",
58
- 19: "Maryam (مريم)",
59
- 20: "Ta-Ha (طه)",
60
- 21: "Al-Anbiya (الأنبياء)",
61
- 22: "Al-Hajj (الحج)",
62
- 23: "Al-Mu'minun (المؤمنون)",
63
- 24: "An-Nur (النور)",
64
- 25: "Al-Furqan (الفرقان)",
65
- 26: "Ash-Shu'ara (الشعراء)",
66
- 27: "An-Naml (النمل)",
67
- 28: "Al-Qasas (القصص)",
68
- 29: "Al-Ankabut (العنكبوت)",
69
- 30: "Ar-Rum (الروم)",
70
- 31: "Luqman (لقمان)",
71
- 32: "As-Sajdah (السجدة)",
72
- 33: "Al-Ahzab (الأحزاب)",
73
- 34: "Saba (سبأ)",
74
- 35: "Fatir (فاطر)",
75
- 36: "Ya-Sin (يس)",
76
- 37: "As-Saffat (الصافات)",
77
- 38: "Sad (ص)",
78
- 39: "Az-Zumar (الزمر)",
79
- 40: "Ghafir (غافر)",
80
- 41: "Fussilat (فصلت)",
81
- 42: "Ash-Shura (الشورى)",
82
- 43: "Az-Zukhruf (الزخرف)",
83
- 44: "Ad-Dukhkhan (الدخان)",
84
- 45: "Al-Jathiya (الجاثية)",
85
- 46: "Al-Ahqaf (الأحقاف)",
86
- 47: "Muhammad (محمد)",
87
- 48: "Al-Fath (الفتح)",
88
- 49: "Al-Hujurat (الحجرات)",
89
- 50: "Qaf (ق)",
90
- 51: "Adh-Dhariyat (الذاريات)",
91
- 52: "At-Tur (الطور)",
92
- 53: "An-Najm (النجم)",
93
- 54: "Al-Qamar (القمر)",
94
- 55: "Ar-Rahman (الرحمن)",
95
- 56: "Al-Waqi'ah (الواقعة)",
96
- 57: "Al-Hadid (الحديد)",
97
- 58: "Al-Mujadila (المجادلة)",
98
- 59: "Al-Hashr (الحشر)",
99
- 60: "Al-Mumtahina (الممتحنة)",
100
- 61: "As-Saff (الصف)",
101
- 62: "Al-Jumu'ah (الجمعة)",
102
- 63: "Al-Munafiqoon (المنافقون)",
103
- 64: "At-Taghabun (التغابن)",
104
- 65: "At-Talaq (الطلاق)",
105
- 66: "At-Tahrim (التحريم)",
106
- 67: "Al-Mulk (الملك)",
107
- 68: "Al-Qalam (القلم)",
108
- 69: "Al-Haqqah (الحاقة)",
109
- 70: "Al-Ma'arij (المعارج)",
110
- 71: "Nooh (نوح)",
111
- 72: "Al-Jinn (الجن)",
112
- 73: "Al-Muzzammil (المزمل)",
113
- 74: "Al-Muddathir (المدثر)",
114
- 75: "Al-Qiyamah (القيامة)",
115
- 76: "Al-Insan (الإنسان)",
116
- 77: "Al-Mursalat (المرسلات)",
117
- 78: "An-Naba (النبأ)",
118
- 79: "An-Nazi'at (النازعات)",
119
- 80: "Abasa (عبس)",
120
- 81: "At-Takwir (التكوير)",
121
- 82: "Al-Infitar (الإنفطار)",
122
- 83: "Al-Mutaffifin (المطففين)",
123
- 84: "Al-Inshiqaq (الإنشقاق)",
124
- 85: "Al-Buruj (البروج)",
125
- 86: "At-Tariq (الطارق)",
126
- 87: "Al-A'la (الأعلى)",
127
- 88: "Al-Ghashiyah (الغاشية)",
128
- 89: "Al-Fajr (الفجر)",
129
- 90: "Al-Balad (البلد)",
130
- 91: "Ash-Shams (الشمس)",
131
- 92: "Al-Lail (الليل)",
132
- 93: "Ad-Duha (الضحى)",
133
- 94: "Ash-Sharh (الشرح)",
134
- 95: "At-Tin (التين)",
135
- 96: "Al-Alaq (العلق)",
136
- 97: "Al-Qadr (القدر)",
137
- 98: "Al-Bayyina (البينة)",
138
- 99: "Az-Zalzalah (الزلزلة)",
139
- 100: "Al-Adiyat (العاديات)",
140
- 101: "Al-Qari'ah (القارعة)",
141
- 102: "At-Takathur (التكاثر)",
142
- 103: "Al-Asr (العصر)",
143
- 104: "Al-Humazah (الهمزة)",
144
- 105: "Al-Fil (الفيل)",
145
- 106: "Quraish (قريش)",
146
- 107: "Al-Ma'un (الماعون)",
147
- 108: "Al-Kawthar (الكوثر)",
148
- 109: "Al-Kafirun (الكافرون)",
149
- 110: "An-Nasr (النصر)",
150
- 111: "Al-Masad (المسد)",
151
- 112: "Al-Ikhlas (الإخلاص)",
152
- 113: "Al-Falaq (الفلق)",
153
- 114: "An-Nas (الناس)",
154
- }
155
-
156
- import re
157
-
158
- def normalize_text(text: str) -> str:
159
- """Robust normalization for Arabic text."""
160
- text = re.sub(r"[إأآاٱ]", "ا", text)
161
- text = re.sub(r"ى", "ي", text)
162
- text = re.sub(r"ؤ", "ء", text)
163
- text = re.sub(r"ئ", "ء", text)
164
- text = re.sub(r"g", "ة", text)
165
- text = re.sub(r"ة", "ه", text)
166
- text = re.sub(r"[\u064B-\u065F\u0670]", "", text) # Tashkeel
167
- text = re.sub(r"[\u06D6-\u06ED]", "", text)
168
- text = re.sub(r"ء", "", text) # Remove Hamza to handle varying forms
169
- return " ".join(text.strip().split())
170
-
171
- # Pre-load all verses at startup
172
- all_verses = []
173
-
174
- surahs_dir = "surahs_json_files"
175
- if not os.path.isdir(surahs_dir):
176
- raise FileNotFoundError("Missing 'surahs_json_files/' folder.")
177
-
178
- for filename in sorted(os.listdir(surahs_dir)):
179
- if filename.endswith(".json"):
180
- try:
181
- surah_number = int(filename.split("_")[0])
182
- except:
183
- continue
184
- surah_name = surah_names.get(surah_number, f"Surah {surah_number}")
185
- file_path = os.path.join(surahs_dir, filename)
186
-
187
- with open(file_path, "r", encoding="utf-8") as f:
188
- data = json.load(f)
189
-
190
- verses = [ayah["text"] for ayah in data.get("ayahs", []) if "text" in ayah]
191
-
192
- for ayah_number, verse_text in enumerate(verses, start=1):
193
- verse_norm = normalize_text(verse_text)
194
- all_verses.append({
195
- "surah_number": surah_number,
196
- "surah_name": surah_name,
197
- "ayah_number": ayah_number,
198
- "verse_text": verse_text,
199
- "verse_norm": verse_norm
200
- })
201
-
202
- print(f"Loaded {len(all_verses)} verses from {len(os.listdir(surahs_dir))} surahs.")
203
-
204
- def find_best_verse(transcription: str) -> Dict[str, Any]:
205
- transcription_norm = normalize_text(transcription)
206
- if not transcription_norm:
207
- return {"error": "Empty transcription"}
208
-
209
- candidates = []
210
-
211
- # Pre-compile regex for whole word check
212
- pattern_str = r'(?:^|\s)' + re.escape(transcription_norm) + r'(?:\s|$)'
213
- whole_word_regex = re.compile(pattern_str)
214
-
215
- for verse in all_verses:
216
- verse_norm = verse["verse_norm"]
217
-
218
- is_whole_word = False
219
- containment = 0.0
220
- ratio = 0.0
221
-
222
- # Fast substring check
223
- if transcription_norm in verse_norm:
224
- containment = 1.0
225
- matcher = SequenceMatcher(None, transcription_norm, verse_norm)
226
- ratio = matcher.ratio()
227
-
228
- # Check for whole word match
229
- if whole_word_regex.search(verse_norm):
230
- is_whole_word = True
231
- else:
232
- matcher = SequenceMatcher(None, transcription_norm, verse_norm)
233
- match = matcher.find_longest_match(0, len(transcription_norm), 0, len(verse_norm))
234
- containment = match.size / len(transcription_norm) if len(transcription_norm) > 0 else 0
235
- ratio = matcher.ratio()
236
-
237
- candidates.append({
238
- "verse": verse,
239
- "containment": containment,
240
- "ratio": ratio,
241
- "is_whole_word": is_whole_word
242
- })
243
-
244
- # Sort by whole_word (desc), containment (desc), ratio (desc)
245
- candidates.sort(key=lambda x: (x["is_whole_word"], x["containment"], x["ratio"]), reverse=True)
246
-
247
- # If we have whole word matches, ignore partial matches
248
- if candidates and candidates[0]["is_whole_word"]:
249
- candidates = [c for c in candidates if c["is_whole_word"]]
250
-
251
- # Filter strong matches (>= 80% containment)
252
- strong_matches = [c for c in candidates if c["containment"] >= 0.8]
253
-
254
- def format_match(candidate):
255
- verse_data = candidate["verse"]
256
- return {
257
- "surah_number": verse_data["surah_number"],
258
- "surah_name": verse_data["surah_name"],
259
- "ayah_number": verse_data["ayah_number"],
260
- "verse_text": verse_data["verse_text"],
261
- "similarity_score": round(candidate["containment"], 4)
262
- }
263
-
264
- if not strong_matches:
265
- # No strong matches found
266
- if candidates:
267
- top_match = candidates[0]
268
- return {
269
- "error": "No confident match found",
270
- "best_similarity": round(top_match["containment"], 4),
271
- "possible_match": format_match(top_match)
272
- }
273
- else:
274
- return {"error": "No matches found"}
275
-
276
- if len(strong_matches) > 1:
277
- # Multiple strong matches -> return top 5
278
- top_5 = strong_matches[:5]
279
- return {
280
- "matches": [format_match(m) for m in top_5]
281
- }
282
- else:
283
- # Single dominant match
284
- return format_match(strong_matches[0])
285
-
286
- @app.get("/")
287
- def root():
288
- return {"message": "Bayan AI بيان... LIVE!"}
289
-
290
- @app.post("/recognize")
291
- async def recognize(file: UploadFile = File(...)):
292
- # Allow both audio and video
293
- is_video = file.content_type and file.content_type.startswith("video/")
294
- is_audio = file.content_type and file.content_type.startswith("audio/")
295
-
296
- if not is_audio and not is_video:
297
- raise HTTPException(status_code=400, detail="File must be an audio or video file")
298
-
299
- # Save to temp file
300
- contents = await file.read()
301
- file_extension = os.path.splitext(file.filename)[1] or (".mp4" if is_video else ".wav")
302
-
303
- with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp:
304
- tmp.write(contents)
305
- input_path = tmp.name
306
-
307
- audio_path = input_path
308
- temp_audio_path = None
309
-
310
- try:
311
- if is_video:
312
- # Check if ffmpeg is installed
313
- if not shutil.which("ffmpeg"):
314
- raise HTTPException(status_code=500, detail="ffmpeg not found on server")
315
-
316
- temp_audio_path = input_path + "_converted.wav"
317
- # Extract audio quickly and silently
318
- # -vn: no video, -acodec pcm_s16le: wav format, -ar 16000: whisper preferred sample rate
319
- # -y: overwrite, -loglevel error: be silent
320
- cmd = [
321
- "ffmpeg", "-y", "-i", input_path,
322
- "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
323
- "-loglevel", "error",
324
- temp_audio_path
325
- ]
326
- subprocess.run(cmd, check=True)
327
- audio_path = temp_audio_path
328
-
329
- transcription = pipe(audio_path)["text"]
330
- except subprocess.CalledProcessError as e:
331
- raise HTTPException(status_code=500, detail=f"Video conversion error: {str(e)}")
332
- except Exception as e:
333
- raise HTTPException(status_code=500, detail=f"Transcription error: {str(e)}")
334
- finally:
335
- # Clean up all temp files
336
- if os.path.exists(input_path):
337
- os.unlink(input_path)
338
- if temp_audio_path and os.path.exists(temp_audio_path):
339
- os.unlink(temp_audio_path)
340
-
341
- result = find_best_verse(transcription)
342
- result["transcription"] = transcription
343
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
344
  return JSONResponse(content=result)
 
1
+ from fastapi import FastAPI, UploadFile, File, HTTPException
2
+ from fastapi.middleware.cors import CORSMiddleware
3
+ from fastapi.responses import JSONResponse
4
+ import torch
5
+ from transformers import pipeline
6
+ import json
7
+ import os
8
+ from difflib import SequenceMatcher
9
+ from typing import Dict, Any, Optional
10
+ import tempfile
11
+ import subprocess
12
+ import shutil
13
+
14
+ app = FastAPI(
15
+ title="Bayan AI بيان",
16
+ description="",
17
+ version="1.0.0"
18
+ )
19
+
20
+ app.add_middleware(
21
+ CORSMiddleware,
22
+ allow_origins=["*"], # Allow all origins for local development
23
+ allow_credentials=True,
24
+ allow_methods=["*"],
25
+ allow_headers=["*"],
26
+ )
27
+
28
+ # CPU only on free tier
29
+ device = -1
30
+
31
+ # Load Whisper pipeline (model downloads/caches automatically on first run)
32
+ pipe = pipeline(
33
+ "automatic-speech-recognition",
34
+ model="tarteel-ai/whisper-tiny-ar-quran",
35
+ device=device,
36
+ )
37
+
38
+ # Standard Surah names (1–114)
39
+ surah_names = {
40
+ 1: "Al-Fatiha (الفاتحة)",
41
+ 2: "Al-Baqarah (البقرة)",
42
+ 3: "Aal-E-Imran (آل عمران)",
43
+ 4: "An-Nisa (النساء)",
44
+ 5: "Al-Maidah (المائدة)",
45
+ 6: "Al-An'am (الأنعام)",
46
+ 7: "Al-A'raf (الأعراف)",
47
+ 8: "Al-Anfal (الأنفال)",
48
+ 9: "At-Tawbah (التوبة)",
49
+ 10: "Yunus (يونس)",
50
+ 11: "Hud (هود)",
51
+ 12: "Yusuf (يوسف)",
52
+ 13: "Ar-Ra'd (الرعد)",
53
+ 14: "Ibrahim (إبراهيم)",
54
+ 15: "Al-Hijr (الحجر)",
55
+ 16: "An-Nahl (النحل)",
56
+ 17: "Al-Isra (الإسراء)",
57
+ 18: "Al-Kahf (الكهف)",
58
+ 19: "Maryam (مريم)",
59
+ 20: "Ta-Ha (طه)",
60
+ 21: "Al-Anbiya (الأنبياء)",
61
+ 22: "Al-Hajj (الحج)",
62
+ 23: "Al-Mu'minun (المؤمنون)",
63
+ 24: "An-Nur (النور)",
64
+ 25: "Al-Furqan (الفرقان)",
65
+ 26: "Ash-Shu'ara (الشعراء)",
66
+ 27: "An-Naml (النمل)",
67
+ 28: "Al-Qasas (القصص)",
68
+ 29: "Al-Ankabut (العنكبوت)",
69
+ 30: "Ar-Rum (الروم)",
70
+ 31: "Luqman (لقمان)",
71
+ 32: "As-Sajdah (السجدة)",
72
+ 33: "Al-Ahzab (الأحزاب)",
73
+ 34: "Saba (سبأ)",
74
+ 35: "Fatir (فاطر)",
75
+ 36: "Ya-Sin (يس)",
76
+ 37: "As-Saffat (الصافات)",
77
+ 38: "Sad (ص)",
78
+ 39: "Az-Zumar (الزمر)",
79
+ 40: "Ghafir (غافر)",
80
+ 41: "Fussilat (فصلت)",
81
+ 42: "Ash-Shura (الشورى)",
82
+ 43: "Az-Zukhruf (الزخرف)",
83
+ 44: "Ad-Dukhkhan (الدخان)",
84
+ 45: "Al-Jathiya (الجاثية)",
85
+ 46: "Al-Ahqaf (الأحقاف)",
86
+ 47: "Muhammad (محمد)",
87
+ 48: "Al-Fath (الفتح)",
88
+ 49: "Al-Hujurat (الحجرات)",
89
+ 50: "Qaf (ق)",
90
+ 51: "Adh-Dhariyat (الذاريات)",
91
+ 52: "At-Tur (الطور)",
92
+ 53: "An-Najm (النجم)",
93
+ 54: "Al-Qamar (القمر)",
94
+ 55: "Ar-Rahman (الرحمن)",
95
+ 56: "Al-Waqi'ah (الواقعة)",
96
+ 57: "Al-Hadid (الحديد)",
97
+ 58: "Al-Mujadila (المجادلة)",
98
+ 59: "Al-Hashr (الحشر)",
99
+ 60: "Al-Mumtahina (الممتحنة)",
100
+ 61: "As-Saff (الصف)",
101
+ 62: "Al-Jumu'ah (الجمعة)",
102
+ 63: "Al-Munafiqoon (المنافقون)",
103
+ 64: "At-Taghabun (التغابن)",
104
+ 65: "At-Talaq (الطلاق)",
105
+ 66: "At-Tahrim (التحريم)",
106
+ 67: "Al-Mulk (الملك)",
107
+ 68: "Al-Qalam (القلم)",
108
+ 69: "Al-Haqqah (الحاقة)",
109
+ 70: "Al-Ma'arij (المعارج)",
110
+ 71: "Nooh (نوح)",
111
+ 72: "Al-Jinn (الجن)",
112
+ 73: "Al-Muzzammil (المزمل)",
113
+ 74: "Al-Muddathir (المدثر)",
114
+ 75: "Al-Qiyamah (القيامة)",
115
+ 76: "Al-Insan (الإنسان)",
116
+ 77: "Al-Mursalat (المرسلات)",
117
+ 78: "An-Naba (النبأ)",
118
+ 79: "An-Nazi'at (النازعات)",
119
+ 80: "Abasa (عبس)",
120
+ 81: "At-Takwir (التكوير)",
121
+ 82: "Al-Infitar (الإنفطار)",
122
+ 83: "Al-Mutaffifin (المطففين)",
123
+ 84: "Al-Inshiqaq (الإنشقاق)",
124
+ 85: "Al-Buruj (البروج)",
125
+ 86: "At-Tariq (الطارق)",
126
+ 87: "Al-A'la (الأعلى)",
127
+ 88: "Al-Ghashiyah (الغاشية)",
128
+ 89: "Al-Fajr (الفجر)",
129
+ 90: "Al-Balad (البلد)",
130
+ 91: "Ash-Shams (الشمس)",
131
+ 92: "Al-Lail (الليل)",
132
+ 93: "Ad-Duha (الضحى)",
133
+ 94: "Ash-Sharh (الشرح)",
134
+ 95: "At-Tin (التين)",
135
+ 96: "Al-Alaq (العلق)",
136
+ 97: "Al-Qadr (القدر)",
137
+ 98: "Al-Bayyina (البينة)",
138
+ 99: "Az-Zalzalah (الزلزلة)",
139
+ 100: "Al-Adiyat (العاديات)",
140
+ 101: "Al-Qari'ah (القارعة)",
141
+ 102: "At-Takathur (التكاثر)",
142
+ 103: "Al-Asr (العصر)",
143
+ 104: "Al-Humazah (الهمزة)",
144
+ 105: "Al-Fil (الفيل)",
145
+ 106: "Quraish (قريش)",
146
+ 107: "Al-Ma'un (الماعون)",
147
+ 108: "Al-Kawthar (الكوثر)",
148
+ 109: "Al-Kafirun (الكافرون)",
149
+ 110: "An-Nasr (النصر)",
150
+ 111: "Al-Masad (المسد)",
151
+ 112: "Al-Ikhlas (الإخلاص)",
152
+ 113: "Al-Falaq (الفلق)",
153
+ 114: "An-Nas (الناس)",
154
+ }
155
+
156
+ # Phrases to ignore (e.g., common introductions)
157
+ PHRASES_TO_IGNORE = [
158
+ "بِسْمِ اللَّهِ الرَّحْمَنِ الرَّحِيمِ",
159
+ "أعوذ بالله من الشيطان الرجيم",
160
+ "صدق الله العظيم",
161
+ ]
162
+
163
import re

def normalize_text(text: str) -> str:
    """Normalize Arabic text for fuzzy verse matching.

    Unifies alef variants, maps alef maqsura to ya and ta marbuta to ha,
    strips tashkeel (diacritics) and Quranic annotation marks, removes the
    hamza entirely (it appears in several carrier forms), and collapses
    whitespace.

    Args:
        text: Raw Arabic string (e.g. a Whisper transcription or verse text).

    Returns:
        The normalized, whitespace-collapsed string.
    """
    text = re.sub(r"[إأآاٱ]", "ا", text)   # unify alef variants
    text = re.sub(r"ى", "ي", text)          # alef maqsura -> ya
    # Bug fix: the original also did re.sub(r"g", "ة", text), mapping the
    # Latin letter "g" to ta marbuta (and then to ha below) — a typo with
    # no meaning for Arabic input; removed.
    text = re.sub(r"[ؤئ]", "ء", text)       # hamza carriers -> bare hamza
    text = re.sub(r"ة", "ه", text)          # ta marbuta -> ha
    text = re.sub(r"[\u064B-\u065F\u0670]", "", text)  # Tashkeel
    text = re.sub(r"[\u06D6-\u06ED]", "", text)        # Quranic annotation signs
    text = re.sub(r"ء", "", text)  # Remove Hamza to handle varying forms
    return " ".join(text.strip().split())
177
+
178
# Pre-load all verses at startup so every request matches against an
# in-memory list instead of re-reading the JSON files.
all_verses = []

surahs_dir = "surahs_json_files"
if not os.path.isdir(surahs_dir):
    raise FileNotFoundError("Missing 'surahs_json_files/' folder.")

for filename in sorted(os.listdir(surahs_dir)):
    if filename.endswith(".json"):
        # Filenames are expected to start with the surah number, e.g. "1_....json".
        try:
            surah_number = int(filename.split("_")[0])
        except ValueError:
            # Bug fix: was a bare `except:` which would also swallow
            # SystemExit/KeyboardInterrupt; only a non-numeric prefix is expected.
            continue
        surah_name = surah_names.get(surah_number, f"Surah {surah_number}")
        file_path = os.path.join(surahs_dir, filename)

        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Tolerate missing "ayahs" key or ayah entries without "text".
        verses = [ayah["text"] for ayah in data.get("ayahs", []) if "text" in ayah]

        for ayah_number, verse_text in enumerate(verses, start=1):
            # Cache the normalized form once; matching reuses it per request.
            verse_norm = normalize_text(verse_text)
            all_verses.append({
                "surah_number": surah_number,
                "surah_name": surah_name,
                "ayah_number": ayah_number,
                "verse_text": verse_text,
                "verse_norm": verse_norm
            })

print(f"Loaded {len(all_verses)} verses from {len(os.listdir(surahs_dir))} surahs.")
210
+
211
def find_best_verse(transcription: str) -> Dict[str, Any]:
    """Match a transcription against the pre-loaded Quran verses.

    Returns one of:
      * a single match dict (surah/ayah info + similarity_score),
      * {"matches": [...]} with up to five strong candidates, or
      * {"error": ...} when the input is empty or no confident match exists.
    """
    query = normalize_text(transcription)

    # Strip common recitation framing (Bismillah, isti'adhah, ...) so it
    # does not skew matching; phrases are compared in normalized form.
    for phrase in PHRASES_TO_IGNORE:
        ignored = normalize_text(phrase)
        if ignored in query:
            query = " ".join(query.replace(ignored, "").strip().split())

    if not query:
        return {"error": "Empty transcription"}

    # Compiled once: does the whole query appear bounded by whitespace/edges?
    word_boundary = re.compile(r'(?:^|\s)' + re.escape(query) + r'(?:\s|$)')

    scored = []
    for verse in all_verses:
        verse_norm = verse["verse_norm"]

        whole_word = False
        if query in verse_norm:
            # Exact substring: full containment; ratio still ranks by length fit.
            containment = 1.0
            ratio = SequenceMatcher(None, query, verse_norm).ratio()
            whole_word = bool(word_boundary.search(verse_norm))
        else:
            # Partial overlap: containment = longest common run / query length.
            matcher = SequenceMatcher(None, query, verse_norm)
            longest = matcher.find_longest_match(0, len(query), 0, len(verse_norm))
            containment = longest.size / len(query) if len(query) > 0 else 0
            ratio = matcher.ratio()

        scored.append({
            "verse": verse,
            "containment": containment,
            "ratio": ratio,
            "is_whole_word": whole_word
        })

    # Best first: whole-word hits, then containment, then overall ratio.
    scored.sort(key=lambda c: (c["is_whole_word"], c["containment"], c["ratio"]), reverse=True)

    # Any whole-word hit makes all non-whole-word candidates irrelevant.
    if scored and scored[0]["is_whole_word"]:
        scored = [c for c in scored if c["is_whole_word"]]

    # "Strong" means at least 80% of the query is contained in the verse.
    strong = [c for c in scored if c["containment"] >= 0.8]

    def as_payload(candidate):
        # Shape one candidate for the JSON response.
        verse = candidate["verse"]
        return {
            "surah_number": verse["surah_number"],
            "surah_name": verse["surah_name"],
            "ayah_number": verse["ayah_number"],
            "verse_text": verse["verse_text"],
            "similarity_score": round(candidate["containment"], 4)
        }

    if not strong:
        # Nothing confident — surface the closest candidate as a hint.
        if scored:
            best = scored[0]
            return {
                "error": "No confident match found",
                "best_similarity": round(best["containment"], 4),
                "possible_match": as_payload(best)
            }
        return {"error": "No matches found"}

    if len(strong) > 1:
        # Multiple strong candidates -> return the top 5.
        return {"matches": [as_payload(c) for c in strong[:5]]}

    # Single dominant match.
    return as_payload(strong[0])
301
+
302
+ @app.get("/")
303
+ def root():
304
+ return {"message": "Bayan AI بيان... LIVE!"}
305
+
306
+ @app.post("/recognize")
307
+ async def recognize(file: UploadFile = File(...)):
308
+ # Allow both audio and video
309
+ is_video = file.content_type and file.content_type.startswith("video/")
310
+ is_audio = file.content_type and file.content_type.startswith("audio/")
311
+
312
+ if not is_audio and not is_video:
313
+ raise HTTPException(status_code=400, detail="File must be an audio or video file")
314
+
315
+ # Save to temp file
316
+ contents = await file.read()
317
+ file_extension = os.path.splitext(file.filename)[1] or (".mp4" if is_video else ".wav")
318
+
319
+ with tempfile.NamedTemporaryFile(delete=False, suffix=file_extension) as tmp:
320
+ tmp.write(contents)
321
+ input_path = tmp.name
322
+
323
+ audio_path = input_path
324
+ temp_audio_path = None
325
+
326
+ try:
327
+ if is_video:
328
+ # Check if ffmpeg is installed
329
+ if not shutil.which("ffmpeg"):
330
+ raise HTTPException(status_code=500, detail="ffmpeg not found on server")
331
+
332
+ temp_audio_path = input_path + "_converted.wav"
333
+ # Extract audio quickly and silently
334
+ # -vn: no video, -acodec pcm_s16le: wav format, -ar 16000: whisper preferred sample rate
335
+ # -y: overwrite, -loglevel error: be silent
336
+ cmd = [
337
+ "ffmpeg", "-y", "-i", input_path,
338
+ "-vn", "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
339
+ "-loglevel", "error",
340
+ temp_audio_path
341
+ ]
342
+ subprocess.run(cmd, check=True)
343
+ audio_path = temp_audio_path
344
+
345
+ transcription = pipe(audio_path)["text"]
346
+ except subprocess.CalledProcessError as e:
347
+ raise HTTPException(status_code=500, detail=f"Video conversion error: {str(e)}")
348
+ except Exception as e:
349
+ raise HTTPException(status_code=500, detail=f"Transcription error: {str(e)}")
350
+ finally:
351
+ # Clean up all temp files
352
+ if os.path.exists(input_path):
353
+ os.unlink(input_path)
354
+ if temp_audio_path and os.path.exists(temp_audio_path):
355
+ os.unlink(temp_audio_path)
356
+
357
+ result = find_best_verse(transcription)
358
+ result["transcription"] = transcription
359
+
360
  return JSONResponse(content=result)