Batnini committed on
Commit
3a975c1
·
verified ·
1 Parent(s): f2fd2de

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +94 -38
tools/quran_search.py CHANGED
@@ -10,7 +10,6 @@ class QuranSearchEngine:
10
  def __init__(self):
11
  self.api_url = "https://quranapi.pages.dev/api/"
12
  self.logger = logging.getLogger(__name__)
13
- self.full_quran = None
14
  self.surahs = None
15
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
16
  self.verse_embeddings = None
@@ -20,8 +19,6 @@ class QuranSearchEngine:
20
  print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}") # Debug
21
  self._load_all_verses_and_embeddings()
22
  print(f"Verses loaded: {len(self.all_verses)}") # Debug
23
- if not self.model:
24
- self.logger.error("Model initialization failed, using fallback behavior")
25
 
26
  def _load_full_quran(self):
27
  max_retries = 3
@@ -29,54 +26,113 @@ class QuranSearchEngine:
29
  try:
30
  response = requests.get(f"{self.api_url}surah.json", timeout=10)
31
  response.raise_for_status()
32
- self.surahs = response.json() # Array of surah metadata (no 'id')
33
  # Add 'id' to surahs for consistency
34
  for i, s in enumerate(self.surahs):
35
  s['id'] = i + 1
36
  # Fetch full verses
37
- self.full_quran = []
38
  for surah_id in range(1, 115):
39
  surah_response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
40
  surah_response.raise_for_status()
41
- surah_data = surah_response.json()
42
- surah_data['id'] = surah_id # Add id for consistency
43
- self.full_quran.append(surah_data)
 
 
 
 
 
44
  break
45
  except Exception as e:
46
- self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch full Quran from API: {e}")
47
  if attempt == max_retries - 1:
48
- self._load_fallback_quran()
49
  time.sleep(2 ** attempt)
50
 
51
- def _load_fallback_quran(self):
52
- max_retries = 3
53
- for source in QURAN_DATA_SOURCES:
54
- for attempt in range(max_retries):
55
- try:
56
- response = requests.get(source, timeout=10)
57
- response.raise_for_status()
58
- self.full_quran = response.json() # Array of surah dicts
59
- self.surahs = self.full_quran
60
- break
61
- except Exception as e:
62
- self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch fallback from {source}: {e}")
63
- if attempt == max_retries - 1 and source == QURAN_DATA_SOURCES[-1]:
64
- self.surahs = self._load_fallback_surahs()
65
- time.sleep(2 ** attempt)
66
 
67
  def _load_all_verses_and_embeddings(self):
68
- if not self.full_quran:
69
- self.logger.error("No full Quran loaded, skipping verse loading")
70
- self.all_verses = [
71
- {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
72
- ]
73
  return
74
 
75
- for surah in self.full_quran:
76
- surah_id = surah.get('id', 1)
77
- if 'arabic1' in surah: # API structure: verses as list of str
78
- verses = surah['arabic1']
79
- for verse_num, text in enumerate(verses, start=1):
80
- self.all_verses.append({
81
- 'surah_id': surah_id,
82
- 'verse_num': verse_num,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def __init__(self):
11
  self.api_url = "https://quranapi.pages.dev/api/"
12
  self.logger = logging.getLogger(__name__)
 
13
  self.surahs = None
14
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
15
  self.verse_embeddings = None
 
19
  print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}") # Debug
20
  self._load_all_verses_and_embeddings()
21
  print(f"Verses loaded: {len(self.all_verses)}") # Debug
 
 
22
 
23
  def _load_full_quran(self):
24
  max_retries = 3
 
26
  try:
27
  response = requests.get(f"{self.api_url}surah.json", timeout=10)
28
  response.raise_for_status()
29
+ self.surahs = response.json()
30
  # Add 'id' to surahs for consistency
31
  for i, s in enumerate(self.surahs):
32
  s['id'] = i + 1
33
  # Fetch full verses
 
34
  for surah_id in range(1, 115):
35
  surah_response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
36
  surah_response.raise_for_status()
37
+ data = surah_response.json()
38
+ verses = data['arabic1'] # Arabic with tashkeel
39
+ for verse_num, text in enumerate(verses, start=1):
40
+ self.all_verses.append({
41
+ 'surah_id': surah_id,
42
+ 'verse_num': verse_num,
43
+ 'text': text
44
+ })
45
  break
46
  except Exception as e:
47
+ self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch Quran data: {e}")
48
  if attempt == max_retries - 1:
49
+ self._load_fallback_data()
50
  time.sleep(2 ** attempt)
51
 
52
def _load_fallback_data(self):
    """Populate ``self.surahs`` and ``self.all_verses`` with hardcoded data.

    ``self.surahs`` is set to whatever ``self._load_fallback_surahs()``
    returns, and ``self.all_verses`` is replaced by a two-entry list of
    verse dicts (keys ``surah_id``, ``verse_num``, ``text``), both for
    surah 1.
    """
    self.surahs = self._load_fallback_surahs()

    # More hardcoded verses can be appended here if needed; the set is
    # intentionally small.
    hardcoded_texts = (
        "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ",
        "ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ",
    )
    self.all_verses = [
        {'surah_id': 1, 'verse_num': number, 'text': text}
        for number, text in enumerate(hardcoded_texts, start=1)
    ]
 
 
 
 
 
 
 
 
59
 
60
  def _load_all_verses_and_embeddings(self):
61
+ if not self.all_verses:
 
 
 
 
62
  return
63
 
64
+ try:
65
+ self.model = SentenceTransformer(MODEL_NAME)
66
+ verse_texts = [v['text'] for v in self.all_verses]
67
+ self.verse_embeddings = []
68
+ for i in range(0, len(verse_texts), CHUNK_SIZE):
69
+ chunk = verse_texts[i:i + CHUNK_SIZE]
70
+ embeddings = self.model.encode(chunk, convert_to_tensor=False)
71
+ self.verse_embeddings.append(embeddings)
72
+ self.verse_embeddings = np.vstack(self.verse_embeddings)
73
+ except Exception as e:
74
+ self.logger.error(f"Failed to compute embeddings: {e}")
75
+ self.verse_embeddings = None
76
+
77
def get_surahs(self):
    """Return the known surahs as a list of ``(arabic_name, id)`` tuples.

    ``self.surahs`` can hold two shapes: dicts carrying the
    ``'surahNameArabicLong'`` and ``'id'`` keys, or ``(name, id)`` tuples
    as returned by ``self._load_fallback_surahs()``. Both are normalised
    to tuples here, so tuple entries no longer raise ``TypeError``.

    Returns:
        A list of ``(name, id)`` tuples. When ``self.surahs`` is empty or
        ``None``, the result of ``self._load_fallback_surahs()``.
    """
    if not self.surahs:
        return self._load_fallback_surahs()

    pairs = []
    for entry in self.surahs:
        if isinstance(entry, dict):
            pairs.append((entry['surahNameArabicLong'], entry['id']))
        else:
            # Already a (name, id) pair from the hardcoded fallback list.
            pairs.append(tuple(entry))
    return pairs
84
+
85
def get_surah_text(self, surah_id):
    """Fetch one surah from the API and return it as numbered Arabic text.

    Requests ``{self.api_url}{surah_id}.json`` with a 10 second timeout,
    reads the ``'arabic1'`` list from the JSON body and joins the verses
    as ``"آية N: <text>"`` entries separated by blank lines.

    Up to three attempts are made. Each failure is logged; after the first
    and second failures the method sleeps ``2 ** attempt`` seconds (1, then
    2). After the third failure it returns ``self._load_fallback_verse()``.
    """
    max_retries = 3
    attempt = 0
    while attempt < max_retries:
        try:
            reply = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
            reply.raise_for_status()
            payload = reply.json()
            numbered = [
                f"آية {number}: {verse_text}"
                for number, verse_text in enumerate(payload['arabic1'], start=1)
            ]
            return "\n\n".join(numbered)
        except Exception as e:
            self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch surah {surah_id}: {e}")
            if attempt + 1 == max_retries:
                # Out of retries: hand back the hardcoded text instead.
                return self._load_fallback_verse()
            time.sleep(2 ** attempt)
        attempt += 1
99
+
100
def search_verses(self, query, top_k=5):
    """Return the verses most similar to *query*, formatted as one string.

    When no embeddings are available (``self.verse_embeddings is None``)
    or no verses are loaded, the call is delegated to
    ``self._keyword_fallback_search(query, top_k)``.

    Otherwise the query is encoded with ``self.model``, compared to
    ``self.verse_embeddings`` with ``cosine_similarity``, and the ``top_k``
    highest-scoring verses are returned best-first, each rendered as
    ``"سورة <name> - آية <n>:\\n<text>"`` and separated by blank lines.

    Args:
        query: Search text.
        top_k: Maximum number of verses to return. A non-positive value
            yields the "no results" message.

    Returns:
        The formatted results, the "no results" message, or a generic
        error message if anything raises during the semantic search.
    """
    if self.verse_embeddings is None or not self.all_verses:
        return self._keyword_fallback_search(query, top_k)

    # argsort(...)[-0:] would select every verse, so a non-positive limit
    # must be rejected before slicing.
    if top_k <= 0:
        return "لا توجد نتائج مطابقة."

    try:
        query_embedding = self.model.encode([query], convert_to_tensor=False)
        similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
        # Ascending sort: take the tail and reverse it for best-first order.
        top_indices = np.argsort(similarities)[-top_k:][::-1]

        results = []
        for idx in top_indices:
            verse = self.all_verses[idx]
            entry = self.surahs[verse['surah_id'] - 1]
            # self.surahs holds dicts, or (name, id) tuples when the
            # hardcoded fallback list was loaded.
            if isinstance(entry, dict):
                surah_name = entry['surahNameArabicLong']
            else:
                surah_name = entry[0]
            results.append(
                f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}"
            )
        return "\n\n".join(results)
    except Exception as e:
        self.logger.error(f"Search failed: {e}")
        return "حدث خطأ أثناء البحث. جرب مرة أخرى."
120
+
121
def _keyword_fallback_search(self, query, top_k=5):
    """Return up to *top_k* verses whose text contains *query*.

    Matching is a substring test on the lower-cased query and verse text,
    scanning ``self.all_verses`` in order. Each hit is rendered as
    ``"سورة <name> - آية <n>:\\n<text>"`` and hits are separated by blank
    lines.

    The surah name comes from ``self.surahs``, whose entries may be dicts
    with a ``'surahNameArabicLong'`` key or ``(name, id)`` tuples. When
    ``self.surahs`` is empty or does not cover the verse's surah id, the
    numeric id is used as the name.

    Args:
        query: Text to look for.
        top_k: Maximum number of hits. A non-positive value yields the
            "no results" message.

    Returns:
        The formatted hits, or ``"لا توجد نتائج مطابقة."`` when nothing
        matches.
    """
    no_results = "لا توجد نتائج مطابقة."
    if top_k <= 0:
        return no_results

    query_lower = query.lower()
    matches = []
    for verse in self.all_verses:
        if query_lower not in verse['text'].lower():
            continue

        surah_id = verse['surah_id']
        # Use the bare id by default; the "سورة" prefix is added by the
        # format string below, so it must not be repeated here.
        surah_name = str(surah_id)
        if self.surahs and 1 <= surah_id <= len(self.surahs):
            entry = self.surahs[surah_id - 1]
            if isinstance(entry, dict):
                surah_name = entry['surahNameArabicLong']
            else:
                surah_name = entry[0]

        matches.append(f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}")
        if len(matches) == top_k:
            # Enough hits collected; no need to scan the remaining verses.
            break

    return "\n\n".join(matches) or no_results
129
+
130
def _load_fallback_surahs(self):
    """Return a hardcoded list of three ``(arabic_name, id)`` surah tuples.

    The ids are 1, 2 and 3, in list order.
    """
    names_in_order = ("الفاتحة", "البقرة", "آل عمران")
    return [(name, number) for number, name in enumerate(names_in_order, start=1)]
136
+
137
def _load_fallback_verse(self):
    """Return a fixed two-line Arabic string used as fallback surah text."""
    fallback_lines = (
        "بسم الله الرحمن الرحيم",
        "الله لا إله إلا هو الحي القيوم",
    )
    return "\n".join(fallback_lines)