Batnini committed on
Commit
01939f6
·
verified ·
1 Parent(s): d9b828d

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +35 -34
tools/quran_search.py CHANGED
@@ -5,6 +5,7 @@ from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from config import QURAN_DATA_SOURCES, MODEL_NAME, CHUNK_SIZE
7
  import time
 
8
 
9
  class QuranSearchEngine:
10
  def __init__(self):
@@ -14,28 +15,32 @@ class QuranSearchEngine:
14
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
15
  self.verse_embeddings = None
16
  self.model = None
17
- print("Starting QuranSearchEngine initialization...") # Debug
18
- self._load_full_quran()
19
- print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}") # Debug
20
- self._load_all_verses_and_embeddings()
21
- print(f"Verses loaded: {len(self.all_verses)}") # Debug
22
-
 
 
 
 
 
23
  def _load_full_quran(self):
24
- max_retries = 3
25
  for attempt in range(max_retries):
26
  try:
27
- response = requests.get(f"{self.api_url}surah.json", timeout=10)
28
  response.raise_for_status()
29
  self.surahs = response.json()
30
- # Add 'id' to surahs for consistency
31
  for i, s in enumerate(self.surahs):
32
  s['id'] = i + 1
33
- # Fetch full verses
34
  for surah_id in range(1, 115):
35
- surah_response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
36
  surah_response.raise_for_status()
37
  data = surah_response.json()
38
- verses = data['arabic1'] # Arabic with tashkeel
39
  for verse_num, text in enumerate(verses, start=1):
40
  self.all_verses.append({
41
  'surah_id': surah_id,
@@ -47,22 +52,24 @@ class QuranSearchEngine:
47
  self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch Quran data: {e}")
48
  if attempt == max_retries - 1:
49
  self._load_fallback_data()
50
- time.sleep(2 ** attempt)
51
-
52
  def _load_fallback_data(self):
 
53
  self.surahs = self._load_fallback_surahs()
54
  self.all_verses = [
55
  {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
56
  {'surah_id': 1, 'verse_num': 2, 'text': "ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ"}
57
- # Add more hardcoded verses if needed, but limited
58
  ]
59
-
60
  def _load_all_verses_and_embeddings(self):
61
  if not self.all_verses:
62
  return
63
 
64
  try:
 
65
  self.model = SentenceTransformer(MODEL_NAME)
 
66
  verse_texts = [v['text'] for v in self.all_verses]
67
  self.verse_embeddings = []
68
  for i in range(0, len(verse_texts), CHUNK_SIZE):
@@ -70,23 +77,22 @@ class QuranSearchEngine:
70
  embeddings = self.model.encode(chunk, convert_to_tensor=False)
71
  self.verse_embeddings.append(embeddings)
72
  self.verse_embeddings = np.vstack(self.verse_embeddings)
 
73
  except Exception as e:
74
- self.logger.error(f"Failed to compute embeddings: {e}")
75
  self.verse_embeddings = None
76
-
 
77
  def get_surahs(self):
78
  if self.surahs:
79
- return [
80
- (s['surahNameArabicLong'], s['id'])
81
- for s in self.surahs
82
- ]
83
  return self._load_fallback_surahs()
84
-
85
  def get_surah_text(self, surah_id):
86
  max_retries = 3
87
  for attempt in range(max_retries):
88
  try:
89
- response = requests.get(f"{self.api_url}{surah_id}.json", timeout=10)
90
  response.raise_for_status()
91
  data = response.json()
92
  verses = data['arabic1']
@@ -96,12 +102,13 @@ class QuranSearchEngine:
96
  if attempt == max_retries - 1:
97
  return self._load_fallback_verse()
98
  time.sleep(2 ** attempt)
99
-
100
  def search_verses(self, query, top_k=5):
101
  if self.verse_embeddings is None or not self.all_verses:
102
  return self._keyword_fallback_search(query, top_k)
103
 
104
  try:
 
105
  query_embedding = self.model.encode([query], convert_to_tensor=False)
106
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
107
  top_indices = np.argsort(similarities)[-top_k:][::-1]
@@ -110,12 +117,10 @@ class QuranSearchEngine:
110
  for idx in top_indices:
111
  verse = self.all_verses[idx]
112
  surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong']
113
- results.append(
114
- f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}"
115
- )
116
  return "\n\n".join(results)
117
  except Exception as e:
118
- self.logger.error(f"Search failed: {e}")
119
  return "حدث خطأ أثناء البحث. جرب مرة أخرى."
120
 
121
  def _keyword_fallback_search(self, query, top_k=5):
@@ -128,11 +133,7 @@ class QuranSearchEngine:
128
  return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
129
 
130
  def _load_fallback_surahs(self):
131
- return [
132
- ("الفاتحة", 1),
133
- ("البقرة", 2),
134
- ("آل عمران", 3)
135
- ]
136
 
137
  def _load_fallback_verse(self):
138
  return "بسم الله الرحمن الرحيم\nالله لا إله إلا هو الحي القيوم"
 
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from config import QURAN_DATA_SOURCES, MODEL_NAME, CHUNK_SIZE
7
  import time
8
+ import sys
9
 
10
  class QuranSearchEngine:
11
  def __init__(self):
 
15
  self.all_verses = [] # List of {'surah_id': int, 'verse_num': int, 'text': str}
16
  self.verse_embeddings = None
17
  self.model = None
18
+ print("Starting QuranSearchEngine initialization at", time.ctime(), file=sys.stderr) # Debug to stderr
19
+ try:
20
+ self._load_full_quran()
21
+ print(f"Surahs loaded: {len(self.surahs) if self.surahs else 0}", file=sys.stderr) # Debug
22
+ self._load_all_verses_and_embeddings()
23
+ print(f"Verses loaded: {len(self.all_verses)}", file=sys.stderr) # Debug
24
+ except Exception as e:
25
+ self.logger.error(f"Initialization failed: {e}", exc_info=True)
26
+ print(f"Initialization error: {e}", file=sys.stderr)
27
+ self._load_fallback_data() # Ensure minimal startup
28
+
29
  def _load_full_quran(self):
30
+ max_retries = 5 # Increased retries
31
  for attempt in range(max_retries):
32
  try:
33
+ response = requests.get(f"{self.api_url}surah.json", timeout=15) # Increased timeout
34
  response.raise_for_status()
35
  self.surahs = response.json()
 
36
  for i, s in enumerate(self.surahs):
37
  s['id'] = i + 1
38
+ self.all_verses = [] # Reset verses
39
  for surah_id in range(1, 115):
40
+ surah_response = requests.get(f"{self.api_url}{surah_id}.json", timeout=15)
41
  surah_response.raise_for_status()
42
  data = surah_response.json()
43
+ verses = data['arabic1']
44
  for verse_num, text in enumerate(verses, start=1):
45
  self.all_verses.append({
46
  'surah_id': surah_id,
 
52
  self.logger.error(f"Attempt {attempt + 1}/{max_retries} failed to fetch Quran data: {e}")
53
  if attempt == max_retries - 1:
54
  self._load_fallback_data()
55
+ time.sleep(2 ** attempt) # Exponential backoff
56
+
57
  def _load_fallback_data(self):
58
+ self.logger.warning("Falling back to minimal data due to API failure")
59
  self.surahs = self._load_fallback_surahs()
60
  self.all_verses = [
61
  {'surah_id': 1, 'verse_num': 1, 'text': "بِسْمِ ٱللَّهِ ٱلرَّحْمَـٰنِ ٱلرَّحِيمِ"},
62
  {'surah_id': 1, 'verse_num': 2, 'text': "ٱلْحَمْدُ لِلَّهِ رَبِّ ٱلْعَٰلَمِينَ"}
 
63
  ]
64
+
65
  def _load_all_verses_and_embeddings(self):
66
  if not self.all_verses:
67
  return
68
 
69
  try:
70
+ print("Attempting to load model...", file=sys.stderr) # Debug
71
  self.model = SentenceTransformer(MODEL_NAME)
72
+ print("Model loaded successfully", file=sys.stderr) # Debug
73
  verse_texts = [v['text'] for v in self.all_verses]
74
  self.verse_embeddings = []
75
  for i in range(0, len(verse_texts), CHUNK_SIZE):
 
77
  embeddings = self.model.encode(chunk, convert_to_tensor=False)
78
  self.verse_embeddings.append(embeddings)
79
  self.verse_embeddings = np.vstack(self.verse_embeddings)
80
+ print("Embeddings computed successfully", file=sys.stderr) # Debug
81
  except Exception as e:
82
+ self.logger.error(f"Failed to compute embeddings: {e}", exc_info=True)
83
  self.verse_embeddings = None
84
+ self.logger.warning("Falling back to keyword-based search due to embedding failure")
85
+
86
  def get_surahs(self):
87
  if self.surahs:
88
+ return [(s['surahNameArabicLong'], s['id']) for s in self.surahs]
 
 
 
89
  return self._load_fallback_surahs()
90
+
91
  def get_surah_text(self, surah_id):
92
  max_retries = 3
93
  for attempt in range(max_retries):
94
  try:
95
+ response = requests.get(f"{self.api_url}{surah_id}.json", timeout=15)
96
  response.raise_for_status()
97
  data = response.json()
98
  verses = data['arabic1']
 
102
  if attempt == max_retries - 1:
103
  return self._load_fallback_verse()
104
  time.sleep(2 ** attempt)
105
+
106
  def search_verses(self, query, top_k=5):
107
  if self.verse_embeddings is None or not self.all_verses:
108
  return self._keyword_fallback_search(query, top_k)
109
 
110
  try:
111
+ print(f"Encoding query: {query}", file=sys.stderr) # Debug
112
  query_embedding = self.model.encode([query], convert_to_tensor=False)
113
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
114
  top_indices = np.argsort(similarities)[-top_k:][::-1]
 
117
  for idx in top_indices:
118
  verse = self.all_verses[idx]
119
  surah_name = self.surahs[verse['surah_id'] - 1]['surahNameArabicLong']
120
+ results.append(f"سورة {surah_name} - آية {verse['verse_num']}:\n{verse['text']}")
 
 
121
  return "\n\n".join(results)
122
  except Exception as e:
123
+ self.logger.error(f"Search failed: {e}", exc_info=True)
124
  return "حدث خطأ أثناء البحث. جرب مرة أخرى."
125
 
126
  def _keyword_fallback_search(self, query, top_k=5):
 
133
  return "\n\n".join(matches[:top_k]) or "لا توجد نتائج مطابقة."
134
 
135
  def _load_fallback_surahs(self):
136
+ return [("الفاتحة", 1), ("البقرة", 2), ("آل عمران", 3)]
 
 
 
 
137
 
138
  def _load_fallback_verse(self):
139
  return "بسم الله الرحمن الرحيم\nالله لا إله إلا هو الحي القيوم"