Batnini committed on
Commit
860a19b
·
verified ·
1 Parent(s): cede722

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +93 -63
tools/quran_search.py CHANGED
@@ -5,100 +5,130 @@ from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
6
  import requests
7
 
8
- # Configure logging
9
- logging.basicConfig(
10
- level=logging.INFO,
11
- format='%(asctime)s - %(levelname)s - %(message)s',
12
- handlers=[
13
- logging.StreamHandler(),
14
- logging.FileHandler('app.log')
15
- ]
16
- )
17
-
18
- # Quran data configuration
19
- QURAN_DATA_SOURCES = [
20
- "https://cdn.jsdelivr.net/gh/mafahim/quran-json/quran_clean.csv",
21
- "https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv",
22
- "https://gitlab.com/mafahim/quran-json/-/raw/main/quran_clean.csv"
23
- ]
24
-
25
- # Model configuration
26
- MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
27
- CHUNK_SIZE = 50 # For memory management
28
-
29
  class QuranSearchEngine:
30
  def __init__(self):
31
  self.data_loaded = False
32
  self.model = None
33
  self.verse_embeddings = None
34
  self.quran_df = None
35
- self.surah_names = {
36
- 1: "الفاتحة", 2: "البقرة", 3: "آل عمران",
37
- 4: "النساء", 5: "المائدة", 6: "الأنعام",
38
- 114: "الناس"
39
- }
40
  self.load_data()
41
 
42
  def load_data(self):
43
  if not self.data_loaded:
44
  try:
45
- # Load from the first available API source
46
- for source in QURAN_DATA_SOURCES:
47
- response = requests.get(source, timeout=10)
48
- if response.status_code == 200:
49
- verses = response.json().get('verses', [])
50
- verses_data = []
51
- for verse in verses:
52
- verses_data.append({
53
- 'surah': verse['chapter_id'],
54
- 'ayah': verse['verse_number'],
55
- 'text': ' '.join([w['text_uthmani'] for w in verse['words']])
56
- })
57
- self.quran_df = pd.DataFrame(verses_data)
58
- self.model = SentenceTransformer(MODEL_NAME)
59
- self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
60
- self.data_loaded = True
61
- logging.info("Quran data loaded successfully.")
62
- return
63
- logging.error("Failed to load Quran data from all sources.")
 
 
 
 
 
 
 
 
 
 
64
  except Exception as e:
65
- logging.error(f"Error loading data: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def search(self, query, top_k=5):
68
  if not self.data_loaded:
69
- logging.error("Data not loaded properly.")
70
  return []
71
-
72
  try:
 
 
 
 
 
 
73
  query_embedding = self.model.encode([query])
74
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
 
 
75
  top_indices = np.argsort(similarities)[-top_k:][::-1]
76
 
 
77
  results = []
78
  for idx in top_indices:
79
  verse = self.quran_df.iloc[idx]
80
  results.append({
81
- "surah": self.surah_names.get(verse['surah'], "سورة غير معروفة"),
82
  "ayah": verse['ayah'],
83
  "text": verse['text'],
84
  "similarity": f"{similarities[idx]:.2f}",
85
  "surah_num": verse['surah'],
86
  "ayah_num": verse['ayah']
87
  })
 
88
  return results
89
 
90
  except Exception as e:
91
- logging.error(f"Search Error: {str(e)}")
92
- return []
93
-
94
- # Example usage
95
- if __name__ == "__main__":
96
- quran_searcher = QuranSearchEngine()
97
- query = "العدل في الإسلام"
98
- results = quran_searcher.search(query, top_k=5)
99
-
100
- for result in results:
101
- print(f"سورة: {result['surah']} ({result['surah_num']}:{result['ayah_num']})")
102
- print(f"**التشابه**: {result['similarity']}")
103
- print(f"{result['text']}")
104
- print("---")
 
5
  import numpy as np
6
  import requests
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  class QuranSearchEngine:
9
  def __init__(self):
10
  self.data_loaded = False
11
  self.model = None
12
  self.verse_embeddings = None
13
  self.quran_df = None
14
+ self.surah_names = {}
15
+ self.base_api_url = "https://quranapi.pages.dev/api/verses"
16
+
17
+ # Initialize with progress tracking
 
18
  self.load_data()
19
 
20
    def load_data(self):
        """Build the searchable corpus: fetch every verse, then embed it.

        Runs once (guarded by ``data_loaded``). Any failure anywhere in the
        pipeline is caught and routed to ``_load_backup_data()`` so the
        engine stays usable with a minimal corpus.
        """
        if not self.data_loaded:
            try:
                # Step 1: Load Surah names
                self._load_surah_names()

                # Step 2: Fetch verses in batches
                all_verses = []
                for surah_num in range(1, 115):  # All 114 Surahs
                    verses = self._fetch_verses(surah_num)
                    if verses:
                        all_verses.extend(verses)

                # Step 3: Create DataFrame
                self.quran_df = pd.DataFrame(all_verses)

                # Step 4: Initialize model
                self.model = SentenceTransformer(
                    'paraphrase-multilingual-MiniLM-L12-v2',
                    device='cpu'
                )

                # Step 5: Generate embeddings in chunks of 100 texts
                texts = self.quran_df['text'].tolist()
                self.verse_embeddings = np.concatenate([
                    self.model.encode(texts[i:i+100])
                    for i in range(0, len(texts), 100)
                ])

                self.data_loaded = True
                logging.info("Quran data loaded successfully")

            except Exception as e:
                # NOTE(review): an entirely empty fetch also lands here —
                # np.concatenate([]) raises — which is what triggers the
                # backup-data path. Confirm this is intentional.
                logging.error(f"Data loading failed: {str(e)}")
                self._load_backup_data()
56
+ def _load_surah_names(self):
57
+ """Fetch surah names from API"""
58
+ try:
59
+ response = requests.get(f"{self.base_api_url}/surahs")
60
+ if response.status_code == 200:
61
+ surahs = response.json()
62
+ self.surah_names = {s['number']: s['name'] for s in surahs}
63
+ except Exception as e:
64
+ logging.warning(f"Couldn't fetch surah names: {str(e)}")
65
+ # Fallback to minimal names
66
+ self.surah_names = {i: f"سورة {i}" for i in range(1, 115)}
67
+
68
+ def _fetch_verses(self, surah_num):
69
+ """Fetch verses for a specific surah"""
70
+ try:
71
+ response = requests.get(
72
+ f"{self.base_api_url}/{surah_num}",
73
+ timeout=10
74
+ )
75
+ if response.status_code == 200:
76
+ verses_data = response.json()
77
+ return [{
78
+ 'surah': surah_num,
79
+ 'ayah': v['verse'],
80
+ 'text': v['text'],
81
+ 'surah_name': self.surah_names.get(surah_num, "")
82
+ } for v in verses_data]
83
+ except Exception as e:
84
+ logging.warning(f"Failed to fetch surah {surah_num}: {str(e)}")
85
+ return []
86
+
87
+ def _load_backup_data(self):
88
+ """Emergency fallback"""
89
+ backup = [
90
+ {"surah": 1, "ayah": 1, "text": "بسم الله الرحمن الرحيم", "surah_name": "الفاتحة"},
91
+ {"surah": 2, "ayah": 255, "text": "الله لا إله إلا هو الحي القيوم...", "surah_name": "البقرة"},
92
+ {"surah": 36, "ayah": 1, "text": "يس والقرآن الحكيم", "surah_name": "يس"}
93
+ ]
94
+ self.quran_df = pd.DataFrame(backup)
95
+ self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
96
+ self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
97
+ self.data_loaded = True
98
+ logging.warning("Using backup data")
99
 
100
  def search(self, query, top_k=5):
101
  if not self.data_loaded:
 
102
  return []
103
+
104
  try:
105
+ # Clean and validate query
106
+ query = str(query).strip()
107
+ if len(query) < 2:
108
+ return []
109
+
110
+ # Encode query and calculate similarities
111
  query_embedding = self.model.encode([query])
112
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
113
+
114
+ # Get top results
115
  top_indices = np.argsort(similarities)[-top_k:][::-1]
116
 
117
+ # Format results
118
  results = []
119
  for idx in top_indices:
120
  verse = self.quran_df.iloc[idx]
121
  results.append({
122
+ "surah": verse['surah_name'],
123
  "ayah": verse['ayah'],
124
  "text": verse['text'],
125
  "similarity": f"{similarities[idx]:.2f}",
126
  "surah_num": verse['surah'],
127
  "ayah_num": verse['ayah']
128
  })
129
+
130
  return results
131
 
132
  except Exception as e:
133
+ logging.error(f"Search error: {str(e)}")
134
+ return []