Batnini committed on
Commit
e12f2fb
·
verified ·
1 Parent(s): d66deaf

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +65 -110
tools/quran_search.py CHANGED
@@ -1,134 +1,89 @@
1
  import logging
2
- import pandas as pd
3
  from sentence_transformers import SentenceTransformer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
6
- import requests
7
 
8
  class QuranSearchEngine:
9
  def __init__(self):
10
- self.data_loaded = False
11
- self.model = None
12
- self.verse_embeddings = None
13
- self.quran_df = None
14
- self.surah_names = {}
15
- self.base_api_url = "https://quranapi.pages.dev/api/verses"
16
 
17
- # Initialize with progress tracking
18
- self.load_data()
 
 
 
19
 
20
- def load_data(self):
21
- if not self.data_loaded:
22
- try:
23
- # Step 1: Load Surah names
24
- self._load_surah_names()
25
-
26
- # Step 2: Fetch verses in batches
27
- all_verses = []
28
- for surah_num in range(1, 115): # All 114 Surahs
29
- verses = self._fetch_verses(surah_num)
30
- if verses:
31
- all_verses.extend(verses)
32
-
33
- # Step 3: Create DataFrame
34
- self.quran_df = pd.DataFrame(all_verses)
35
-
36
- # Step 4: Initialize model
37
- self.model = SentenceTransformer(
38
- 'paraphrase-multilingual-MiniLM-L12-v2',
39
- device='cpu'
40
- )
41
-
42
- # Step 5: Generate embeddings in chunks
43
- texts = self.quran_df['text'].tolist()
44
- self.verse_embeddings = np.concatenate([
45
- self.model.encode(texts[i:i+100])
46
- for i in range(0, len(texts), 100)
47
- ])
48
-
49
- self.data_loaded = True
50
- logging.info("Quran data loaded successfully")
51
-
52
- except Exception as e:
53
- logging.error(f"Data loading failed: {str(e)}")
54
- self._load_backup_data()
55
-
56
- def _load_surah_names(self):
57
- """Fetch surah names from API"""
58
- try:
59
- response = requests.get(f"{self.base_api_url}/surahs")
60
- if response.status_code == 200:
61
- surahs = response.json()
62
- self.surah_names = {s['number']: s['name'] for s in surahs}
63
- except Exception as e:
64
- logging.warning(f"Couldn't fetch surah names: {str(e)}")
65
- # Fallback to minimal names
66
- self.surah_names = {i: f"سورة {i}" for i in range(1, 115)}
67
-
68
- def _fetch_verses(self, surah_num):
69
- """Fetch verses for a specific surah"""
70
  try:
71
  response = requests.get(
72
- f"{self.base_api_url}/{surah_num}",
73
- timeout=10
 
74
  )
75
- if response.status_code == 200:
76
- verses_data = response.json()
77
- return [{
78
- 'surah': surah_num,
79
- 'ayah': v['verse'],
80
- 'text': v['text'],
81
- 'surah_name': self.surah_names.get(surah_num, "")
82
- } for v in verses_data]
83
- except Exception as e:
84
- logging.warning(f"Failed to fetch surah {surah_num}: {str(e)}")
85
  return []
86
 
87
- def _load_backup_data(self):
88
- """Emergency fallback"""
89
- backup = [
90
- {"surah": 1, "ayah": 1, "text": "بسم الله الرحمن الرحيم", "surah_name": "الفاتحة"},
91
- {"surah": 2, "ayah": 255, "text": "الله لا إله إلا هو الحي القيوم...", "surah_name": "البقرة"},
92
- {"surah": 36, "ayah": 1, "text": "يس والقرآن الحكيم", "surah_name": "يس"}
93
- ]
94
- self.quran_df = pd.DataFrame(backup)
95
- self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
96
- self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
97
- self.data_loaded = True
98
- logging.warning("Using backup data")
99
 
100
  def search(self, query, top_k=5):
101
- if not self.data_loaded:
 
 
 
102
  return []
103
-
104
  try:
105
- # Clean and validate query
106
- query = str(query).strip()
107
- if len(query) < 2:
108
  return []
109
-
110
- # Encode query and calculate similarities
111
- query_embedding = self.model.encode([query])
112
- similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
113
 
114
- # Get top results
115
- top_indices = np.argsort(similarities)[-top_k:][::-1]
 
 
 
 
 
 
116
 
117
- # Format results
118
- results = []
119
- for idx in top_indices:
120
- verse = self.quran_df.iloc[idx]
121
- results.append({
122
- "surah": verse['surah_name'],
123
- "ayah": verse['ayah'],
124
- "text": verse['text'],
125
- "similarity": f"{similarities[idx]:.2f}",
126
- "surah_num": verse['surah'],
127
- "ayah_num": verse['ayah']
128
- })
129
-
130
- return results
131
 
 
 
132
  except Exception as e:
133
- logging.error(f"Search error: {str(e)}")
134
  return []
 
1
  import logging
2
+ import requests
3
  from sentence_transformers import SentenceTransformer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
 
6
 
7
  class QuranSearchEngine:
8
  def __init__(self):
9
+ self.api_url = "https://api.quran.com/api/v3/search"
10
+ self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2', device='cpu')
11
+ self.embedding_cache = {}
12
+ self.min_query_length = 2
 
 
13
 
14
+ # Configure logging
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format='%(asctime)s - %(levelname)s - %(message)s'
18
+ )
19
 
20
+ def _fetch_verses(self, query, limit=5):
21
+ """Fetch verses from Quran API with error handling"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  try:
23
  response = requests.get(
24
+ f"{self.api_url}?q={query}&size={limit}",
25
+ timeout=15,
26
+ headers={'Accept': 'application/json'}
27
  )
28
+ response.raise_for_status()
29
+ return response.json().get('results', [])
30
+ except requests.exceptions.RequestException as e:
31
+ logging.error(f"API request failed: {str(e)}")
32
+ return []
33
+ except ValueError as e:
34
+ logging.error(f"Invalid API response: {str(e)}")
 
 
 
35
  return []
36
 
37
+ def _process_verse(self, verse, similarity):
38
+ """Standardize verse format"""
39
+ return {
40
+ 'surah': verse.get('surah_name', ''),
41
+ 'ayah': verse.get('verse_id', 0),
42
+ 'text': verse.get('text', ''),
43
+ 'similarity': f"{similarity:.2f}",
44
+ 'surah_num': verse.get('surah_id', 0),
45
+ 'ayah_num': verse.get('verse_id', 0)
46
+ }
 
 
47
 
48
  def search(self, query, top_k=5):
49
+ """Main search method with validation and caching"""
50
+ # Validate input
51
+ query = str(query).strip()
52
+ if len(query) < self.min_query_length:
53
  return []
54
+
55
  try:
56
+ # 1. Get initial results from API
57
+ verses = self._fetch_verses(query, top_k)
58
+ if not verses:
59
  return []
60
+
61
+ # 2. Prepare texts for embedding
62
+ texts = [v['text'] for v in verses]
 
63
 
64
+ # 3. Get or create embeddings
65
+ if query in self.embedding_cache:
66
+ query_embedding = self.embedding_cache[query]
67
+ else:
68
+ query_embedding = self.model.encode([query])[0]
69
+ self.embedding_cache[query] = query_embedding
70
+
71
+ verse_embeddings = self.model.encode(texts)
72
 
73
+ # 4. Calculate similarities
74
+ similarities = cosine_similarity(
75
+ [query_embedding],
76
+ verse_embeddings
77
+ )[0]
78
+
79
+ # 5. Combine and sort results
80
+ results = [
81
+ self._process_verse(verse, similarities[i])
82
+ for i, verse in enumerate(verses)
83
+ ]
 
 
 
84
 
85
+ return sorted(results, key=lambda x: float(x['similarity']), reverse=True)
86
+
87
  except Exception as e:
88
+ logging.error(f"Search processing failed: {str(e)}")
89
  return []