Batnini committed on
Commit
cede722
·
verified ·
1 Parent(s): a486f87

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +60 -36
tools/quran_search.py CHANGED
@@ -1,9 +1,30 @@
 
1
  import pandas as pd
2
  from sentence_transformers import SentenceTransformer
3
  from sklearn.metrics.pairwise import cosine_similarity
4
  import numpy as np
5
  import requests
6
- import logging
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
 
8
  class QuranSearchEngine:
9
  def __init__(self):
@@ -16,47 +37,38 @@ class QuranSearchEngine:
16
  4: "ุงู„ู†ุณุงุก", 5: "ุงู„ู…ุงุฆุฏุฉ", 6: "ุงู„ุฃู†ุนุงู…",
17
  114: "ุงู„ู†ุงุณ"
18
  }
 
19
 
20
  def load_data(self):
21
  if not self.data_loaded:
22
  try:
23
- # Try to load from API
24
- verses_url = "https://api.quran.com/api/v4/verses/by_chapter/2?language=ar&words=true"
25
- response = requests.get(verses_url, timeout=10)
26
- verses = response.json().get('verses', [])
27
-
28
- verses_data = []
29
- for verse in verses:
30
- verses_data.append({
31
- 'surah': verse['chapter_id'],
32
- 'ayah': verse['verse_number'],
33
- 'text': ' '.join([w['text_uthmani'] for w in verse['words']])
34
- })
35
-
36
- self.quran_df = pd.DataFrame(verses_data)
37
- self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
38
- self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
39
- self.data_loaded = True
40
-
 
41
  except Exception as e:
42
- logging.error(f"API Error: {str(e)}")
43
- self.load_backup_data()
44
-
45
- def load_backup_data(self):
46
- """Load backup data if API fails"""
47
- backup = [
48
- {"surah": 2, "ayah": 163, "text": "ูˆุฅู„ู‡ูƒู… ุฅู„ู‡ ูˆุงุญุฏ ู„ุง ุฅู„ู‡ ุฅู„ุง ู‡ูˆ ุงู„ุฑุญู…ู† ุงู„ุฑุญูŠู…"},
49
- {"surah": 3, "ayah": 134, "text": "ุงู„ุฐูŠู† ูŠู†ูู‚ูˆู† ููŠ ุงู„ุณุฑุงุก ูˆุงู„ุถุฑุงุก ูˆุงู„ูƒุงุธู…ูŠู† ุงู„ุบูŠุธ ูˆุงู„ุนุงููŠู† ุนู† ุงู„ู†ุงุณ ูˆุงู„ู„ู‡ ูŠุญุจ ุงู„ู…ุญุณู†ูŠู†"},
50
- {"surah": 4, "ayah": 135, "text": "ูŠุง ุฃูŠู‡ุง ุงู„ุฐูŠู† ุขู…ู†ูˆุง ูƒูˆู†ูˆุง ู‚ูˆุงู…ูŠู† ุจุงู„ู‚ุณุท ุดู‡ุฏุงุก ู„ู„ู‡ ูˆู„ูˆ ุนู„ู‰ ุฃู†ูุณูƒู… ุฃูˆ ุงู„ูˆุงู„ุฏูŠู† ูˆุงู„ุฃู‚ุฑุจูŠู†"}
51
- ]
52
- self.quran_df = pd.DataFrame(backup)
53
- if not hasattr(self, 'model'):
54
- self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
55
- self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
56
- self.data_loaded = True
57
 
58
  def search(self, query, top_k=5):
59
- self.load_data()
 
 
 
60
  try:
61
  query_embedding = self.model.encode([query])
62
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
@@ -77,4 +89,16 @@ class QuranSearchEngine:
77
 
78
  except Exception as e:
79
  logging.error(f"Search Error: {str(e)}")
80
- return []
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
  import pandas as pd
3
  from sentence_transformers import SentenceTransformer
4
  from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
6
  import requests
7
+
8
+ # Configure logging
9
+ logging.basicConfig(
10
+ level=logging.INFO,
11
+ format='%(asctime)s - %(levelname)s - %(message)s',
12
+ handlers=[
13
+ logging.StreamHandler(),
14
+ logging.FileHandler('app.log')
15
+ ]
16
+ )
17
+
18
+ # Quran data configuration
19
+ QURAN_DATA_SOURCES = [
20
+ "https://cdn.jsdelivr.net/gh/mafahim/quran-json/quran_clean.csv",
21
+ "https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv",
22
+ "https://gitlab.com/mafahim/quran-json/-/raw/main/quran_clean.csv"
23
+ ]
24
+
25
+ # Model configuration
26
+ MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
27
+ CHUNK_SIZE = 50 # For memory management
28
 
29
  class QuranSearchEngine:
30
  def __init__(self):
 
37
  4: "ุงู„ู†ุณุงุก", 5: "ุงู„ู…ุงุฆุฏุฉ", 6: "ุงู„ุฃู†ุนุงู…",
38
  114: "ุงู„ู†ุงุณ"
39
  }
40
+ self.load_data()
41
 
42
  def load_data(self):
43
  if not self.data_loaded:
44
  try:
45
+ # Load from the first available API source
46
+ for source in QURAN_DATA_SOURCES:
47
+ response = requests.get(source, timeout=10)
48
+ if response.status_code == 200:
49
+ verses = response.json().get('verses', [])
50
+ verses_data = []
51
+ for verse in verses:
52
+ verses_data.append({
53
+ 'surah': verse['chapter_id'],
54
+ 'ayah': verse['verse_number'],
55
+ 'text': ' '.join([w['text_uthmani'] for w in verse['words']])
56
+ })
57
+ self.quran_df = pd.DataFrame(verses_data)
58
+ self.model = SentenceTransformer(MODEL_NAME)
59
+ self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
60
+ self.data_loaded = True
61
+ logging.info("Quran data loaded successfully.")
62
+ return
63
+ logging.error("Failed to load Quran data from all sources.")
64
  except Exception as e:
65
+ logging.error(f"Error loading data: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def search(self, query, top_k=5):
68
+ if not self.data_loaded:
69
+ logging.error("Data not loaded properly.")
70
+ return []
71
+
72
  try:
73
  query_embedding = self.model.encode([query])
74
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
 
89
 
90
  except Exception as e:
91
  logging.error(f"Search Error: {str(e)}")
92
+ return []
93
+
94
+ # Example usage
95
+ if __name__ == "__main__":
96
+ quran_searcher = QuranSearchEngine()
97
+ query = "ุงู„ุนุฏู„ ููŠ ุงู„ุฅุณู„ุงู…"
98
+ results = quran_searcher.search(query, top_k=5)
99
+
100
+ for result in results:
101
+ print(f"ุณูˆุฑุฉ: {result['surah']} ({result['surah_num']}:{result['ayah_num']})")
102
+ print(f"**ุงู„ุชุดุงุจู‡**: {result['similarity']}")
103
+ print(f"{result['text']}")
104
+ print("---")