Batnini committed on
Commit
860a19b
·
verified ·
1 Parent(s): cede722

Update tools/quran_search.py

Browse files
Files changed (1) hide show
  1. tools/quran_search.py +93 -63
tools/quran_search.py CHANGED
@@ -5,100 +5,130 @@ from sklearn.metrics.pairwise import cosine_similarity
5
  import numpy as np
6
  import requests
7
 
8
- # Configure logging
9
- logging.basicConfig(
10
- level=logging.INFO,
11
- format='%(asctime)s - %(levelname)s - %(message)s',
12
- handlers=[
13
- logging.StreamHandler(),
14
- logging.FileHandler('app.log')
15
- ]
16
- )
17
-
18
- # Quran data configuration
19
- QURAN_DATA_SOURCES = [
20
- "https://cdn.jsdelivr.net/gh/mafahim/quran-json/quran_clean.csv",
21
- "https://raw.githubusercontent.com/mafahim/quran-json/main/quran_clean.csv",
22
- "https://gitlab.com/mafahim/quran-json/-/raw/main/quran_clean.csv"
23
- ]
24
-
25
- # Model configuration
26
- MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
27
- CHUNK_SIZE = 50 # For memory management
28
-
29
  class QuranSearchEngine:
30
  def __init__(self):
31
  self.data_loaded = False
32
  self.model = None
33
  self.verse_embeddings = None
34
  self.quran_df = None
35
- self.surah_names = {
36
- 1: "الفاتحة", 2: "البقرة", 3: "آل عمران",
37
- 4: "النساء", 5: "المائدة", 6: "الأنعام",
38
- 114: "الناس"
39
- }
40
  self.load_data()
41
 
42
  def load_data(self):
43
  if not self.data_loaded:
44
  try:
45
- # Load from the first available API source
46
- for source in QURAN_DATA_SOURCES:
47
- response = requests.get(source, timeout=10)
48
- if response.status_code == 200:
49
- verses = response.json().get('verses', [])
50
- verses_data = []
51
- for verse in verses:
52
- verses_data.append({
53
- 'surah': verse['chapter_id'],
54
- 'ayah': verse['verse_number'],
55
- 'text': ' '.join([w['text_uthmani'] for w in verse['words']])
56
- })
57
- self.quran_df = pd.DataFrame(verses_data)
58
- self.model = SentenceTransformer(MODEL_NAME)
59
- self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
60
- self.data_loaded = True
61
- logging.info("Quran data loaded successfully.")
62
- return
63
- logging.error("Failed to load Quran data from all sources.")
 
 
 
 
 
 
 
 
 
 
64
  except Exception as e:
65
- logging.error(f"Error loading data: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
 
67
  def search(self, query, top_k=5):
68
  if not self.data_loaded:
69
- logging.error("Data not loaded properly.")
70
  return []
71
-
72
  try:
 
 
 
 
 
 
73
  query_embedding = self.model.encode([query])
74
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
 
 
75
  top_indices = np.argsort(similarities)[-top_k:][::-1]
76
 
 
77
  results = []
78
  for idx in top_indices:
79
  verse = self.quran_df.iloc[idx]
80
  results.append({
81
- "surah": self.surah_names.get(verse['surah'], "سورة غير معروفة"),
82
  "ayah": verse['ayah'],
83
  "text": verse['text'],
84
  "similarity": f"{similarities[idx]:.2f}",
85
  "surah_num": verse['surah'],
86
  "ayah_num": verse['ayah']
87
  })
 
88
  return results
89
 
90
  except Exception as e:
91
- logging.error(f"Search Error: {str(e)}")
92
- return []
93
-
94
- # Example usage
95
- if __name__ == "__main__":
96
- quran_searcher = QuranSearchEngine()
97
- query = "العدل في الإسلام"
98
- results = quran_searcher.search(query, top_k=5)
99
-
100
- for result in results:
101
- print(f"سورة: {result['surah']} ({result['surah_num']}:{result['ayah_num']})")
102
- print(f"**التشابه**: {result['similarity']}")
103
- print(f"{result['text']}")
104
- print("---")
 
5
  import numpy as np
6
  import requests
7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  class QuranSearchEngine:
9
  def __init__(self):
10
  self.data_loaded = False
11
  self.model = None
12
  self.verse_embeddings = None
13
  self.quran_df = None
14
+ self.surah_names = {}
15
+ self.base_api_url = "https://quranapi.pages.dev/api/verses"
16
+
17
+ # Initialize with progress tracking
 
18
  self.load_data()
19
 
20
    def load_data(self):
        """Build the searchable corpus: fetch every verse, then embed it.

        Runs once (guarded by ``data_loaded``). Any failure anywhere in the
        pipeline is caught and routed to ``_load_backup_data()`` so the
        engine stays usable with a minimal corpus.
        """
        if not self.data_loaded:
            try:
                # Step 1: Load Surah names
                self._load_surah_names()

                # Step 2: Fetch verses in batches
                all_verses = []
                for surah_num in range(1, 115):  # All 114 Surahs
                    verses = self._fetch_verses(surah_num)
                    if verses:
                        all_verses.extend(verses)

                # Step 3: Create DataFrame
                self.quran_df = pd.DataFrame(all_verses)

                # Step 4: Initialize model
                self.model = SentenceTransformer(
                    'paraphrase-multilingual-MiniLM-L12-v2',
                    device='cpu'
                )

                # Step 5: Generate embeddings in chunks of 100 texts
                texts = self.quran_df['text'].tolist()
                self.verse_embeddings = np.concatenate([
                    self.model.encode(texts[i:i+100])
                    for i in range(0, len(texts), 100)
                ])

                self.data_loaded = True
                logging.info("Quran data loaded successfully")

            except Exception as e:
                # NOTE(review): an entirely empty fetch also lands here —
                # np.concatenate([]) raises — which is what triggers the
                # backup-data path. Confirm this is intentional.
                logging.error(f"Data loading failed: {str(e)}")
                self._load_backup_data()
56
+ def _load_surah_names(self):
57
+ """Fetch surah names from API"""
58
+ try:
59
+ response = requests.get(f"{self.base_api_url}/surahs")
60
+ if response.status_code == 200:
61
+ surahs = response.json()
62
+ self.surah_names = {s['number']: s['name'] for s in surahs}
63
+ except Exception as e:
64
+ logging.warning(f"Couldn't fetch surah names: {str(e)}")
65
+ # Fallback to minimal names
66
+ self.surah_names = {i: f"سورة {i}" for i in range(1, 115)}
67
+
68
+ def _fetch_verses(self, surah_num):
69
+ """Fetch verses for a specific surah"""
70
+ try:
71
+ response = requests.get(
72
+ f"{self.base_api_url}/{surah_num}",
73
+ timeout=10
74
+ )
75
+ if response.status_code == 200:
76
+ verses_data = response.json()
77
+ return [{
78
+ 'surah': surah_num,
79
+ 'ayah': v['verse'],
80
+ 'text': v['text'],
81
+ 'surah_name': self.surah_names.get(surah_num, "")
82
+ } for v in verses_data]
83
+ except Exception as e:
84
+ logging.warning(f"Failed to fetch surah {surah_num}: {str(e)}")
85
+ return []
86
+
87
+ def _load_backup_data(self):
88
+ """Emergency fallback"""
89
+ backup = [
90
+ {"surah": 1, "ayah": 1, "text": "بسم الله الرحمن الرحيم", "surah_name": "الفاتحة"},
91
+ {"surah": 2, "ayah": 255, "text": "الله لا إله إلا هو الحي القيوم...", "surah_name": "البقرة"},
92
+ {"surah": 36, "ayah": 1, "text": "يس والقرآن الحكيم", "surah_name": "يس"}
93
+ ]
94
+ self.quran_df = pd.DataFrame(backup)
95
+ self.model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
96
+ self.verse_embeddings = self.model.encode(self.quran_df['text'].tolist())
97
+ self.data_loaded = True
98
+ logging.warning("Using backup data")
99
 
100
  def search(self, query, top_k=5):
101
  if not self.data_loaded:
 
102
  return []
103
+
104
  try:
105
+ # Clean and validate query
106
+ query = str(query).strip()
107
+ if len(query) < 2:
108
+ return []
109
+
110
+ # Encode query and calculate similarities
111
  query_embedding = self.model.encode([query])
112
  similarities = cosine_similarity(query_embedding, self.verse_embeddings)[0]
113
+
114
+ # Get top results
115
  top_indices = np.argsort(similarities)[-top_k:][::-1]
116
 
117
+ # Format results
118
  results = []
119
  for idx in top_indices:
120
  verse = self.quran_df.iloc[idx]
121
  results.append({
122
+ "surah": verse['surah_name'],
123
  "ayah": verse['ayah'],
124
  "text": verse['text'],
125
  "similarity": f"{similarities[idx]:.2f}",
126
  "surah_num": verse['surah'],
127
  "ayah_num": verse['ayah']
128
  })
129
+
130
  return results
131
 
132
  except Exception as e:
133
+ logging.error(f"Search error: {str(e)}")
134
+ return []