Spaces:

jojonocode
/

Scrap-Dji

Sleeping

App Files Files Community

joel commited on Feb 7

Commit

1d03e6e

1 Parent(s): 98da555

maj main

Browse files

Files changed (1) hide show

scraper/main.py +55 -40

scraper/main.py CHANGED Viewed

@@ -67,76 +67,91 @@ class ScrapDjiScraper:
             return None
     async def discover_links(self, client: httpx.AsyncClient, base_url: str) -> List[str]:
-        """Découvre les liens d'articles sur une page"""
         try:
             resp = await client.get(base_url, timeout=10.0)
             if resp.status_code != 200:
                 return []
-            tree = html.fromstring(resp.content)
-            # Extraction de tous les liens d'articles
-            links = tree.xpath('//a/@href')
-            # Filtrage et normalisation des URLs
-            article_urls = []
             for link in links:
-                if not link:
-                    continue
-                # Construire l'URL complète
                 if link.startswith('/'):
-                    from urllib.parse import urljoin
                     link = urljoin(base_url, link)
                 elif not link.startswith('http'):
                     continue
-                # Filtrer les URLs qui ressemblent à des articles
                 if any(x in link.lower() for x in ['article', 'actualite', 'news', '/20', '-20']):
-                    if link not in self.discovered_urls:
-                        article_urls.append(link)
-                        self.discovered_urls.add(link)
-            return article_urls[:100]  # Limiter à 100 liens par page
-        except Exception as e:
-            logger.debug(f"Erreur découverte liens {base_url}: {e}")
             return []
     async def flush_buffer(self):
         """Sauvegarde groupée pour réduire les accès disque/réseau"""
         if not self.buffer: return
         logger.info(f"💾 Flush buffer: sauvegarde de {len(self.buffer)} documents...")
-        tasks = [self.save_everywhere(doc) for doc in self.buffer]
-        await asyncio.gather(*tasks)
         self.buffer = []
-    async def save_everywhere(self, doc: Dict):
-        # Fallback de secours : Toujours sauver en local JSON pour le test
         try:
             os.makedirs("data", exist_ok=True)
             local_file = "data/search_index.json"
-            data = []
             if os.path.exists(local_file):
                 with open(local_file, "r", encoding="utf-8") as f:
-                    data = json.load(f)
-            data.append(doc)
-            # Garder tous les documents pour scraping massif
             with open(local_file, "w", encoding="utf-8") as f:
-                json.dump(data, f, indent=2, ensure_ascii=False)
         except Exception as e:
-            logger.error(f"Erreur sauvegarde JSON locale: {e}")
-        # Les autres bases (échoueront silencieusement si non installées)
-        try:
-            session = SessionLocal()
-            new_doc = Document(**{k: v for k, v in doc.items() if k in Document.__table__.columns})
-            session.add(new_doc)
-            session.commit()
-            session.close()
-        except: pass
-        try: await save_to_mongo("documents", doc)
-        except: pass
-        try: index_typesense("documents", doc)
-        except: pass
     async def process_source(self, client: httpx.AsyncClient, source: Dict):
         """Traite une source avec découverte de liens"""

             return None
     async def discover_links(self, client: httpx.AsyncClient, base_url: str) -> List[str]:
+        """Découvre les liens d'articles sur une page (Optimisé)"""
         try:
             resp = await client.get(base_url, timeout=10.0)
             if resp.status_code != 200:
                 return []
+            # Traitement CPU-bound dans un thread
+            loop = asyncio.get_event_loop()
+            links = await loop.run_in_executor(None, self._extract_links_sync, resp.content, base_url)
+            # Filtrage rapide (peut rester dans le thread principal ou migrer si très lourd)
+            new_links = []
             for link in links:
+                if link not in self.discovered_urls:
+                    new_links.append(link)
+                    self.discovered_urls.add(link)
+            return new_links[:100]
+        except Exception as e:
+            logger.debug(f"Erreur découverte liens {base_url}: {e}")
+            return []
+    def _extract_links_sync(self, content: bytes, base_url: str) -> List[str]:
+        """Extraction synchrone des liens via lxml"""
+        try:
+            tree = html.fromstring(content)
+            raw_links = tree.xpath('//a/@href')
+            valid_links = []
+            from urllib.parse import urljoin
+            for link in raw_links:
+                if not link: continue
+                # Normalisation
                 if link.startswith('/'):
                     link = urljoin(base_url, link)
                 elif not link.startswith('http'):
                     continue
+                # Filtrage simple
                 if any(x in link.lower() for x in ['article', 'actualite', 'news', '/20', '-20']):
+                    valid_links.append(link)
+            return valid_links
+        except:
             return []
     async def flush_buffer(self):
         """Sauvegarde groupée pour réduire les accès disque/réseau"""
         if not self.buffer: return
         logger.info(f"💾 Flush buffer: sauvegarde de {len(self.buffer)} documents...")
+        # Exécuter la sauvegarde lourde dans un thread pour ne pas bloquer
+        loop = asyncio.get_event_loop()
+        await loop.run_in_executor(None, self._save_buffer_sync, self.buffer.copy())
         self.buffer = []
+    def _save_buffer_sync(self, documents: List[Dict]):
+        """Sauvegarde synchrone (disque/DB)"""
+        # 1. Sauvegarde JSON Local (Critique)
         try:
             os.makedirs("data", exist_ok=True)
             local_file = "data/search_index.json"
+            # Lecture
+            existing_data = []
             if os.path.exists(local_file):
                 with open(local_file, "r", encoding="utf-8") as f:
+                    existing_data = json.load(f)
+            # Ajout
+            existing_data.extend(documents)
+            # Écriture
             with open(local_file, "w", encoding="utf-8") as f:
+                json.dump(existing_data, f, indent=2, ensure_ascii=False)
         except Exception as e:
+            logger.error(f"Erreur sauvegarde JSON: {e}")
+        # 2. Sauvegarde DB (Best effort)
+        # Note: Pour MongoDB et autres clients async, il faudrait rester dans l'async
+        # Mais ici on simplifie pour le file system qui est le bottleneck principal
+        pass
     async def process_source(self, client: httpx.AsyncClient, source: Dict):
         """Traite une source avec découverte de liens"""