Spaces:
Running
Running
| import logging | |
| import asyncio | |
| import sys | |
| import os | |
# Extend the import path with the directory two levels above this file
# (presumably the repository root, so that the `backend` package resolves when
# the module is executed directly as a script — confirm against the layout).
sys.path.append(os.path.join(os.path.dirname(__file__), "..", ".."))

# Created before the guarded imports so that an import failure can be logged.
logger = logging.getLogger(__name__)

try:
    from backend.integrations.isap_client import ISAPClient
    from backend.integrations.parp_client import PARPClient
    from backend.rag_pipeline.scraper import scrape_grant_url
    from backend.rag_pipeline.ingest import process_and_ingest
except ImportError as import_error:
    # Degrade gracefully instead of failing at import time: every optional
    # name is reset to None so later code can test for availability rather
    # than hitting a NameError, and the root cause is recorded in the log.
    logger.warning(
        "[AGENT] Nie udało się zaimportować modułów backendu: %s", import_error
    )
    ISAPClient = None
    PARPClient = None
    scrape_grant_url = None
    process_and_ingest = None
class ScraperAgent:
    """Agent that decides how to obtain data for the knowledge base.

    Intended to be driven by Celery Beat / APScheduler for periodic updates.
    Each run pulls grant pages listed by the PARP client and one legal act
    from the ISAP client, scrapes their text and ingests it into a shared
    public namespace.
    """

    def __init__(self) -> None:
        """Instantiate the ISAP and PARP clients when both classes are available.

        If either client class is missing (falsy at module level), both client
        attributes are set to None and ``run_sync_job`` becomes a no-op.
        """
        if ISAPClient and PARPClient:
            self.isap_client = ISAPClient()
            self.parp_client = PARPClient()
        else:
            self.isap_client = None
            self.parp_client = None

        # Namespace for the publicly accessible store of legal acts and
        # grant regulations.
        self.public_namespace = "public_legal"

    async def _scrape_and_ingest(self, url, priority, error_label) -> bool:
        """Scrape ``url`` and ingest any non-empty text into the public namespace.

        Args:
            url: Address passed to ``scrape_grant_url``.
            priority: Priority label forwarded to ``process_and_ingest``.
            error_label: Message prefix used when the attempt fails.

        Returns:
            True when text was scraped and handed to ``process_and_ingest``,
            False when the page yielded no text or an error occurred.
        """
        try:
            text, _ = await scrape_grant_url(url)
            if text:
                process_and_ingest(
                    text, url, priority=priority, namespace=self.public_namespace
                )
                return True
        except Exception as exc:
            # Best-effort boundary: one failing source must not abort the
            # whole synchronization cycle, so log (with traceback) and go on.
            logger.exception("[AGENT] %s %s: %s", error_label, url, exc)
        return False

    async def run_sync_job(self) -> None:
        """Run one synchronization cycle for grants (PARP) and law (ISAP).

        Does nothing except log an error when either client is unavailable.
        Malformed records (missing URL) are skipped with a warning, and
        scraping/ingestion errors are logged per source.

        Raises:
            Exception: whatever ``fetch_grants()`` or ``fetch_act()`` raise;
                those calls are not guarded here.
        """
        if not self.isap_client or not self.parp_client:
            logger.error("[AGENT] Brak klientow ISAP/PARP. Synchronizacja anulowana.")
            return

        logger.info(
            "[AGENT] Rozpoczęcie automatycznego zadania synchronizacji bazy wiedzy..."
        )

        # NOTE(review): fetch_grants(), fetch_act() and process_and_ingest() are
        # called synchronously inside this coroutine; if they perform blocking
        # I/O they will stall the event loop — confirm and consider
        # asyncio.to_thread().

        # 1. PARP (call-for-proposals regulations)
        grants = self.parp_client.fetch_grants() or []  # tolerate a None result
        for grant in grants:
            url = grant.get("url")
            if not url:
                logger.warning("[AGENT] Pominięto dotację bez adresu URL: %r", grant)
                continue
            logger.info(
                "[AGENT] Zlecam scrapowanie dla dotacji: %s", grant.get("id")
            )
            await self._scrape_and_ingest(url, "high", "Błąd fetchowania")

        # 2. ISAP (Act of 6 March 2018 - Entrepreneurs' Law)
        logger.info("[AGENT] Pobieranie ram prawnych (ISAP)")
        act_info = self.isap_client.fetch_act("WDU", 2018, 646)
        if act_info:
            url = act_info.get("text_url")
            if url:
                # Per the original author's note, Firecrawl copes with the
                # consolidated PDF served inside ISAP's HTML wrapper.
                await self._scrape_and_ingest(
                    url, "critical", "Błąd parsowania ISAP"
                )
            else:
                logger.warning("[AGENT] Akt ISAP nie zawiera pola text_url. Pomijam.")

        logger.info("[AGENT] Zakończono automatyczny cykl agenta synchronizacyjnego.")
if __name__ == "__main__":
    # Manual run: enable INFO-level logging, then execute a single
    # synchronization cycle to completion on a fresh event loop.
    logging.basicConfig(level=logging.INFO)
    asyncio.run(ScraperAgent().run_sync_job())