Spaces:
Running
Running
| import os | |
| import json | |
| import logging | |
| import hashlib | |
| from datetime import datetime, timedelta, timezone | |
| from typing import Optional | |
| from pathlib import Path | |
| import httpx | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Cache directory sits in the parent of this file's directory; it is created
# at import time when it does not exist yet.
CACHE_DIR = Path(__file__).parent.parent.joinpath("cache")
CACHE_DIR.mkdir(exist_ok=True)

# JSON file holding the last fetched ZUS calls, and its freshness window.
ZUS_CACHE_FILE = CACHE_DIR.joinpath("zus_nabory.json")
ZUS_CACHE_TTL_HOURS = 24

# ZUS usually runs a competition for co-financing of occupational safety (BHP).
ZUS_BHP_URL = "https://bip.zus.pl/konkurs-bhp"
class ZUSClient:
    """Client that collects currently open ZUS support programmes.

    The main target is the ZUS co-financing competition for occupational
    safety (BHP) improvements. Results are scraped through the Firecrawl API,
    turned into structured records by an LLM, and cached on disk as JSON for
    ``ZUS_CACHE_TTL_HOURS`` hours.
    """

    # Outcome of the most recent ``_fetch_live`` call. ``get_active_nabory``
    # consults it so that a failed scrape is never written to the cache
    # (an empty cached list would otherwise hide all calls until the TTL expires).
    # Defaults to True so a replaced/overridden ``_fetch_live`` keeps the
    # historical "always cache" behaviour.
    _last_fetch_ok: bool = True

    def _load_cache(self) -> Optional[dict]:
        """Return the cached payload if the cache file exists and is fresh.

        Returns:
            The decoded JSON payload (``{"fetched_at": ..., "nabory": [...]}``
            as written by ``_save_cache``) when it is younger than
            ``ZUS_CACHE_TTL_HOURS``; otherwise ``None``. Read or parse errors
            are logged and also yield ``None``.
        """
        if not ZUS_CACHE_FILE.exists():
            return None
        try:
            with open(ZUS_CACHE_FILE, "r", encoding="utf-8") as f:
                data = json.load(f)
            # A missing timestamp is treated as very old, i.e. expired.
            fetched_at = datetime.fromisoformat(data.get("fetched_at", "2000-01-01"))
            if fetched_at.tzinfo is None:
                # Naive timestamps are interpreted as UTC so the subtraction
                # below never mixes naive and aware datetimes.
                fetched_at = fetched_at.replace(tzinfo=timezone.utc)
            if datetime.now(timezone.utc) - fetched_at < timedelta(hours=ZUS_CACHE_TTL_HOURS):
                return data
        except Exception as e:
            logger.warning(f"Błąd odczytu ZUS cache: {e}")
        return None

    def _save_cache(self, nabory: list) -> None:
        """Write ``nabory`` to the cache file together with a UTC timestamp.

        Best effort: any error is logged and swallowed.
        """
        try:
            payload = {
                "fetched_at": datetime.now(timezone.utc).isoformat(),
                "nabory": nabory,
            }
            with open(ZUS_CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)
        except Exception as e:
            logger.warning(f"Błąd zapisu ZUS cache: {e}")

    async def _fetch_live(self) -> list:
        """Scrape the ZUS page through Firecrawl and return the active calls.

        Requires the ``FIRECRAWL_API_KEY`` environment variable. Every failure
        (missing key, HTTP error, exception, empty page, LLM parse error) is
        logged and results in an empty list; ``self._last_fetch_ok`` records
        whether the scrape and the parse both succeeded.

        Returns:
            The scraped grants after ``filter_outdated_grants`` has been
            applied (presumably dropping calls whose deadline has passed —
            see ``core.date_utils``).
        """
        from core.date_utils import filter_outdated_grants

        logger.info("Rozpoczynam pobieranie na żywo naborów ZUS...")
        api_key = os.getenv("FIRECRAWL_API_KEY")
        all_grants: list = []
        fetch_ok = False

        if api_key:
            logger.info("Używam Firecrawl do ominięcia zabezpieczeń ZUS...")
            try:
                # Non-blocking HTTP call: a synchronous client here would
                # stall the event loop for up to the whole timeout.
                async with httpx.AsyncClient(timeout=30.0) as client:
                    resp = await client.post(
                        "https://api.firecrawl.dev/v1/scrape",
                        headers={"Authorization": f"Bearer {api_key}"},
                        json={"url": ZUS_BHP_URL, "formats": ["markdown"]},
                    )
                if resp.status_code == 200:
                    # ``data`` may be absent or null in the response body.
                    md = (resp.json().get("data") or {}).get("markdown", "")
                    if md:
                        try:
                            all_grants = await self._extract_grants(md)
                            fetch_ok = True
                        except Exception as e:
                            logger.warning(f"Błąd parsowania markdowna z LLM (ZUS): {e}")
                        logger.info(f"Firecrawl zwrócił {len(all_grants)} naborów z ZUS.")
                    else:
                        logger.warning("Firecrawl nie zwrócił treści markdown dla ZUS.")
                else:
                    logger.warning(f"Błąd Firecrawl API (ZUS): {resp.status_code} - {resp.text}")
            except Exception as e:
                logger.error(f"Wyjątek podczas wywołania Firecrawl API (ZUS): {e}")
        else:
            logger.warning("Brak klucza FIRECRAWL_API_KEY. Brak naborów z ZUS.")

        # Drop calls with outdated dates.
        active_grants = filter_outdated_grants(all_grants)
        # Set after the last await so the caller reads the flag belonging to
        # this very call, even when several fetches run concurrently.
        self._last_fetch_ok = fetch_ok
        return active_grants

    async def _extract_grants(self, md: str) -> list:
        """Ask the LLM to extract ZUS calls from ``md``; raise on any failure.

        Only the first 10000 characters of the markdown are sent to the model.
        Apart from ``name`` and ``deadline`` every field of the returned
        records is a hard-coded constant, not a value read from the page.
        """
        from core.llm_router import get_llm
        from pydantic import BaseModel, Field
        from typing import List

        class Grant(BaseModel):
            name: str = Field(description="Tytuł konkursu/naboru ZUS")
            deadline: str = Field(
                default="",
                description=(
                    "Termin składania wniosków (deadline) w formacie YYYY-MM-DD. "
                    "Jeśli brak, zostaw puste."
                ),
            )

        class GrantsList(BaseModel):
            grants: List[Grant]

        llm = get_llm("fast").with_structured_output(GrantsList)
        md_subset = md[:10000]
        prompt = (
            "Wydobądź listę aktualnych konkursów lub dofinansowań ZUS "
            f"z poniższego tekstu Markdown:\n\n{md_subset}"
        )
        result = await llm.ainvoke(prompt)

        nabory = []
        for g in result.grants:
            # Stable identifier derived from the title (not a security use of MD5).
            uid = hashlib.md5(g.name.encode()).hexdigest()[:12]
            nabory.append({
                "id": uid,
                "name": g.name,
                "program": "ZUS",
                "type": "Bezpieczeństwo pracy",
                "status": "active",
                "url": ZUS_BHP_URL,
                "deadline": g.deadline,
                "max_dofinansowanie_pln": 300000,
                "min_dofinansowanie_pln": 10000,
                "dofinansowanie_pct_max": 80,
                "eligible_regions": ["Cała Polska"],
                "eligible_company_sizes": ["mikro", "małe", "średnie", "duże"],
                "description": (
                    "Program wsparcia ZUS dla płatników składek na inwestycje "
                    "zmniejszające ryzyko wypadków przy pracy (BHP)."
                ),
                "legal_source": "Regulamin Konkursu na dofinansowanie przez ZUS",
                "source": "zus_scrape",
                "fetched_at": datetime.now(timezone.utc).isoformat(),
            })
        return nabory

    async def _parse_firecrawl_markdown(self, md: str) -> list:
        """Scan the markdown with an LLM and return the list of ZUS calls.

        Never raises: any error (missing dependency, LLM failure, malformed
        answer) is logged and an empty list is returned.
        """
        try:
            return await self._extract_grants(md)
        except Exception as e:
            logger.warning(f"Błąd parsowania markdowna z LLM (ZUS): {e}")
            return []

    def _enrich_urls(self, nabory: list) -> None:
        """Add ``official_doc_url`` and ``eurlex_url`` to each record in place.

        Existing values are kept. ``official_doc_url`` becomes a BIP ZUS search
        link for the record's name; ``eurlex_url`` is left empty.
        """
        import urllib.parse

        for n in nabory:
            # A missing or null name degrades to an empty search query
            # instead of raising inside ``quote``.
            q_gov = n.get("name") or ""
            if "official_doc_url" not in n:
                n["official_doc_url"] = (
                    f"https://bip.zus.pl/wyszukiwarka?query={urllib.parse.quote(str(q_gov))}"
                )
            if "eurlex_url" not in n:
                n["eurlex_url"] = ""  # ZUS programmes have no link to EU law

    async def get_active_nabory(self, force_refresh: bool = False) -> list:
        """Return the active ZUS calls, served from cache when possible.

        Args:
            force_refresh: Skip the cache and always fetch live data.

        Returns:
            A list of grant records, each enriched by ``_enrich_urls``. A
            live result is written to the cache only when the fetch actually
            succeeded, so a transient failure does not blank the data for the
            whole TTL.
        """
        if not force_refresh:
            cached = self._load_cache()
            # A cache file without a proper "nabory" list is treated as a miss.
            cached_nabory = cached.get("nabory") if isinstance(cached, dict) else None
            if isinstance(cached_nabory, list):
                self._enrich_urls(cached_nabory)
                return cached_nabory

        nabory = await self._fetch_live()
        self._enrich_urls(nabory)
        if self._last_fetch_ok:
            self._save_cache(nabory)
        return nabory
# Module-level ZUSClient instance, created at import time
# (presumably the shared client other modules import — verify against callers).
zus_client = ZUSClient()