Spaces:

Bogdan555
/

grantforge-api

Sleeping

File size: 9,311 Bytes

afd56bc

"""
Klient HTTP do API NCBR (Narodowe Centrum Badań i Rozwoju).
Pobiera aktualne konkursy i nabory z zakresu B+R+I.

Źródła:
- https://www.ncbr.gov.pl/programy/ (scraping)
- Oficjalny JSON feed jeśli dostępny

Cache: współdzielony z PARP — folder cache/, TTL ustawiany przez NCBR_CACHE_TTL_HOURS (obecnie 4h).
"""

import os
import json
import logging
import hashlib
from datetime import datetime, timedelta, timezone
from typing import Optional
from pathlib import Path

import httpx

# Module-wide logger, named after this module.
logger = logging.getLogger(__name__)

# Root of the NCBR website and the programme listing page that gets scraped.
NCBR_BASE_URL = "https://www.ncbr.gov.pl"
NCBR_PROGRAMS_URL = NCBR_BASE_URL + "/programy/"

# The on-disk cache lives in a "cache" directory inside the parent of this
# file's directory; the directory is created at import time if it is missing.
CACHE_DIR = Path(__file__).parent.parent.joinpath("cache")
CACHE_DIR.mkdir(exist_ok=True)
NCBR_CACHE_FILE = CACHE_DIR.joinpath("ncbr_nabory.json")
# Maximum age, in hours, of a cache payload that is still served.
NCBR_CACHE_TTL_HOURS = 4




class NCBRClient:
    """
    Client that fetches the currently open NCBR calls for proposals.

    ``get_active_nabory`` first tries the on-disk JSON cache
    (``NCBR_CACHE_FILE``, served while younger than ``NCBR_CACHE_TTL_HOURS``
    hours) and otherwise performs a live fetch: Firecrawl + LLM extraction
    when ``FIRECRAWL_API_KEY`` is set, with a plain HTTPX/BeautifulSoup scrape
    as the fallback.
    """

    def _load_cache(self) -> Optional[dict]:
        """Return the cached payload if it is present, well-formed and fresh.

        Returns:
            The parsed cache dict (keys ``fetched_at`` and ``nabory``), or
            ``None`` when the file is missing, unreadable, malformed or older
            than ``NCBR_CACHE_TTL_HOURS`` hours.
        """
        if not NCBR_CACHE_FILE.exists():
            return None
        try:
            with open(NCBR_CACHE_FILE, "r", encoding="utf-8") as f:
                data = json.load(f)
            if not isinstance(data.get("nabory"), list):
                # A payload without a list of calls is unusable; treating it
                # as a miss forces a refetch instead of a KeyError downstream.
                logger.warning("Nieprawidłowa struktura NCBR cache — brak listy naborów.")
                return None
            fetched_at = datetime.fromisoformat(data.get("fetched_at", "2000-01-01"))
            if fetched_at.tzinfo is None:
                # Naive timestamps are interpreted as UTC so they can be
                # compared with the timezone-aware "now" below.
                fetched_at = fetched_at.replace(tzinfo=timezone.utc)
            if datetime.now(timezone.utc) - fetched_at < timedelta(
                hours=NCBR_CACHE_TTL_HOURS
            ):
                logger.info(f"NCBR cache hit — {len(data['nabory'])} naborów.")
                return data
        except Exception as e:
            # Best effort: any cache problem simply results in a live fetch.
            logger.warning(f"Błąd odczytu NCBR cache: {e}")
        return None

    def _save_cache(self, nabory: list) -> None:
        """Write ``nabory`` to the cache file together with a UTC timestamp.

        Failures are logged and swallowed; caching is best effort.
        """
        try:
            payload = {
                "fetched_at": datetime.now(timezone.utc).isoformat(),
                "nabory": nabory,
            }
            with open(NCBR_CACHE_FILE, "w", encoding="utf-8") as f:
                json.dump(payload, f, ensure_ascii=False, indent=2)
            logger.info(f"Zapisano {len(nabory)} naborów NCBR do cache.")
        except Exception as e:
            logger.warning(f"Błąd zapisu NCBR cache: {e}")

    async def _fetch_via_firecrawl(self, api_key: str) -> list:
        """Scrape the programme page through Firecrawl and extract calls.

        Uses the async HTTPX client so the event loop is not blocked while
        waiting for the Firecrawl response.

        Returns:
            The list produced by ``_parse_firecrawl_markdown``, or ``[]`` on a
            non-200 response, empty markdown or any exception.
        """
        logger.info("Używam Firecrawl do ominięcia zabezpieczeń NCBR...")
        try:
            async with httpx.AsyncClient(timeout=30.0) as client:
                resp = await client.post(
                    "https://api.firecrawl.dev/v1/scrape",
                    headers={"Authorization": f"Bearer {api_key}"},
                    json={"url": NCBR_PROGRAMS_URL, "formats": ["markdown"]},
                )
            if resp.status_code != 200:
                logger.warning(f"Błąd Firecrawl API (NCBR): {resp.status_code} - {resp.text}")
                return []
            md = resp.json().get("data", {}).get("markdown", "")
            if not md:
                return []
            grants = await self._parse_firecrawl_markdown(md)
            logger.info(f"Firecrawl zwrócił {len(grants)} naborów z NCBR.")
            return grants
        except Exception as e:
            logger.error(f"Wyjątek podczas wywołania Firecrawl API (NCBR): {e}")
            return []

    async def _fetch_via_httpx(self) -> list:
        """Download the programme page directly and parse it with ``_parse_html``.

        Returns:
            The parsed list of calls, or ``[]`` on a non-200 response or any
            exception.
        """
        logger.info("Próba pobrania przez HTTPX / BeautifulSoup...")
        headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"}
        try:
            # NOTE(review): verify=False disables TLS certificate validation,
            # so the scraped content is not authenticated. Kept as-is because
            # it may be a deliberate workaround — confirm and remove if possible.
            async with httpx.AsyncClient(timeout=15.0, follow_redirects=True, verify=False) as client:
                response = await client.get(NCBR_PROGRAMS_URL, headers=headers)
            if response.status_code == 200:
                grants = self._parse_html(response.text)
                logger.info(f"HTTPX zwrócił {len(grants)} naborów ze strony NCBR.")
                return grants
            logger.warning(f"Nieoczekiwany status HTTP ze strony NCBR: {response.status_code}")
        except Exception as e:
            logger.error(f"Błąd podczas scrapowania NCBR HTML: {e}")
        return []

    async def _fetch_live(self) -> list:
        """
        Fetch the current list of calls from the live NCBR site.

        Firecrawl is tried first when ``FIRECRAWL_API_KEY`` is set; if it is
        not set or yields nothing, the page is scraped directly with HTTPX.
        The combined result is passed through ``filter_outdated_grants``
        before being returned.
        """
        from core.date_utils import filter_outdated_grants

        logger.info("Rozpoczynam pobieranie na żywo naborów NCBR...")
        api_key = os.getenv("FIRECRAWL_API_KEY")

        all_grants = []
        if api_key:
            all_grants = await self._fetch_via_firecrawl(api_key)
        else:
            logger.warning("Brak klucza FIRECRAWL_API_KEY. Zostanie użyty parser HTTPX.")

        if not all_grants:
            all_grants = await self._fetch_via_httpx()

        # Drop entries that the project's date helper considers outdated.
        return filter_outdated_grants(all_grants)

    async def _parse_firecrawl_markdown(self, md: str) -> list:
        """Extract a list of calls from scraped markdown using the project LLM.

        Only the first 10000 characters of ``md`` are sent to the model.

        Returns:
            A list of dicts (``id``, ``name``, ``program``, ``status``, ``url``,
            ``deadline``, ``source``, ``fetched_at``), or ``[]`` if anything
            fails.
        """
        try:
            from core.llm_router import get_llm
            from pydantic import BaseModel, Field
            from typing import List

            class Grant(BaseModel):
                name: str = Field(description="Tytuł naboru/grantu/konkursu NCBR")
                # Defaulted so that one entry without a URL does not fail
                # validation of the whole list.
                url: str = Field(default="", description="Adres URL do naboru, jeśli podany. Pozostaw puste jeśli brak.")
                deadline: str = Field(default="", description="Termin składania wniosków (deadline) w formacie YYYY-MM-DD. Jeśli podano tylko do kiedy, zgadnij datę. Jeśli brak, zostaw puste.")

            class GrantsList(BaseModel):
                grants: List[Grant]

            llm = get_llm("fast").with_structured_output(GrantsList)
            md_subset = md[:10000]
            prompt = f"Wydobądź listę aktualnych naborów lub programów dotacyjnych z poniższego tekstu Markdown:\n\n{md_subset}"

            result = await llm.ainvoke(prompt)
            nabory = []
            for g in result.grants:
                # Short identifier derived from the title (not a security hash).
                uid = hashlib.md5(g.name.encode()).hexdigest()[:12]
                if g.url and g.url.startswith("http"):
                    url = g.url
                elif g.url and g.url.startswith("/"):
                    url = NCBR_BASE_URL + g.url
                else:
                    # Missing or unrecognised link: point at the listing page.
                    url = NCBR_PROGRAMS_URL

                nabory.append({
                    "id": uid,
                    "name": g.name,
                    "program": "NCBR",
                    "status": "active",
                    "url": url,
                    "deadline": g.deadline,
                    "source": "ncbr_scrape",
                    "fetched_at": datetime.now(timezone.utc).isoformat(),
                })
            return nabory
        except Exception as e:
            logger.warning(f"Błąd parsowania markdowna z LLM (NCBR): {e}")
            return []

    def _parse_html(self, html: str) -> list:
        """Parse the NCBR programme page HTML with a simplified selector set.

        At most the first 15 elements matching ``.program-item``, ``.tile`` or
        ``article`` are converted.

        Returns:
            A list of dicts (``id``, ``name``, ``program``, ``status``, ``url``,
            ``source``, ``fetched_at``), or ``[]`` if parsing fails.
        """
        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(html, "html.parser")
            items = soup.select(".program-item, .tile, article")[:15]
            nabory = []
            for item in items:
                title_el = item.select_one("h2, h3, .title, a")
                title = (
                    title_el.get_text(strip=True) if title_el else "Nieznany program"
                )
                link_el = item.select_one("a[href]")
                if link_el:
                    href = link_el["href"]
                    if href.startswith("http"):
                        url = href
                    elif href.startswith("/"):
                        url = NCBR_BASE_URL + href
                    else:
                        url = NCBR_BASE_URL + "/" + href
                else:
                    url = NCBR_BASE_URL
                # Short identifier derived from the title (not a security hash).
                uid = hashlib.md5(title.encode()).hexdigest()[:12]
                nabory.append(
                    {
                        "id": uid,
                        "name": title,
                        "program": "NCBR",
                        "status": "active",
                        "url": url,
                        "source": "ncbr_scrape",
                        "fetched_at": datetime.now(timezone.utc).isoformat(),
                    }
                )
            return nabory
        except Exception as e:
            # Still best effort, but the failure is logged rather than hidden.
            logger.warning(f"Błąd parsowania HTML NCBR: {e}")
            return []

    def _enrich_urls(self, nabory: list) -> None:
        """Add search links to each entry in place, without overwriting.

        ``eurlex_url`` searches EUR-Lex for the entry's ``program`` (falling
        back to ``name``); ``official_doc_url`` searches
        funduszeeuropejskie.gov.pl for ``name``. Existing keys are kept.
        """
        import urllib.parse

        for n in nabory:
            # "or ''" guards against None values, which quote() rejects.
            q_eur = n.get("program") or n.get("name") or ""
            q_gov = n.get("name") or ""
            if "eurlex_url" not in n:
                n["eurlex_url"] = f"https://eur-lex.europa.eu/search.html?scope=EURLEX&text={urllib.parse.quote(q_eur)}&lang=pl&type=quick"
            if "official_doc_url" not in n:
                n["official_doc_url"] = f"https://www.funduszeeuropejskie.gov.pl/wyszukiwarka/mikro-male-i-srednie-przedsiebiorstwa/#/szukaj?search={urllib.parse.quote(q_gov)}"

    async def get_active_nabory(self, force_refresh: bool = False) -> list:
        """Return the active calls, from cache when possible.

        Args:
            force_refresh: When true, skip the cache and fetch live.

        Returns:
            A list of call dicts enriched by ``_enrich_urls``. May be empty
            when every live source fails or returns nothing.
        """
        if not force_refresh:
            cached = self._load_cache()
            if cached is not None:
                nabory = cached.get("nabory", [])
                self._enrich_urls(nabory)
                return nabory
        nabory = await self._fetch_live()
        self._enrich_urls(nabory)
        if nabory:
            self._save_cache(nabory)
        else:
            # An empty result is most likely a failed fetch; caching it would
            # serve an empty list for the whole TTL, so retry on the next call.
            logger.warning("Brak naborów NCBR z pobrania na żywo — cache nie został zapisany.")
        return nabory


# Shared module-level instance for importers; NCBRClient defines no per-instance state.
ncbr_client = NCBRClient()