Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """Process missing species from missing_species_alias.csv and add to RAG.""" | |
| import csv | |
| import os | |
| from pathlib import Path | |
| from urllib.parse import urlparse, unquote | |
| from dotenv import load_dotenv | |
| import chromadb | |
| from openai import OpenAI | |
| from build_plant_rag import process_species | |
# Load settings from a local .env file (python-dotenv) before the environment
# lookups below are evaluated.
load_dotenv()

# Directory that holds this script, and the data folder next to it.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR.joinpath("data")

# Storage locations; each can be overridden through an environment variable.
RAG_DB_PATH = Path(os.environ.get("RAG_DB_PATH", DATA_DIR.joinpath("plant_rag")))
SQLITE_DB_PATH = Path(
    os.environ.get("PLANTS_SQLITE_PATH", DATA_DIR.joinpath("plants.db"))
)

# Semicolon-delimited input listing the species to add.
ALIAS_CSV = BASE_DIR.joinpath("missing_species_alias.csv")

# Model name handed to process_species as ``translation_model``.
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

# Handed to process_species as ``translate_non_italian``.
TRANSLATE_TO_ITALIAN = True
def extract_wiki_info(url: str) -> dict[str, str]:
    """Extract the language code and page title from a Wikipedia article URL.

    Args:
        url: Article URL such as ``https://it.wikipedia.org/wiki/Rosa_canina``.
            Surrounding whitespace is ignored and a missing scheme
            (``it.wikipedia.org/wiki/...``) is tolerated.

    Returns:
        ``{"lang": ..., "title": ...}`` where ``lang`` is the left-most label
        of the host name and ``title`` is the percent-decoded text following
        ``/wiki/`` in the path. An empty dict is returned when the URL is
        blank or malformed, the host is not a subdomain of ``wikipedia.org``,
        or the path carries no article title.
    """
    url = (url or "").strip()
    if not url:
        return {}

    try:
        parsed = urlparse(url)
        if not parsed.scheme and not parsed.netloc:
            # Without a scheme urlparse puts the host into the path; a
            # leading "//" makes it parse the host as the network location.
            parsed = urlparse(f"//{url}")
        # hostname (unlike netloc) is lower-cased and free of port/userinfo.
        host = (parsed.hostname or "").rstrip(".")
    except ValueError:
        # urlparse rejects malformed bracketed hosts, e.g. "http://[bad".
        return {}

    # Require a real subdomain of wikipedia.org: a plain substring test would
    # also accept look-alike hosts and the bare domain (which has no language).
    if not host.endswith(".wikipedia.org"):
        return {}
    lang = host.split(".", 1)[0]  # "it.wikipedia.org" -> "it"
    if not lang:
        return {}

    # Keep everything after the first "/wiki/" so that titles which themselves
    # contain "/wiki/" are not truncated.
    _, marker, raw_title = parsed.path.partition("/wiki/")
    title = unquote(raw_title)  # decode percent-encoding
    if not marker or not title:
        return {}

    return {"lang": lang, "title": title}
def process_missing_species():
    """Read missing_species_alias.csv and add each listed species to the RAG.

    The CSV is semicolon-delimited and is read through its ``species_name``
    and ``wikipedia_url`` columns. Rows with a blank species name are skipped.
    Every other row is handed to ``process_species``; a result whose
    ``status`` is ``"ok"`` counts as added, any other status or any raised
    exception counts as failed. Progress and a final summary are printed.

    Raises:
        OSError: If the CSV file cannot be opened.
    """
    client = OpenAI()
    chroma_client = chromadb.PersistentClient(path=str(RAG_DB_PATH))
    collection = chroma_client.get_or_create_collection(name="plants")

    species_processed = 0
    species_failed = 0

    # "utf-8-sig" strips a leading BOM (common in spreadsheet exports), which
    # would otherwise rename the first column to "\ufeffspecies_name" and make
    # every row look empty. newline="" is what the csv module expects so that
    # quoted fields containing line breaks are parsed correctly.
    with open(ALIAS_CSV, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f, delimiter=";")
        for row in reader:
            species_name = (row.get("species_name") or "").strip()
            wiki_url = (row.get("wikipedia_url") or "").strip()
            if not species_name:
                continue

            print(f"\n[*] Processing {species_name}...")

            try:
                # Parsed inside the try so one malformed URL cannot abort the
                # whole run. A blank or unrecognised URL yields an empty dict,
                # which is normalised to None ("no alias").
                alias_info = extract_wiki_info(wiki_url) or None

                result = process_species(
                    species_name=species_name,
                    collection=collection,
                    wiki_langs=("it", "en"),
                    alias_info=alias_info,
                    translator_client=client,
                    translation_model=DEFAULT_MODEL,
                    translate_non_italian=TRANSLATE_TO_ITALIAN,
                    sqlite_path=SQLITE_DB_PATH,
                )
                status = result.get("status", "unknown")
                if status == "ok":
                    print(" ✓ Added to RAG")
                    species_processed += 1
                else:
                    print(f" [!] Status: {status}")
                    species_failed += 1
            except Exception as e:
                # Per-row boundary: report the failure and continue with the
                # remaining species.
                print(f" ✗ Error: {e}")
                species_failed += 1

    print(f"\n[✓] Processing complete: {species_processed} added, {species_failed} failed")
# Script entry point: run the CSV import only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    process_missing_species()