GreenAssistent / process_missing_species_alias.py
outshine84
complete db
39c76f7
#!/usr/bin/env python3
"""Process missing species from missing_species_alias.csv and add to RAG."""
import csv
import os
from pathlib import Path
from urllib.parse import urlparse, unquote
from dotenv import load_dotenv
import chromadb
from openai import OpenAI
from build_plant_rag import process_species
# Load .env-provided settings first so the environment lookups below see them.
load_dotenv()

# Directory holding this script; all default locations are derived from it.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR.joinpath("data")

# Storage locations; each can be overridden through an environment variable.
RAG_DB_PATH = Path(
    os.environ.get("RAG_DB_PATH", str(DATA_DIR.joinpath("plant_rag")))
)
SQLITE_DB_PATH = Path(
    os.environ.get("PLANTS_SQLITE_PATH", str(DATA_DIR.joinpath("plants.db")))
)

# Input list of species to process, kept next to this script.
ALIAS_CSV = BASE_DIR.joinpath("missing_species_alias.csv")

# Model name handed to process_species() as ``translation_model``.
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")

# Forwarded to process_species() as ``translate_non_italian``.
TRANSLATE_TO_ITALIAN = True
def extract_wiki_info(url: str) -> dict[str, str]:
    """Extract the language code and page title from a Wikipedia URL.

    Args:
        url: A Wikipedia article URL such as
            ``https://it.wikipedia.org/wiki/Rosa_canina``. A URL written
            without a scheme (``it.wikipedia.org/wiki/Rosa_canina``) is
            accepted as well.

    Returns:
        ``{"lang": <first host label>, "title": <percent-decoded title>}``,
        or an empty dict when the input is empty, malformed, not hosted on a
        ``<lang>.wikipedia.org`` domain, or has no title after ``/wiki/``.
    """
    if not url or not url.strip():
        return {}
    url = url.strip()

    try:
        parsed = urlparse(url)
        if not parsed.scheme and not parsed.netloc:
            # Without "//" urlparse puts the host into the path, so re-parse
            # scheme-less input as a network-location reference.
            parsed = urlparse(f"//{url}")
        # hostname (unlike netloc) excludes userinfo and port and is lowercased.
        host = (parsed.hostname or "").lower().rstrip(".")
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced brackets).
        return {}

    # Require a real subdomain of wikipedia.org; a plain substring test would
    # also accept look-alike hosts such as "notwikipedia.org.evil.com".
    if not host.endswith(".wikipedia.org"):
        return {}

    # it.wikipedia.org -> "it"; it.m.wikipedia.org -> "it"
    lang = host.split(".")[0]
    if not lang or lang == "www":
        # "www" is the portal host, not a language edition.
        return {}

    # Keep everything after the FIRST "/wiki/" so titles that themselves
    # contain "/wiki/" are not truncated.
    _, marker, raw_title = parsed.path.partition("/wiki/")
    title = unquote(raw_title)  # decode percent-encoding
    if not marker or not title:
        return {}

    return {"lang": lang, "title": title}
def process_missing_species():
    """Read missing_species_alias.csv and process each species via RAG.

    The CSV is semicolon-delimited and read by header name:
    ``species_name`` (rows where it is blank are skipped) and
    ``wikipedia_url`` (optional; when present it is parsed with
    ``extract_wiki_info`` and passed on as ``alias_info``).

    Each remaining row is handed to ``process_species``. A row counts as
    added only when the returned mapping has ``status == "ok"``; any other
    status, or any exception raised while handling the row, counts as
    failed. Progress and a final summary are printed to stdout.

    Raises:
        OSError: If the CSV file cannot be opened (e.g. it does not exist).
    """
    client = OpenAI()
    chroma_client = chromadb.PersistentClient(path=str(RAG_DB_PATH))
    collection = chroma_client.get_or_create_collection(name="plants")

    species_processed = 0
    species_failed = 0

    # "utf-8-sig" strips a leading BOM if one is present (and reads plain
    # UTF-8 unchanged); with plain "utf-8" a BOM would turn the first header
    # into "\ufeffspecies_name" and every row would be skipped silently.
    # newline="" is what the csv module expects from its input file.
    with open(ALIAS_CSV, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f, delimiter=";")
        for row in reader:
            species_name = (row.get("species_name") or "").strip()
            wiki_url = (row.get("wikipedia_url") or "").strip()
            if not species_name:
                continue

            print(f"\n[*] Processing {species_name}...")

            try:
                # Parsed inside the try so one malformed URL cannot abort the
                # whole batch; it is reported as a failure for this row only.
                alias_info = extract_wiki_info(wiki_url) if wiki_url else None
                result = process_species(
                    species_name=species_name,
                    collection=collection,
                    wiki_langs=("it", "en"),
                    alias_info=alias_info,
                    translator_client=client,
                    translation_model=DEFAULT_MODEL,
                    translate_non_italian=TRANSLATE_TO_ITALIAN,
                    sqlite_path=SQLITE_DB_PATH,
                )
                status = result.get("status", "unknown")
                if status == "ok":
                    print(" ✓ Added to RAG")
                    species_processed += 1
                else:
                    print(f" [!] Status: {status}")
                    species_failed += 1
            except Exception as e:
                # Best-effort batch: report the row and keep going.
                print(f" ✗ Error: {e}")
                species_failed += 1

    print(f"\n[✓] Processing complete: {species_processed} added, {species_failed} failed")
# Run the batch only when executed as a script, not when imported.
if __name__ == "__main__":
    process_missing_species()