Spaces:

AChierici84
/

GreenAssistent

Sleeping

GreenAssistent / process_missing_species_alias.py

outshine84

complete db

39c76f7 21 days ago

3.35 kB

	#!/usr/bin/env python3
	"""Process missing species from missing_species_alias.csv and add to RAG."""

	import csv
	import os
	from pathlib import Path
	from urllib.parse import urlparse, unquote
	from dotenv import load_dotenv
	import chromadb
	from openai import OpenAI

	from build_plant_rag import process_species

	# Load variables from a local .env file (if one exists) into the process
	# environment. This has to run before the os.getenv() lookups below.
	load_dotenv()

	# Filesystem layout, resolved relative to the directory containing this script.
	BASE_DIR = Path(__file__).resolve().parent
	DATA_DIR = BASE_DIR / "data"
	# Directory handed to chromadb.PersistentClient; override with RAG_DB_PATH.
	RAG_DB_PATH = Path(os.getenv("RAG_DB_PATH", str(DATA_DIR / "plant_rag")))
	# Passed to process_species() as sqlite_path; override with PLANTS_SQLITE_PATH.
	SQLITE_DB_PATH = Path(os.getenv("PLANTS_SQLITE_PATH", str(DATA_DIR / "plants.db")))
	# Semicolon-delimited input read by process_missing_species()
	# (columns used: species_name, wikipedia_url).
	ALIAS_CSV = BASE_DIR / "missing_species_alias.csv"

	# Passed to process_species() as translation_model; override with OPENAI_MODEL.
	DEFAULT_MODEL = os.getenv("OPENAI_MODEL", "gpt-4o-mini")
	# Passed to process_species() as translate_non_italian.
	TRANSLATE_TO_ITALIAN = True

	def extract_wiki_info(url: str) -> dict[str, str]:
	    """Extract the language code and page title from a Wikipedia article URL.

	    Args:
	        url: Article URL such as ``https://it.wikipedia.org/wiki/Rosa_canina``.
	            Surrounding whitespace is ignored and a missing scheme
	            (``it.wikipedia.org/wiki/...``) is tolerated.

	    Returns:
	        ``{"lang": <first host label>, "title": <percent-decoded title>}``,
	        or an empty dict when ``url`` is blank or malformed, is not hosted
	        on a language subdomain of ``wikipedia.org``, or carries no
	        ``/wiki/<title>`` path. Underscores in the title are kept as-is.
	    """
	    if not url or not url.strip():
	        return {}

	    url = url.strip()
	    try:
	        parsed = urlparse(url)
	        if not parsed.netloc and "://" not in url:
	            # Scheme-less input: urlparse() only recognises the host when
	            # the string starts with "//", otherwise it all lands in .path.
	            parsed = urlparse("//" + url)
	    except ValueError:
	        # urlparse() rejects some malformed hosts (e.g. a broken IPv6
	        # literal); treat that like any other unusable URL.
	        return {}

	    # .hostname is already lower-cased and stripped of port / userinfo.
	    host = parsed.hostname or ""
	    suffix = ".wikipedia.org"
	    # Require a real subdomain of wikipedia.org: a plain substring test
	    # would also accept "notwikipedia.org" or "wikipedia.org.example.com".
	    if not host.endswith(suffix):
	        return {}

	    # it.wikipedia.org -> "it"; it.m.wikipedia.org -> "it"
	    lang = host[: -len(suffix)].split(".")[0]
	    if not lang or lang == "www":
	        return {}

	    # Cut at the FIRST "/wiki/" only, so a title that itself contains
	    # "/wiki/" is not truncated.
	    _, marker, raw_title = parsed.path.partition("/wiki/")
	    title = unquote(raw_title)  # decode percent-encoding
	    if not marker or not title:
	        return {}

	    return {"lang": lang, "title": title}

	def process_missing_species():
	    """Read missing_species_alias.csv and process each species via RAG.

	    Opens the Chroma collection "plants" stored under RAG_DB_PATH, then walks
	    ALIAS_CSV (semicolon-delimited; columns ``species_name`` and
	    ``wikipedia_url``) and hands every named row to ``process_species``.
	    Progress is printed per row, followed by a summary line.

	    Rows without a species name are skipped. A row counts as failed when
	    ``process_species`` raises, when its Wikipedia URL cannot be parsed, or
	    when the returned status is anything other than "ok"; failures never
	    abort the run.

	    Raises:
	        OSError: if ALIAS_CSV cannot be opened.
	    """
	    client = OpenAI()
	    chroma_client = chromadb.PersistentClient(path=str(RAG_DB_PATH))
	    collection = chroma_client.get_or_create_collection(name="plants")

	    species_processed = 0
	    species_failed = 0

	    # "utf-8-sig" drops a leading BOM (common in spreadsheet exports); with
	    # plain "utf-8" the first header becomes "\ufeffspecies_name" and every
	    # row would be skipped as nameless. newline="" lets the csv module do
	    # its own newline handling, as its documentation requires.
	    with open(ALIAS_CSV, "r", encoding="utf-8-sig", newline="") as f:
	        reader = csv.DictReader(f, delimiter=";")
	        for row in reader:
	            species_name = (row.get("species_name") or "").strip()
	            wiki_url = (row.get("wikipedia_url") or "").strip()

	            if not species_name:
	                continue

	            print(f"\n[*] Processing {species_name}...")

	            try:
	                # Parsed inside the try so that one malformed URL is counted
	                # as a failed row instead of aborting the whole import.
	                alias_info = extract_wiki_info(wiki_url) if wiki_url else None

	                result = process_species(
	                    species_name=species_name,
	                    collection=collection,
	                    wiki_langs=("it", "en"),
	                    alias_info=alias_info,
	                    translator_client=client,
	                    translation_model=DEFAULT_MODEL,
	                    translate_non_italian=TRANSLATE_TO_ITALIAN,
	                    sqlite_path=SQLITE_DB_PATH,
	                )

	                status = result.get("status", "unknown")
	                if status == "ok":
	                    print(" ✓ Added to RAG")
	                    species_processed += 1
	                else:
	                    print(f" [!] Status: {status}")
	                    species_failed += 1
	            except Exception as e:
	                # Deliberate best-effort boundary: report and move on to the
	                # next species.
	                print(f" ✗ Error: {e}")
	                species_failed += 1

	    print(f"\n[✓] Processing complete: {species_processed} added, {species_failed} failed")

	# Script entry point: run the import only when this file is executed
	# directly, not when it is imported as a module.
	if __name__ == "__main__":
	    process_missing_species()