# GreenAssistent / missing_species.py
# outshine84
# complete db
# 39c76f7
import os
from pathlib import Path
import chromadb
import pandas as pd
import serpapi
from dotenv import load_dotenv
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction
from add_species_to_faiss import add_to_faiss, resolve_title
from build_plant_rag import (
COLLECTION_NAME,
DEFAULT_TRANSLATION_MODEL,
EMBEDDING_MODEL,
RAG_DIR,
process_species,
)
from build_plants_sqlite import SQLITE_DB_PATH, get_rag_collection, get_rag_context, init_db, sqlite3, upsert_plant
# Load environment variables from the ".env" file located next to this module
# (download_images reads SERP_API_KEY from the environment).
load_dotenv(dotenv_path=Path(__file__).with_name(".env"))
def _extract_image_url(image: dict) -> str | None:
for key in ("original", "thumbnail", "source", "link", "image_url"):
url = image.get(key)
if isinstance(url, str) and url.strip():
return url.strip()
return None
def download_images(species_name, location="Italy", num_images=100):
    """Run a Google Images search through SerpApi and return the raw results.

    The query is issued against google.it with Italian language/country
    settings, using the API key found in the SERP_API_KEY environment variable.

    Args:
        species_name: Text used as the search query.
        location: Value passed as the search "location" parameter.
        num_images: Maximum number of results to return. ``None`` disables
            the cap; negative values are treated as 0.

    Returns:
        The entries of the response's "images_results" field, truncated to at
        most ``num_images`` items.

    Raises:
        KeyError: presumably, when the response has no "images_results" field
            (the lookup is a plain subscript). The exception is kept on purpose
            so callers do not mistake a failed search for an empty result.
    """
    client = serpapi.Client(api_key=os.getenv("SERP_API_KEY"))
    results = client.search({
        "engine": "google_images",
        "q": species_name,
        "location": location,
        "google_domain": "google.it",
        "hl": "it",
        "gl": "it",
    })
    images_results = results["images_results"]
    if num_images is None:
        return images_results
    # ``num_images`` used to be accepted but ignored; honour it as an upper bound.
    return images_results[: max(0, num_images)]
def load_missing_species():
    """Read the species names listed in ``missing_species.csv``.

    The file is looked up relative to the current working directory and must
    contain a ``species_name`` column.

    Returns:
        A list of species names as strings, in file order, with surrounding
        whitespace removed. Rows whose ``species_name`` is missing or blank
        are dropped.
    """
    df = pd.read_csv("missing_species.csv")
    # Empty cells come back as float NaN; they would break the string handling
    # done on every name downstream, so discard them here.
    raw_names = df["species_name"].dropna().tolist()
    cleaned = (str(value).strip() for value in raw_names)
    return [name for name in cleaned if name]
def _ensure_plants_db_record(species_name: str, indexed: bool) -> None:
    """Insert or update the plants database row for ``species_name``.

    Opens the SQLite database at ``SQLITE_DB_PATH``, runs ``init_db`` on the
    connection, then upserts the species with the given ``indexed`` flag and
    no profile.

    Args:
        species_name: Name of the species to upsert.
        indexed: Value stored as the row's ``indexed`` flag.
    """
    conn = sqlite3.connect(SQLITE_DB_PATH)
    try:
        # ``with conn`` commits on success and rolls back on error, but it does
        # not close the connection -- hence the explicit close() below.
        with conn:
            init_db(conn)
            upsert_plant(conn, species_name=species_name, indexed=indexed, profile=None)
    finally:
        conn.close()
def _sync_indexed_from_rag(species_name: str) -> bool:
    """Mirror the RAG indexing state of a species into the plants database.

    The species counts as indexed when ``get_rag_context`` returns text that
    is non-empty after stripping whitespace. The resulting flag is written
    through ``_ensure_plants_db_record``.

    Args:
        species_name: Name of the species to look up and record.

    Returns:
        True when non-blank RAG context exists for the species, else False.
    """
    rag_text = get_rag_context(get_rag_collection(), species_name)
    # Falsy context (None, "") short-circuits to False before ``strip`` runs.
    has_context = bool(rag_text) and bool(rag_text.strip())
    _ensure_plants_db_record(species_name, indexed=has_context)
    return has_context
def _fallback_add_without_images(
    species_name: str,
    rag_collection,
    lang: str,
    resolved_title: str,
) -> None:
    """Index a species through ``process_species`` when no images are available.

    Builds the alias description of the resolved Wikipedia page, runs
    ``process_species`` with translation of non-Italian text switched off, and
    then syncs the ``indexed`` flag of the plants database from the RAG store.
    Any exception is reported and answered with a placeholder database row
    whose ``indexed`` flag is False.

    Args:
        species_name: Name of the species being added.
        rag_collection: Collection object handed to ``process_species``.
        lang: Language code of the resolved Wikipedia page.
        resolved_title: Title of the resolved Wikipedia page.
    """
    # Page titles use spaces; the URL path uses underscores instead.
    wiki_slug = resolved_title.replace(' ', '_')
    alias = dict(
        lang=lang,
        title=resolved_title,
        url=f"https://{lang}.wikipedia.org/wiki/{wiki_slug}",
    )
    try:
        result = process_species(
            species_name=species_name,
            collection=rag_collection,
            wiki_langs=("it", "en", "fr", "es", "de", "pt"),
            alias_info=alias,
            translator_client=None,
            translation_model=DEFAULT_TRANSLATION_MODEL,
            translate_non_italian=False,
            sqlite_path=SQLITE_DB_PATH,
        )
        indexed = _sync_indexed_from_rag(species_name)
        status = result.get('status')
        message = (
            f"[ok] {species_name}: fallback without images -> "
            f"RAG status={status} | plants.db indexed={indexed}"
        )
        print(message)
    except Exception as exc:
        # Best effort: keep a row for the species even though indexing failed.
        _ensure_plants_db_record(species_name, indexed=False)
        warning = (
            f"[warn] {species_name}: RAG fallback failed ({exc}); "
            "inserted placeholder in plants.db (indexed=0)"
        )
        print(warning)
def _image_cache_file(species: str) -> str:
    """Return the path of the text file that caches image URLs for ``species``."""
    return f"logs/images/{species.replace(' ', '_')}_images.txt"


def _load_or_fetch_image_urls(species: str) -> list[str]:
    """Return image URLs for ``species`` from the cache file or a fresh search.

    When the cache file exists its non-blank lines are returned. Otherwise the
    URLs are fetched with ``download_images``, written to the cache file (one
    per line) and returned. A failed search yields an empty list; a failed
    cache write is only reported, and the fetched URLs are still returned.
    """
    cache_file = _image_cache_file(species)

    if os.path.exists(cache_file):
        # ``errors="replace"`` keeps cache files written under another locale
        # encoding readable instead of aborting the run.
        with open(cache_file, "r", encoding="utf-8", errors="replace") as f:
            images = [line.strip() for line in f.read().splitlines() if line.strip()]
        print(f"Loaded {len(images)} images for {species} from {cache_file}")
        return images

    print(f"Searching images for {species}...")
    try:
        images_raw = download_images(species)
        print(f"Downloaded {len(images_raw)} images for {species}.")
        images = [url for image in images_raw if (url := _extract_image_url(image))]
    except Exception as e:
        print(f"[warn] Image download failed for {species}: {e} | continuing with RAG/plants.db only")
        return []

    # Save the results under logs/images. The directory may not exist yet;
    # without creating it the write fails and the fetched URLs would be lost.
    try:
        os.makedirs(os.path.dirname(cache_file), exist_ok=True)
        with open(cache_file, "w", encoding="utf-8") as f:
            f.writelines(f"{url}\n" for url in images)
        print(f"Saved image URLs for {species} in {cache_file}")
    except OSError as e:
        print(f"[warn] Could not cache image URLs for {species}: {e}")
    return images


def main():
    """Process every species listed by ``load_missing_species``.

    For each species: collect image URLs (cached or freshly searched), resolve
    its Wikipedia page, then either call ``add_to_faiss`` with the images or,
    when there are none, fall back to ``_fallback_add_without_images``.
    Species whose Wikipedia page cannot be resolved get a plants database row
    with ``indexed=False`` and are skipped.
    """
    missing_species = load_missing_species()

    RAG_DIR.mkdir(parents=True, exist_ok=True)
    ef = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)
    rag_client = chromadb.PersistentClient(path=str(RAG_DIR))
    rag_collection = rag_client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=ef,
        metadata={"hnsw:space": "cosine"},
    )

    # Wikipedia languages tried when resolving a page title (loop-invariant).
    langs = ("it", "en", "fr", "es", "de", "pt")

    for species in missing_species:
        # Guard against malformed entries (e.g. NaN from an empty CSV cell),
        # which would otherwise crash the string handling below.
        if not isinstance(species, str) or not species.strip():
            print(f"[skip] invalid species entry: {species!r}")
            continue

        images = _load_or_fetch_image_urls(species)

        # Try to resolve the Wikipedia page for the species.
        try:
            lang, resolved_title = resolve_title(species, "", langs)
            print(f"Resolved Wikipedia page: {lang}:{resolved_title}")
        except RuntimeError as e:
            _ensure_plants_db_record(species, indexed=False)
            print(f"[skip] {species}: {e} | inserted in plants.db with indexed=0")
            continue

        if not images:
            _fallback_add_without_images(species, rag_collection, lang, resolved_title)
            continue

        # Hand the species and its image URLs to add_to_faiss (per the log
        # message this covers FAISS, RAG and plants.db -- confirm in that module).
        try:
            add_to_faiss(
                species,
                images,
                lang=lang,
                resolved_title=resolved_title,
            )
            print(f"[ok] {species}: added to FAISS, RAG, and plants.db")
        except Exception as e:
            print(f"[error] {species}: {e}")


if __name__ == "__main__":
    main()