Spaces:

AChierici84
/

GreenAssistent

Sleeping

GreenAssistent / missing_species.py

outshine84

complete db

39c76f7 26 days ago

6.07 kB

	import os
	from pathlib import Path

	import chromadb
	import pandas as pd
	import serpapi
	from dotenv import load_dotenv
	from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

	from add_species_to_faiss import add_to_faiss, resolve_title
	from build_plant_rag import (
	COLLECTION_NAME,
	DEFAULT_TRANSLATION_MODEL,
	EMBEDDING_MODEL,
	RAG_DIR,
	process_species,
	)
	from build_plants_sqlite import SQLITE_DB_PATH, get_rag_collection, get_rag_context, init_db, sqlite3, upsert_plant


	# Load environment variables from the ".env" file located next to this script,
	# before any function reads them (download_images reads SERP_API_KEY via os.getenv).
	load_dotenv(dotenv_path=Path(__file__).with_name(".env"))


	def _extract_image_url(image: dict) -> str \| None:
	for key in ("original", "thumbnail", "source", "link", "image_url"):
	url = image.get(key)
	if isinstance(url, str) and url.strip():
	return url.strip()
	return None


	def download_images(
	    species_name: str,
	    location: str = "Italy",
	    num_images: int | None = 100,
	):
	    """Search Google Images through SerpApi and return the raw image results.

	    Args:
	        species_name: Query string sent to the ``google_images`` engine.
	        location: Value of the ``location`` search parameter.
	        num_images: Maximum number of results to return. ``None`` disables
	            the cap; negative values are treated as zero.

	    Returns:
	        The entries stored under ``images_results`` in the search response,
	        truncated to at most ``num_images`` items.

	    Raises:
	        RuntimeError: If the ``SERP_API_KEY`` environment variable is unset or empty.
	        KeyError: If the response contains no ``images_results`` entry.
	    """
	    api_key = os.getenv("SERP_API_KEY")
	    if not api_key:
	        # Fail early with an explicit message instead of sending an unauthenticated request.
	        raise RuntimeError("SERP_API_KEY is not set; cannot query SerpApi")

	    client = serpapi.Client(api_key=api_key)
	    results = client.search({
	        "engine": "google_images",
	        "q": species_name,
	        "location": location,
	        "google_domain": "google.it",
	        "hl": "it",
	        "gl": "it",
	    })
	    # Deliberately a plain lookup: a response without images raises KeyError so the
	    # caller can treat it as a failed download rather than as an empty result.
	    images_results = results["images_results"]
	    if num_images is None:
	        return images_results
	    # The search request itself carries no result-count parameter, so the cap is applied here.
	    return images_results[:max(num_images, 0)]


	def load_missing_species():
	    """Read the species names to process from ``missing_species.csv``.

	    The file is looked up relative to the current working directory and must
	    contain a ``species_name`` column.

	    Returns:
	        A list of species names as strings, in file order. Rows whose name is
	        missing or blank are skipped and surrounding whitespace is removed.

	    Raises:
	        FileNotFoundError: If ``missing_species.csv`` does not exist.
	        KeyError: If the file has no ``species_name`` column.
	    """
	    df = pd.read_csv("missing_species.csv")
	    # Empty cells are parsed as NaN (a float); dropping them keeps callers that
	    # use string methods on each name from failing.
	    names = df["species_name"].dropna().astype(str).str.strip()
	    return names[names != ""].tolist()


	def _ensure_plants_db_record(species_name: str, indexed: bool) -> None:
	    """Upsert a row for *species_name* into the SQLite database at ``SQLITE_DB_PATH``.

	    ``init_db`` is called on the connection before the upsert, and the row is
	    written with ``profile=None``.

	    Args:
	        species_name: Name of the species to insert or update.
	        indexed: Value forwarded as the ``indexed`` argument of ``upsert_plant``.
	    """
	    from contextlib import closing

	    # sqlite3's own context manager only commits or rolls back the transaction;
	    # it does not close the connection. ``closing`` releases the handle so
	    # repeated calls do not accumulate open connections.
	    with closing(sqlite3.connect(SQLITE_DB_PATH)) as conn:
	        with conn:
	            init_db(conn)
	            upsert_plant(conn, species_name=species_name, indexed=indexed, profile=None)
	            conn.commit()


	def _sync_indexed_from_rag(species_name: str) -> bool:
	    """Record in plants.db whether the RAG collection has context for a species.

	    The species counts as indexed when ``get_rag_context`` yields text that is
	    not empty after stripping whitespace.

	    Args:
	        species_name: Name of the species to look up and record.

	    Returns:
	        ``True`` if non-blank RAG context was found, ``False`` otherwise.
	    """
	    rag_text = get_rag_context(get_rag_collection(), species_name)
	    # A falsy result (None or "") means no context; otherwise check for non-blank text.
	    has_context = False if not rag_text else bool(rag_text.strip())
	    _ensure_plants_db_record(species_name, indexed=has_context)
	    return has_context


	def _fallback_add_without_images(
	    species_name: str,
	    rag_collection,
	    lang: str,
	    resolved_title: str,
	) -> None:
	    """Index a species in the RAG collection and plants.db when no images are available.

	    Builds a Wikipedia alias from *lang* and *resolved_title*, runs
	    ``process_species`` with translation disabled, then mirrors the resulting
	    RAG state into plants.db. Any exception is reported on stdout and a
	    placeholder row with ``indexed=False`` is written instead; nothing is
	    re-raised.

	    Args:
	        species_name: Name of the species to process.
	        rag_collection: Collection object passed to ``process_species``.
	        lang: Wikipedia language code of the resolved page.
	        resolved_title: Title of the resolved Wikipedia page.
	    """
	    alias = {
	        "lang": lang,
	        "title": resolved_title,
	        # Wikipedia page URLs use underscores in place of spaces.
	        "url": f"https://{lang}.wikipedia.org/wiki/{resolved_title.replace(' ', '_')}",
	    }
	    try:
	        result = process_species(
	            species_name=species_name,
	            collection=rag_collection,
	            wiki_langs=("it", "en", "fr", "es", "de", "pt"),
	            alias_info=alias,
	            translator_client=None,
	            translation_model=DEFAULT_TRANSLATION_MODEL,
	            translate_non_italian=False,
	            sqlite_path=SQLITE_DB_PATH,
	        )
	        indexed = _sync_indexed_from_rag(species_name)
	        print(
	            f"[ok] {species_name}: fallback without images -> "
	            f"RAG status={result.get('status')} | plants.db indexed={indexed}"
	        )
	    except Exception as e:
	        # Best effort: keep the batch going and leave a placeholder row so the
	        # species is still present in plants.db.
	        _ensure_plants_db_record(species_name, indexed=False)
	        print(
	            f"[warn] {species_name}: RAG fallback failed ({e}); "
	            "inserted placeholder in plants.db (indexed=0)"
	        )

	def _image_cache_path(species: str) -> Path:
	    """Return the cache file that stores the image URLs found for *species*."""
	    return Path("logs/images") / f"{species.replace(' ', '_')}_images.txt"


	def _load_or_fetch_image_urls(species: str) -> list[str]:
	    """Return image URLs for *species*, from the cache file or a fresh search.

	    When the cache file exists its non-blank lines are returned. Otherwise the
	    images are searched with ``download_images``, reduced to URLs and written to
	    the cache. A failed search yields an empty list; a failed cache write is
	    reported but the fetched URLs are still returned.
	    """
	    cache_path = _image_cache_path(species)
	    shown_path = cache_path.as_posix()

	    # Reuse the results of a previous run if a cache file exists for this species.
	    if cache_path.exists():
	        with open(cache_path, "r", encoding="utf-8") as f:
	            images = [line.strip() for line in f if line.strip()]
	        print(f"Loaded {len(images)} images for {species} from {shown_path}")
	        return images

	    print(f"Searching images for {species}...")
	    try:
	        images_raw = download_images(species)
	        print(f"Downloaded {len(images_raw)} images for {species}.")
	        images = [url for image in images_raw if (url := _extract_image_url(image))]
	    except Exception as e:
	        print(f"[warn] Image download failed for {species}: {e} | continuing with RAG/plants.db only")
	        return []

	    # Save the results under logs/images. The directory is created on demand,
	    # and a write failure must not discard URLs that were fetched successfully.
	    try:
	        cache_path.parent.mkdir(parents=True, exist_ok=True)
	        with open(cache_path, "w", encoding="utf-8") as f:
	            f.writelines(f"{url}\n" for url in images)
	    except OSError as e:
	        print(f"[warn] Could not save image URLs for {species} in {shown_path}: {e}")
	    else:
	        print(f"Saved image URLs for {species} in {shown_path}")
	    return images


	def main():
	    """Process every species listed by ``load_missing_species``.

	    For each species the image URLs are loaded (cache or search) and the
	    Wikipedia page is resolved. A species whose page cannot be resolved is
	    written to plants.db with ``indexed=False`` and skipped. A species with no
	    images goes through ``_fallback_add_without_images``; otherwise it is handed
	    to ``add_to_faiss``. Failures of ``add_to_faiss`` are printed and do not
	    stop the loop.
	    """
	    missing_species = load_missing_species()

	    RAG_DIR.mkdir(parents=True, exist_ok=True)
	    ef = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)
	    rag_client = chromadb.PersistentClient(path=str(RAG_DIR))
	    rag_collection = rag_client.get_or_create_collection(
	        name=COLLECTION_NAME,
	        embedding_function=ef,
	        metadata={"hnsw:space": "cosine"},
	    )

	    # Languages passed to resolve_title; loop-invariant, so built once.
	    langs = ("it", "en", "fr", "es", "de", "pt")

	    for species in missing_species:
	        images = _load_or_fetch_image_urls(species)

	        # Try to resolve the Wikipedia page (with fallback variants).
	        try:
	            lang, resolved_title = resolve_title(species, "", langs)
	        except RuntimeError as e:
	            _ensure_plants_db_record(species, indexed=False)
	            print(f"[skip] {species}: {e} | inserted in plants.db with indexed=0")
	            continue
	        print(f"Resolved Wikipedia page: {lang}:{resolved_title}")

	        if not images:
	            _fallback_add_without_images(species, rag_collection, lang, resolved_title)
	            continue

	        # Add to FAISS, RAG, and plants.db (all handled inside add_to_faiss).
	        try:
	            add_to_faiss(
	                species,
	                images,
	                lang=lang,
	                resolved_title=resolved_title,
	            )
	            print(f"[ok] {species}: added to FAISS, RAG, and plants.db")
	        except Exception as e:
	            # Report and move on so one bad species does not abort the batch.
	            print(f"[error] {species}: {e}")



	# Run the batch only when this file is executed as a script, not when it is imported.
	if __name__ == "__main__":
	    main()