Spaces:
Sleeping
Sleeping
| import os | |
| from pathlib import Path | |
| import chromadb | |
| import pandas as pd | |
| import serpapi | |
| from dotenv import load_dotenv | |
| from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction | |
| from add_species_to_faiss import add_to_faiss, resolve_title | |
| from build_plant_rag import ( | |
| COLLECTION_NAME, | |
| DEFAULT_TRANSLATION_MODEL, | |
| EMBEDDING_MODEL, | |
| RAG_DIR, | |
| process_species, | |
| ) | |
| from build_plants_sqlite import SQLITE_DB_PATH, get_rag_collection, get_rag_context, init_db, sqlite3, upsert_plant | |
# Load environment variables from the ".env" file located next to this script
# (presumably including SERP_API_KEY, which download_images reads -- confirm).
load_dotenv(dotenv_path=Path(__file__).parent / ".env")
| def _extract_image_url(image: dict) -> str | None: | |
| for key in ("original", "thumbnail", "source", "link", "image_url"): | |
| url = image.get(key) | |
| if isinstance(url, str) and url.strip(): | |
| return url.strip() | |
| return None | |
def download_images(species_name, location="Italy", num_images=100):
    """Search Google Images through SerpApi and return the raw image results.

    Args:
        species_name: Text used as the search query.
        location: Value sent as the search "location" parameter.
        num_images: Maximum number of results to return. ``None`` disables
            the cap. (This parameter used to be accepted but ignored.)

    Returns:
        The "images_results" entry of the SerpApi response, truncated to at
        most ``num_images`` items.

    Raises:
        KeyError: If the response contains no "images_results" entry.
        Any exception raised by the serpapi client is propagated unchanged.
    """
    # The key is read from the environment at call time; a missing key is
    # passed through as None and left for the client/service to reject.
    client = serpapi.Client(api_key=os.getenv("SERP_API_KEY"))
    results = client.search({
        "engine": "google_images",
        "q": species_name,
        "location": location,
        "google_domain": "google.it",
        "hl": "it",
        "gl": "it",
    })
    images_results = results["images_results"]
    if num_images is None:
        return images_results
    # Honour the caller's cap; a negative cap is treated as zero.
    return images_results[:max(num_images, 0)]
def load_missing_species():
    """Return the species names listed in ``missing_species.csv``.

    The file is read from the current working directory and must contain a
    ``species_name`` column. Empty cells (parsed by pandas as NaN) and
    whitespace-only names are dropped, and surrounding whitespace is
    stripped, so every returned item is a non-empty ``str``. Callers build
    file names from these values with ``str.replace``, which would fail on a
    float NaN.

    Raises:
        FileNotFoundError: If ``missing_species.csv`` does not exist.
        KeyError: If the CSV has no ``species_name`` column.
    """
    df = pd.read_csv("missing_species.csv")
    names = df["species_name"].dropna().astype(str).str.strip()
    return names[names != ""].tolist()
def _ensure_plants_db_record(species_name: str, indexed: bool) -> None:
    """Upsert a row for *species_name* in the plants SQLite database.

    Opens the database at ``SQLITE_DB_PATH``, runs ``init_db`` on the
    connection, then calls ``upsert_plant`` with the given ``indexed`` flag
    and ``profile=None``.

    Args:
        species_name: Species to insert or update.
        indexed: Value forwarded to ``upsert_plant`` as ``indexed``.
    """
    conn = sqlite3.connect(SQLITE_DB_PATH)
    try:
        # ``with conn`` commits on success and rolls back on error, but it
        # does NOT close the connection -- hence the explicit close below.
        with conn:
            init_db(conn)
            upsert_plant(conn, species_name=species_name, indexed=indexed, profile=None)
    finally:
        conn.close()
def _sync_indexed_from_rag(species_name: str) -> bool:
    """Record in plants.db whether the RAG store has context for a species.

    The species counts as indexed when ``get_rag_context`` returns text that
    is non-empty after stripping whitespace. That flag is written through
    ``_ensure_plants_db_record`` and also returned to the caller.
    """
    rag_text = get_rag_context(get_rag_collection(), species_name)
    # A missing (falsy) context is treated the same as an empty string.
    normalized = rag_text if rag_text else ""
    has_context = bool(normalized.strip())
    _ensure_plants_db_record(species_name, indexed=has_context)
    return has_context
def _fallback_add_without_images(
    species_name: str,
    rag_collection,
    lang: str,
    resolved_title: str,
) -> None:
    """Index a species in the RAG store and plants.db when no images exist.

    Builds a Wikipedia alias (language, title, page URL) from the resolved
    title, hands it to ``process_species`` with translation disabled, then
    syncs the ``indexed`` flag in plants.db from the RAG store. If any step
    fails, a placeholder row with ``indexed=False`` is written instead and a
    warning is printed; the exception is not re-raised.

    Args:
        species_name: Species being added.
        rag_collection: Collection object forwarded to ``process_species``.
        lang: Wikipedia language code of the resolved page.
        resolved_title: Title of the resolved Wikipedia page.
    """
    # Wikipedia page URLs use underscores in place of spaces.
    page_slug = resolved_title.replace(" ", "_")
    wiki_alias = dict(
        lang=lang,
        title=resolved_title,
        url=f"https://{lang}.wikipedia.org/wiki/{page_slug}",
    )
    try:
        outcome = process_species(
            species_name=species_name,
            collection=rag_collection,
            wiki_langs=("it", "en", "fr", "es", "de", "pt"),
            alias_info=wiki_alias,
            translator_client=None,
            translation_model=DEFAULT_TRANSLATION_MODEL,
            translate_non_italian=False,
            sqlite_path=SQLITE_DB_PATH,
        )
        is_indexed = _sync_indexed_from_rag(species_name)
        # Built inside the try on purpose: a malformed result must take the
        # same failure path as an indexing error.
        success_message = (
            f"[ok] {species_name}: fallback without images -> "
            f"RAG status={outcome.get('status')} | plants.db indexed={is_indexed}"
        )
        print(success_message)
    except Exception as exc:
        # Best effort: still leave a trace of the species in plants.db.
        _ensure_plants_db_record(species_name, indexed=False)
        failure_message = (
            f"[warn] {species_name}: RAG fallback failed ({exc}); "
            "inserted placeholder in plants.db (indexed=0)"
        )
        print(failure_message)
def _image_cache_path(species: str) -> Path:
    """Return the file under logs/images that caches image URLs for *species*."""
    return Path("logs/images") / f"{species.replace(' ', '_')}_images.txt"


def _load_or_fetch_image_urls(species: str) -> list[str]:
    """Return image URLs for *species* from the cache file or a fresh search.

    If a cache file already exists, its non-blank lines are returned and no
    search is made. Otherwise ``download_images`` is called, a URL is
    extracted from each result, and the URLs are written to the cache file
    (one per line). A failed search yields an empty list. A failed cache
    write only prints a warning; the freshly fetched URLs are still returned.
    """
    cache_path = _image_cache_path(species)
    # Forward slashes keep the log messages identical on every platform.
    cache_label = cache_path.as_posix()

    if cache_path.is_file():
        with cache_path.open("r", encoding="utf-8") as f:
            urls = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(urls)} images for {species} from {cache_label}")
        return urls

    print(f"Searching images for {species}...")
    try:
        images_raw = download_images(species)
        print(f"Downloaded {len(images_raw)} images for {species}.")
        urls = [u for image in images_raw if (u := _extract_image_url(image))]
    except Exception as e:
        print(f"[warn] Image download failed for {species}: {e} | continuing with RAG/plants.db only")
        return []

    try:
        # The cache directory may not exist yet on a fresh checkout.
        cache_path.parent.mkdir(parents=True, exist_ok=True)
        cache_path.write_text("".join(f"{u}\n" for u in urls), encoding="utf-8")
    except OSError as e:
        print(f"[warn] Could not save image URLs for {species} in {cache_label}: {e}")
    else:
        print(f"Saved image URLs for {species} in {cache_label}")
    return urls


def main():
    """Process every species listed by ``load_missing_species``.

    For each species:
      1. Obtain image URLs (cached on disk, or searched and then cached).
      2. Resolve its Wikipedia page; if resolution raises ``RuntimeError``,
         insert a plants.db row with ``indexed=False`` and skip the species.
      3. Without images, run the image-less fallback; otherwise pass the
         URLs to ``add_to_faiss``. An ``add_to_faiss`` failure is printed and
         the loop moves on to the next species.
    """
    missing_species = load_missing_species()

    RAG_DIR.mkdir(parents=True, exist_ok=True)
    ef = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)
    rag_client = chromadb.PersistentClient(path=str(RAG_DIR))
    rag_collection = rag_client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=ef,
        metadata={"hnsw:space": "cosine"},
    )

    # Wikipedia languages tried by resolve_title (constant across species).
    langs = ("it", "en", "fr", "es", "de", "pt")

    for species in missing_species:
        images = _load_or_fetch_image_urls(species)

        # Try to resolve the Wikipedia page for this species.
        try:
            lang, resolved_title = resolve_title(species, "", langs)
        except RuntimeError as e:
            _ensure_plants_db_record(species, indexed=False)
            print(f"[skip] {species}: {e} | inserted in plants.db with indexed=0")
            continue
        print(f"Resolved Wikipedia page: {lang}:{resolved_title}")

        if not images:
            _fallback_add_without_images(species, rag_collection, lang, resolved_title)
            continue

        # Per the success message, add_to_faiss is expected to update FAISS,
        # the RAG store and plants.db -- NOTE(review): confirm in add_species_to_faiss.
        try:
            add_to_faiss(
                species,
                images,
                lang=lang,
                resolved_title=resolved_title,
            )
            print(f"[ok] {species}: added to FAISS, RAG, and plants.db")
        except Exception as e:
            print(f"[error] {species}: {e}")
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()