import os
from pathlib import Path

import chromadb
import pandas as pd
import serpapi
from dotenv import load_dotenv
from chromadb.utils.embedding_functions import SentenceTransformerEmbeddingFunction

from add_species_to_faiss import add_to_faiss, resolve_title
from build_plant_rag import (
    COLLECTION_NAME,
    DEFAULT_TRANSLATION_MODEL,
    EMBEDDING_MODEL,
    RAG_DIR,
    process_species,
)
from build_plants_sqlite import SQLITE_DB_PATH, get_rag_collection, get_rag_context, init_db, sqlite3, upsert_plant


# Load environment variables from the ".env" file located next to this script,
# so that values such as SERP_API_KEY are available through os.getenv.
load_dotenv(dotenv_path=Path(__file__).with_name(".env"))


def _extract_image_url(image: dict) -> str | None:
    for key in ("original", "thumbnail", "source", "link", "image_url"):
        url = image.get(key)
        if isinstance(url, str) and url.strip():
            return url.strip()
    return None


def download_images(species_name, location="Italy", num_images=100):
    """Search Google Images through SerpApi and return the raw image results.

    The API key is read from the ``SERP_API_KEY`` environment variable. The
    query is issued against ``google.it`` with Italian language/country
    settings (``hl``/``gl`` set to ``it``).

    Args:
        species_name: Query string sent to the ``google_images`` engine.
        location: Value passed as the ``location`` search parameter.
        num_images: Maximum number of entries to return. ``None`` disables
            the cap; negative values are treated as ``0``.

    Returns:
        The entries of the response's ``images_results`` field, truncated to
        *num_images*. An empty list when the response carries no such field.
    """
    client = serpapi.Client(api_key=os.getenv("SERP_API_KEY"))
    results = client.search({
        "engine": "google_images",
        "q": species_name,
        "location": location,
        "google_domain": "google.it",
        "hl": "it",
        "gl": "it",
    })
    # A response without image hits may omit the key entirely; treat that as
    # "no images" instead of surfacing an opaque KeyError to the caller.
    images_results = results.get("images_results") or []
    if num_images is None:
        return images_results
    return images_results[:max(num_images, 0)]


def load_missing_species():
    """Read the species names to process from ``missing_species.csv``.

    The file is resolved relative to the current working directory and must
    contain a ``species_name`` column.

    Returns:
        A list of species names as strings, in file order, with surrounding
        whitespace removed. Missing (NaN) cells, blank names and repeated
        names are dropped so callers never receive a non-string entry or
        process the same species twice.

    Raises:
        FileNotFoundError: If ``missing_species.csv`` does not exist.
        KeyError: If the ``species_name`` column is absent.
    """
    df = pd.read_csv("missing_species.csv")
    names = df["species_name"].dropna().astype(str).str.strip()
    # dict.fromkeys de-duplicates while preserving first-seen order.
    return list(dict.fromkeys(name for name in names if name))


def _ensure_plants_db_record(species_name: str, indexed: bool) -> None:
    """Upsert the plants.db row for *species_name* with the given flag.

    Opens the SQLite database at ``SQLITE_DB_PATH``, calls ``init_db`` on the
    connection, upserts the species with ``profile=None`` and commits.

    Args:
        species_name: Species whose row is created or updated.
        indexed: Value stored as the row's ``indexed`` flag.
    """
    conn = sqlite3.connect(SQLITE_DB_PATH)
    try:
        # ``with conn`` only scopes the transaction (commit on success,
        # rollback on error); it does NOT close the connection, so the
        # surrounding try/finally is what prevents one leaked handle per call.
        with conn:
            init_db(conn)
            upsert_plant(conn, species_name=species_name, indexed=indexed, profile=None)
            conn.commit()
    finally:
        conn.close()


def _sync_indexed_from_rag(species_name: str) -> bool:
    """Mirror the species' RAG presence into plants.db and report it.

    A species counts as indexed when the context returned by
    ``get_rag_context`` for it is non-empty after stripping whitespace.

    Args:
        species_name: Species to look up in the RAG collection.

    Returns:
        ``True`` if a non-blank RAG context exists, ``False`` otherwise. The
        same value is written to the species' plants.db record.
    """
    rag_context = get_rag_context(get_rag_collection(), species_name)
    has_context = bool(rag_context and rag_context.strip())
    _ensure_plants_db_record(species_name, indexed=has_context)
    return has_context


def _fallback_add_without_images(
    species_name: str,
    rag_collection,
    lang: str,
    resolved_title: str,
) -> None:
    """Index a species in the RAG store when no images are available.

    Runs ``process_species`` with translation disabled, using the already
    resolved Wikipedia page as alias, then syncs the ``indexed`` flag in
    plants.db from the RAG collection. Any failure is reported and replaced
    by a placeholder plants.db record with ``indexed=False``.

    Args:
        species_name: Species being added.
        rag_collection: Collection handed to ``process_species``.
        lang: Language code of the resolved Wikipedia page.
        resolved_title: Title of the resolved Wikipedia page.
    """
    alias = dict(
        lang=lang,
        title=resolved_title,
        url=f"https://{lang}.wikipedia.org/wiki/{resolved_title.replace(' ', '_')}",
    )
    try:
        result = process_species(
            species_name=species_name,
            collection=rag_collection,
            wiki_langs=("it", "en", "fr", "es", "de", "pt"),
            alias_info=alias,
            translator_client=None,
            translation_model=DEFAULT_TRANSLATION_MODEL,
            translate_non_italian=False,
            sqlite_path=SQLITE_DB_PATH,
        )
        indexed = _sync_indexed_from_rag(species_name)
        success_message = (
            f"[ok] {species_name}: fallback without images -> "
            f"RAG status={result.get('status')} | plants.db indexed={indexed}"
        )
        print(success_message)
    except Exception as exc:
        # Best effort: keep the species visible in plants.db even though the
        # RAG step failed, flagged as not indexed.
        _ensure_plants_db_record(species_name, indexed=False)
        failure_message = (
            f"[warn] {species_name}: RAG fallback failed ({exc}); "
            "inserted placeholder in plants.db (indexed=0)"
        )
        print(failure_message)

def _load_or_fetch_image_urls(species: str) -> list[str]:
    """Return image URLs for *species*, using the on-disk cache when present.

    URLs are cached one per line in ``logs/images/<species>_images.txt``
    (spaces in the species name replaced by underscores). When no cache file
    exists, the images are searched via ``download_images`` and the extracted
    URLs are written to the cache. A failed search yields an empty list; a
    failed cache write only produces a warning and the URLs are still returned.
    """
    cache_path = f"logs/images/{species.replace(' ', '_')}_images.txt"

    # Reuse previously saved results for this species if they exist.
    if os.path.exists(cache_path):
        with open(cache_path, "r", encoding="utf-8") as f:
            # Skip blank lines so an empty string is never treated as a URL.
            urls = [line.strip() for line in f if line.strip()]
        print(f"Loaded {len(urls)} images for {species} from {cache_path}")
        return urls

    print(f"Searching images for {species}...")
    try:
        images_raw = download_images(species)
        print(f"Downloaded {len(images_raw)} images for {species}.")
        # Extract every URL before touching the cache so that a failure here
        # cannot leave a truncated cache file behind.
        urls = [url for image in images_raw if (url := _extract_image_url(image))]
    except Exception as e:
        print(f"[warn] Image download failed for {species}: {e} | continuing with RAG/plants.db only")
        return []

    # Save the results in the logs/images folder.
    try:
        os.makedirs(os.path.dirname(cache_path), exist_ok=True)
        # Write to a temporary file and rename it, so the cache path only
        # ever holds a complete list.
        tmp_path = f"{cache_path}.tmp"
        with open(tmp_path, "w", encoding="utf-8") as f:
            f.writelines(f"{url}\n" for url in urls)
        os.replace(tmp_path, cache_path)
        print(f"Saved image URLs for {species} in {cache_path}")
    except OSError as e:
        print(f"[warn] Could not cache image URLs for {species}: {e}")
    return urls


def main():
    """Process every species listed in ``missing_species.csv``.

    For each species: load (or search and cache) its image URLs, resolve its
    Wikipedia page, then either add it to FAISS/RAG/plants.db via
    ``add_to_faiss`` or, when no images are available, fall back to the
    RAG-only path. Species whose Wikipedia page cannot be resolved are
    recorded in plants.db with ``indexed=False`` and skipped.
    """
    missing_species = load_missing_species()

    RAG_DIR.mkdir(parents=True, exist_ok=True)
    ef = SentenceTransformerEmbeddingFunction(model_name=EMBEDDING_MODEL)
    rag_client = chromadb.PersistentClient(path=str(RAG_DIR))
    rag_collection = rag_client.get_or_create_collection(
        name=COLLECTION_NAME,
        embedding_function=ef,
        metadata={"hnsw:space": "cosine"},
    )

    # Wikipedia languages tried, in order, when resolving a species page.
    langs = ("it", "en", "fr", "es", "de", "pt")

    for species in missing_species:
        # Guard against empty or non-string cells (e.g. NaN) from the CSV.
        if not isinstance(species, str) or not species.strip():
            print(f"[skip] invalid species name: {species!r}")
            continue

        images = _load_or_fetch_image_urls(species)

        # Try to resolve Wikipedia page (with fallback variants)
        try:
            lang, resolved_title = resolve_title(species, "", langs)
            print(f"Resolved Wikipedia page: {lang}:{resolved_title}")
        except RuntimeError as e:
            _ensure_plants_db_record(species, indexed=False)
            print(f"[skip] {species}: {e} | inserted in plants.db with indexed=0")
            continue

        if not images:
            _fallback_add_without_images(species, rag_collection, lang, resolved_title)
            continue

        # Add to FAISS, RAG, and plants.db
        try:
            add_to_faiss(
                species,
                images,
                lang=lang,
                resolved_title=resolved_title,
            )
            print(f"[ok] {species}: added to FAISS, RAG, and plants.db")
        except Exception as e:
            # Keep going with the remaining species; one failure must not
            # abort the whole batch.
            print(f"[error] {species}: {e}")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()