File size: 3,349 Bytes
39c76f7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#!/usr/bin/env python3
"""Process missing species from missing_species_alias.csv and add to RAG."""

import csv
import os
from pathlib import Path
from urllib.parse import urlparse, unquote
from dotenv import load_dotenv
import chromadb
from openai import OpenAI

from build_plant_rag import process_species

# Load settings from a .env file (python-dotenv) before the environment
# lookups below are evaluated.
load_dotenv()

# Filesystem layout: everything is anchored at the directory of this script.
BASE_DIR = Path(__file__).resolve().parent
DATA_DIR = BASE_DIR / "data"
ALIAS_CSV = BASE_DIR / "missing_species_alias.csv"

# Storage locations; each can be overridden through an environment variable.
RAG_DB_PATH = Path(os.environ.get("RAG_DB_PATH", str(DATA_DIR / "plant_rag")))
SQLITE_DB_PATH = Path(
    os.environ.get("PLANTS_SQLITE_PATH", str(DATA_DIR / "plants.db"))
)

# Translation settings forwarded to process_species.
DEFAULT_MODEL = os.environ.get("OPENAI_MODEL", "gpt-4o-mini")
TRANSLATE_TO_ITALIAN = True

def extract_wiki_info(url: str) -> dict[str, str]:
    """Extract the language code and page title from a Wikipedia article URL.

    Args:
        url: A URL of the form ``https://<lang>.wikipedia.org/wiki/<Title>``
            (mobile hosts such as ``<lang>.m.wikipedia.org`` are accepted).

    Returns:
        ``{"lang": <lang>, "title": <title>}`` with the title percent-decoded
        (underscores are kept as-is), or an empty dict when the URL is blank,
        malformed, not on a language-specific ``wikipedia.org`` host, or has
        no ``/wiki/<Title>`` path.
    """
    if not url or not url.strip():
        return {}

    try:
        parsed = urlparse(url.strip())
        # hostname drops any port/userinfo and is already lower-cased.
        host = parsed.hostname or ""
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. unbalanced "[" in the
        # host); treat those like any other unusable URL.
        return {}

    # Match the domain by its labels rather than by substring, so hosts such
    # as "wikipedia.org.evil.com" are rejected, and require a leading
    # language label so that bare "wikipedia.org" is rejected too.
    labels = host.split(".")
    if len(labels) < 3 or labels[-2:] != ["wikipedia", "org"]:
        return {}
    lang = labels[0]
    if not lang or lang == "www":
        return {}

    # Everything after the first "/wiki/" is the title; partition keeps
    # titles that themselves contain "/wiki/" intact.
    _, marker, raw_title = parsed.path.partition("/wiki/")
    title = unquote(raw_title)  # decode URL encoding
    if not marker or not title:
        return {}

    return {"lang": lang, "title": title}

def process_missing_species():
    """Read missing_species_alias.csv and process each species via RAG.

    The CSV is semicolon-delimited and is read through its ``species_name``
    and ``wikipedia_url`` columns. Rows without a species name are skipped.
    Every other row is passed to ``process_species`` together with the
    language/title parsed from its Wikipedia URL (``None`` when the URL is
    absent or cannot be parsed).

    A row counts as added when the returned mapping has ``status == "ok"``;
    any other status, or any exception raised while handling the row, counts
    as failed. Progress and a final summary are printed to stdout.
    """
    client = OpenAI()
    chroma_client = chromadb.PersistentClient(path=str(RAG_DB_PATH))
    collection = chroma_client.get_or_create_collection(name="plants")

    species_processed = 0
    species_failed = 0

    # utf-8-sig drops a leading byte-order mark (common in spreadsheet
    # exports) that would otherwise become part of the first header name and
    # make every "species_name" lookup miss. newline="" is what the csv
    # module requires so quoted fields with embedded newlines parse correctly.
    with open(ALIAS_CSV, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f, delimiter=";")
        for row in reader:
            species_name = (row.get("species_name") or "").strip()
            wiki_url = (row.get("wikipedia_url") or "").strip()

            if not species_name:
                continue

            print(f"\n[*] Processing {species_name}...")

            try:
                # Parse inside the try so one malformed URL fails only this
                # row. An unparseable URL is treated like a missing one.
                alias_info = (extract_wiki_info(wiki_url) or None) if wiki_url else None

                result = process_species(
                    species_name=species_name,
                    collection=collection,
                    wiki_langs=("it", "en"),
                    alias_info=alias_info,
                    translator_client=client,
                    translation_model=DEFAULT_MODEL,
                    translate_non_italian=TRANSLATE_TO_ITALIAN,
                    sqlite_path=SQLITE_DB_PATH,
                )

                status = result.get("status", "unknown")
                if status == "ok":
                    print("    ✓ Added to RAG")
                    species_processed += 1
                else:
                    print(f"    [!] Status: {status}")
                    species_failed += 1
            except Exception as e:
                # Per-row boundary: report the problem and continue with the
                # remaining species instead of aborting the whole batch.
                print(f"    ✗ Error: {e}")
                species_failed += 1

    print(f"\n[✓] Processing complete: {species_processed} added, {species_failed} failed")

# Script entry point: run the batch only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    process_missing_species()