#!/usr/bin/env python3 """ Populate leafsnap_aliases table in plants.db and update missing_species.csv. For each LeafSnap species not in the DB: - If a DB species with the same genus has an epithet edit-distance <= EDIT_THRESHOLD (or the full name edit-distance <= EDIT_THRESHOLD), it is saved as an alias. - Otherwise it is appended to missing_species.csv for manual review. Run: python sync_leafsnap_aliases.py [--db data/plants.db] [--cache data/leafsnap_cache.pt] [--missing missing_species.csv] [--threshold 3] [--dry-run] """ import argparse import csv import os import sqlite3 from collections import defaultdict from pathlib import Path def _edit_distance(a: str, b: str) -> int: """Simple Levenshtein distance.""" a, b = a.lower(), b.lower() if a == b: return 0 m, n = len(a), len(b) dp = list(range(n + 1)) for i in range(1, m + 1): prev = dp[:] dp[0] = i for j in range(1, n + 1): cost = 0 if a[i - 1] == b[j - 1] else 1 dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev[j - 1] + cost) return dp[n] def find_best_alias( leafsnap_label: str, db_species: set[str], db_by_genus: dict[str, list[str]], threshold: int, ) -> str | None: """Return the best-matching DB species within *threshold* edit distance, or None.""" genus = leafsnap_label.split()[0] epithet = " ".join(leafsnap_label.split()[1:]) candidates = db_by_genus.get(genus, []) if not candidates: return None best_label: str | None = None best_dist = threshold + 1 for db_sp in candidates: db_epithet = " ".join(db_sp.split()[1:]) # Compare epithet only (genus already matches) d = _edit_distance(epithet, db_epithet) if d < best_dist: best_dist = d best_label = db_sp return best_label if best_dist <= threshold else None def main() -> None: parser = argparse.ArgumentParser(description="Sync LeafSnap aliases into plants.db") parser.add_argument("--db", default=os.getenv("PLANTS_SQLITE_PATH", "data/plants.db")) parser.add_argument("--cache", default=os.getenv("LEAFSNAP_CACHE_PATH", "data/leafsnap_cache.pt")) parser.add_argument("--missing", default="missing_species.csv") parser.add_argument("--threshold", type=int, default=2, help="Max Levenshtein distance on epithet to consider an alias (default: 2)") parser.add_argument("--dry-run", action="store_true", help="Print results without writing to DB or CSV") args = parser.parse_args() import torch print(f"Loading LeafSnap cache from {args.cache} ...") ls_data = torch.load(args.cache, map_location="cpu", weights_only=False) leafsnap_species = set(ls_data["labels"]) print(f"Loading DB species from {args.db} ...") conn = sqlite3.connect(args.db) # Ensure table exists conn.execute( """ CREATE TABLE IF NOT EXISTS leafsnap_aliases ( leafsnap_label TEXT PRIMARY KEY, db_species_name TEXT NOT NULL ) """ ) conn.commit() db_species = set(row[0] for row in conn.execute("SELECT species_name FROM plants")) existing_aliases = dict(conn.execute("SELECT leafsnap_label, db_species_name FROM leafsnap_aliases")) db_by_genus: dict[str, list[str]] = defaultdict(list) for s in db_species: db_by_genus[s.split()[0]].append(s) # Classify each LeafSnap species to_alias: list[tuple[str, str]] = [] # (leafsnap_label, db_species_name) to_missing: list[str] = [] # Hardcoded typo overrides (edit distance > threshold but unambiguously same species) TYPO_OVERRIDES: dict[str, str] = { "Aesculus hippocastamon": "Aesculus hippocastanum", } for sp in sorted(leafsnap_species): if sp in db_species: continue # already in DB, no alias needed if sp in existing_aliases: print(f" [skip] {sp} -> {existing_aliases[sp]} (already in aliases)") continue if sp in TYPO_OVERRIDES: target = TYPO_OVERRIDES[sp] if target in db_species: to_alias.append((sp, target)) continue match = find_best_alias(sp, db_species, db_by_genus, args.threshold) if match: to_alias.append((sp, match)) else: to_missing.append(sp) print(f"\nNew aliases found: {len(to_alias)}") for ls_lbl, db_lbl in to_alias: print(f" {ls_lbl} -> {db_lbl}") print(f"\nUnresolvable (-> missing_species.csv): {len(to_missing)}") for sp in to_missing: print(f" {sp}") if args.dry_run: print("\n[dry-run] No changes written.") conn.close() return # Write aliases to DB if to_alias: conn.executemany( "INSERT OR REPLACE INTO leafsnap_aliases (leafsnap_label, db_species_name) VALUES (?, ?)", to_alias, ) conn.commit() print(f"\nSaved {len(to_alias)} aliases to leafsnap_aliases table.") conn.close() # Append to missing_species.csv (avoid duplicates) missing_path = Path(args.missing) existing_missing: set[str] = set() if missing_path.exists(): with open(missing_path, newline="", encoding="utf-8") as f: reader = csv.DictReader(f) for row in reader: existing_missing.add(row["species_name"]) new_missing = [sp for sp in to_missing if sp not in existing_missing] if new_missing: write_header = not missing_path.exists() or missing_path.stat().st_size == 0 with open(missing_path, "a", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=["species_name"]) if write_header: writer.writeheader() for sp in new_missing: writer.writerow({"species_name": sp}) print(f"Appended {len(new_missing)} species to {missing_path}.") else: print("No new missing species to append.") if __name__ == "__main__": main()