GreenAssistent / sync_leafsnap_aliases.py
outshine84
fix vari
4f12e6d
#!/usr/bin/env python3
"""
Populate leafsnap_aliases table in plants.db and update missing_species.csv.
For each LeafSnap species not in the DB:
- If a DB species with the same genus has an epithet edit-distance <= --threshold
(only the epithet is compared; the genus must match exactly), it is saved as an alias.
- Otherwise it is appended to missing_species.csv for manual review.
Run:
python sync_leafsnap_aliases.py [--db data/plants.db] [--cache data/leafsnap_cache.pt]
[--missing missing_species.csv] [--threshold 2] [--dry-run]
"""
import argparse
import csv
import os
import sqlite3
from collections import defaultdict
from pathlib import Path
def _edit_distance(a: str, b: str) -> int:
"""Simple Levenshtein distance."""
a, b = a.lower(), b.lower()
if a == b:
return 0
m, n = len(a), len(b)
dp = list(range(n + 1))
for i in range(1, m + 1):
prev = dp[:]
dp[0] = i
for j in range(1, n + 1):
cost = 0 if a[i - 1] == b[j - 1] else 1
dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev[j - 1] + cost)
return dp[n]
def find_best_alias(
leafsnap_label: str,
db_species: set[str],
db_by_genus: dict[str, list[str]],
threshold: int,
) -> str | None:
"""Return the best-matching DB species within *threshold* edit distance, or None."""
genus = leafsnap_label.split()[0]
epithet = " ".join(leafsnap_label.split()[1:])
candidates = db_by_genus.get(genus, [])
if not candidates:
return None
best_label: str | None = None
best_dist = threshold + 1
for db_sp in candidates:
db_epithet = " ".join(db_sp.split()[1:])
# Compare epithet only (genus already matches)
d = _edit_distance(epithet, db_epithet)
if d < best_dist:
best_dist = d
best_label = db_sp
return best_label if best_dist <= threshold else None
def main() -> None:
    """Sync LeafSnap labels against plants.db and record the unresolved ones.

    Steps:
      1. Load the LeafSnap labels from the torch cache file (``--cache``).
      2. Load species names from the ``plants`` table and the aliases already
         stored in ``leafsnap_aliases`` (``--db``).
      3. For every label that is neither a DB species nor an existing alias,
         apply a hard-coded typo override or fall back to ``find_best_alias``.
      4. Print the outcome; unless ``--dry-run`` is given, insert the new
         aliases into ``leafsnap_aliases`` and append the unresolved labels to
         the CSV given by ``--missing``.

    With ``--dry-run`` nothing is written: the alias table is not created and
    neither the DB rows nor the CSV are touched.
    """
    parser = argparse.ArgumentParser(description="Sync LeafSnap aliases into plants.db")
    parser.add_argument("--db", default=os.getenv("PLANTS_SQLITE_PATH", "data/plants.db"))
    parser.add_argument("--cache", default=os.getenv("LEAFSNAP_CACHE_PATH", "data/leafsnap_cache.pt"))
    parser.add_argument("--missing", default="missing_species.csv")
    parser.add_argument("--threshold", type=int, default=2,
                        help="Max Levenshtein distance on epithet to consider an alias (default: 2)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print results without writing to DB or CSV")
    args = parser.parse_args()

    # Imported here rather than at module level, so importing this module
    # (e.g. to reuse find_best_alias) does not require torch.
    import torch

    print(f"Loading LeafSnap cache from {args.cache} ...")
    # SECURITY: weights_only=False lets torch.load unpickle arbitrary objects,
    # which can execute code. Only point --cache at a file you trust.
    ls_data = torch.load(args.cache, map_location="cpu", weights_only=False)
    leafsnap_species = set(ls_data["labels"])

    print(f"Loading DB species from {args.db} ...")
    conn = sqlite3.connect(args.db)
    try:
        if args.dry_run:
            # A dry run must not modify the DB, so only check whether the
            # alias table is already there instead of creating it.
            alias_table_exists = conn.execute(
                "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'leafsnap_aliases'"
            ).fetchone() is not None
        else:
            # Ensure table exists
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS leafsnap_aliases (
                    leafsnap_label TEXT PRIMARY KEY,
                    db_species_name TEXT NOT NULL
                )
                """
            )
            conn.commit()
            alias_table_exists = True

        # Skip NULL / blank names: they carry no genus and would crash the
        # grouping below.
        db_species = {
            row[0]
            for row in conn.execute("SELECT species_name FROM plants")
            if isinstance(row[0], str) and row[0].strip()
        }
        existing_aliases = (
            dict(conn.execute("SELECT leafsnap_label, db_species_name FROM leafsnap_aliases"))
            if alias_table_exists
            else {}
        )

        # Group DB species by genus. Iterating in sorted order keeps the
        # candidate lists (and therefore the matching) reproducible between
        # runs; plain set iteration order is not.
        db_by_genus: dict[str, list[str]] = defaultdict(list)
        for s in sorted(db_species):
            db_by_genus[s.split()[0]].append(s)

        # Classify each LeafSnap species
        to_alias: list[tuple[str, str]] = []  # (leafsnap_label, db_species_name)
        to_missing: list[str] = []

        # Hardcoded typo overrides (edit distance > threshold but unambiguously same species)
        TYPO_OVERRIDES: dict[str, str] = {
            "Aesculus hippocastamon": "Aesculus hippocastanum",
        }

        for sp in sorted(leafsnap_species):
            if sp in db_species:
                continue  # already in DB, no alias needed
            if sp in existing_aliases:
                print(f"  [skip] {sp} -> {existing_aliases[sp]} (already in aliases)")
                continue
            if sp in TYPO_OVERRIDES:
                target = TYPO_OVERRIDES[sp]
                if target in db_species:
                    to_alias.append((sp, target))
                    continue
                # Override target is not in the DB: fall through to fuzzy matching.
            match = find_best_alias(sp, db_species, db_by_genus, args.threshold)
            if match:
                to_alias.append((sp, match))
            else:
                to_missing.append(sp)

        print(f"\nNew aliases found: {len(to_alias)}")
        for ls_lbl, db_lbl in to_alias:
            print(f"  {ls_lbl} -> {db_lbl}")

        print(f"\nUnresolvable (-> missing_species.csv): {len(to_missing)}")
        for sp in to_missing:
            print(f"  {sp}")

        if args.dry_run:
            print("\n[dry-run] No changes written.")
            return

        # Write aliases to DB
        if to_alias:
            conn.executemany(
                "INSERT OR REPLACE INTO leafsnap_aliases (leafsnap_label, db_species_name) VALUES (?, ?)",
                to_alias,
            )
            conn.commit()
            print(f"\nSaved {len(to_alias)} aliases to leafsnap_aliases table.")
    finally:
        # Always release the connection, including on errors and the dry-run return.
        conn.close()

    # Append to missing_species.csv (avoid duplicates)
    missing_path = Path(args.missing)
    existing_missing: set[str] = set()
    if missing_path.exists():
        with open(missing_path, newline="", encoding="utf-8") as f:
            reader = csv.DictReader(f)
            for row in reader:
                existing_missing.add(row["species_name"])

    new_missing = [sp for sp in to_missing if sp not in existing_missing]
    if new_missing:
        # Only emit the header when starting a new (or empty) file.
        write_header = not missing_path.exists() or missing_path.stat().st_size == 0
        with open(missing_path, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["species_name"])
            if write_header:
                writer.writeheader()
            for sp in new_missing:
                writer.writerow({"species_name": sp})
        print(f"Appended {len(new_missing)} species to {missing_path}.")
    else:
        print("No new missing species to append.")
# Run the sync only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()