Spaces:
Sleeping
Sleeping
outshine84 committed on
Commit ·
4f12e6d
1
Parent(s): e89d6e2
fix vari
Browse files- .gitattributes +2 -0
- .gitignore +2 -1
- api.py +65 -6
- build_plant_rag.py +47 -3
- build_plants_sqlite.py +20 -1
- data/plants.db +2 -2
- plentclef.py +56 -17
- pwa-app/src/App.jsx +144 -1
- pwa-app/src/api.js +29 -7
- sync_leafsnap_aliases.py +181 -0
.gitattributes
CHANGED
|
@@ -36,3 +36,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 36 |
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
*.faiss filter=lfs diff=lfs merge=lfs -text
|
| 38 |
*.db filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 36 |
*.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
*.faiss filter=lfs diff=lfs merge=lfs -text
|
| 38 |
*.db filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
data/leafsnap_*.faiss filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
data/leafsnap_*.pt filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -7,4 +7,5 @@ client_secret*
|
|
| 7 |
pwa-app/.env
|
| 8 |
missing_species_alias.csv
|
| 9 |
unique_species_labels.csv
|
| 10 |
-
data/images/
|
|
|
|
|
|
| 7 |
pwa-app/.env
|
| 8 |
missing_species_alias.csv
|
| 9 |
unique_species_labels.csv
|
| 10 |
+
data/images/
|
| 11 |
+
missing_species.csv
|
api.py
CHANGED
|
@@ -28,6 +28,8 @@ load_dotenv()
|
|
| 28 |
INDEX_PATH = os.getenv("PLANCLEF_INDEX_PATH", "data/planclef.faiss")
|
| 29 |
CACHE_PATH = os.getenv("PLANCLEF_CACHE_PATH", "data/planclef_cache.pt")
|
| 30 |
MODEL_NAME = os.getenv("PLANCLEF_MODEL_NAME", "ViT-B-32")
|
|
|
|
|
|
|
| 31 |
RAG_DB_PATH = os.getenv("RAG_DB_PATH", "data/plant_rag")
|
| 32 |
WIKI_USER_AGENT = os.getenv(
|
| 33 |
"WIKI_USER_AGENT",
|
|
@@ -170,6 +172,20 @@ def _species_to_folder_name(species_name: str) -> str:
|
|
| 170 |
|
| 171 |
|
| 172 |
def _get_species_preview_image_url(species_name: str) -> str:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 173 |
try:
|
| 174 |
collection = get_rag_collection()
|
| 175 |
res = collection.get(
|
|
@@ -265,9 +281,37 @@ def get_plants_db_connection() -> sqlite3.Connection:
|
|
| 265 |
|
| 266 |
conn = sqlite3.connect(db_path)
|
| 267 |
conn.row_factory = sqlite3.Row
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
return conn
|
| 269 |
|
| 270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 271 |
def _sqlite_table_exists(conn: sqlite3.Connection, table_name: str) -> bool:
|
| 272 |
row = conn.execute(
|
| 273 |
"SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ? LIMIT 1",
|
|
@@ -790,10 +834,23 @@ def get_index():
|
|
| 790 |
try:
|
| 791 |
from plentclef import PlentClefIndex
|
| 792 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 793 |
index = PlentClefIndex(
|
| 794 |
model_name=MODEL_NAME,
|
| 795 |
index_path=INDEX_PATH,
|
| 796 |
index_cache=CACHE_PATH,
|
|
|
|
|
|
|
|
|
|
| 797 |
)
|
| 798 |
except Exception as e:
|
| 799 |
cause = f"{type(e).__name__}: {e}"
|
|
@@ -1179,12 +1236,14 @@ def plant_info(
|
|
| 1179 |
|
| 1180 |
title = first_meta.get("species_name", name)
|
| 1181 |
common_name = first_meta.get("common_name", "")
|
| 1182 |
-
|
| 1183 |
-
|
| 1184 |
-
|
| 1185 |
-
|
| 1186 |
-
|
| 1187 |
-
|
|
|
|
|
|
|
| 1188 |
|
| 1189 |
# Combine chunks for OpenAI context (up to 6000 chars)
|
| 1190 |
documents = results.get("documents", [])
|
|
|
|
| 28 |
INDEX_PATH = os.getenv("PLANCLEF_INDEX_PATH", "data/planclef.faiss")
|
| 29 |
CACHE_PATH = os.getenv("PLANCLEF_CACHE_PATH", "data/planclef_cache.pt")
|
| 30 |
MODEL_NAME = os.getenv("PLANCLEF_MODEL_NAME", "ViT-B-32")
|
| 31 |
+
LEAFSNAP_INDEX_PATH = os.getenv("LEAFSNAP_INDEX_PATH", "data/leafsnap_field.faiss")
|
| 32 |
+
LEAFSNAP_CACHE_PATH = os.getenv("LEAFSNAP_CACHE_PATH", "data/leafsnap_cache.pt")
|
| 33 |
RAG_DB_PATH = os.getenv("RAG_DB_PATH", "data/plant_rag")
|
| 34 |
WIKI_USER_AGENT = os.getenv(
|
| 35 |
"WIKI_USER_AGENT",
|
|
|
|
| 172 |
|
| 173 |
|
| 174 |
def _get_species_preview_image_url(species_name: str) -> str:
|
| 175 |
+
image_paths = _get_species_images_from_db(species_name)
|
| 176 |
+
for raw_path in image_paths:
|
| 177 |
+
if isinstance(raw_path, str) and raw_path.startswith(("http://", "https://")):
|
| 178 |
+
return raw_path
|
| 179 |
+
|
| 180 |
+
normalized_path = _normalize_image_path(str(raw_path or ""))
|
| 181 |
+
if not normalized_path:
|
| 182 |
+
continue
|
| 183 |
+
|
| 184 |
+
local_path = Path("data") / "images" / normalized_path
|
| 185 |
+
if local_path.exists():
|
| 186 |
+
return f"/images/{normalized_path}"
|
| 187 |
+
|
| 188 |
+
# Backward compatibility: read from legacy RAG metadata if DB is empty.
|
| 189 |
try:
|
| 190 |
collection = get_rag_collection()
|
| 191 |
res = collection.get(
|
|
|
|
| 281 |
|
| 282 |
conn = sqlite3.connect(db_path)
|
| 283 |
conn.row_factory = sqlite3.Row
|
| 284 |
+
try:
|
| 285 |
+
conn.execute("ALTER TABLE plants ADD COLUMN image_paths TEXT")
|
| 286 |
+
conn.commit()
|
| 287 |
+
except Exception:
|
| 288 |
+
pass
|
| 289 |
return conn
|
| 290 |
|
| 291 |
|
| 292 |
+
def _get_species_images_from_db(species_name: str) -> list[str]:
    """Return the image paths stored for *species_name* in the plants table.

    The ``plants.image_paths`` column is read as a JSON-encoded list. The
    species lookup is case-insensitive and ignores surrounding whitespace.

    Args:
        species_name: Species name to look up.

    Returns:
        The non-empty, stripped entries of the stored list. An empty list is
        returned when the species is unknown, the column is empty, or the
        stored value is not a JSON list.
    """
    query = "SELECT image_paths FROM plants WHERE lower(species_name) = lower(?) LIMIT 1"

    # A sqlite3 connection used as a context manager only commits or rolls
    # back; it never closes the handle. Close it explicitly so each call does
    # not leave a connection open.
    conn = get_plants_db_connection()
    try:
        row = conn.execute(query, (species_name.strip(),)).fetchone()
    finally:
        conn.close()

    if row is None:
        return []

    raw = row["image_paths"] if "image_paths" in row.keys() else None
    if not raw:
        return []

    try:
        parsed = json.loads(raw)
    except (json.JSONDecodeError, TypeError):
        return []

    if not isinstance(parsed, list):
        return []

    cleaned: list[str] = []
    for value in parsed:
        # JSON null would otherwise become the literal path "None".
        if value is None:
            continue
        text = str(value).strip()
        if text:
            cleaned.append(text)
    return cleaned
|
| 313 |
+
|
| 314 |
+
|
| 315 |
def _sqlite_table_exists(conn: sqlite3.Connection, table_name: str) -> bool:
|
| 316 |
row = conn.execute(
|
| 317 |
"SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ? LIMIT 1",
|
|
|
|
| 834 |
try:
|
| 835 |
from plentclef import PlentClefIndex
|
| 836 |
|
| 837 |
+
leafsnap_aliases: dict[str, str] = {}
|
| 838 |
+
try:
|
| 839 |
+
with sqlite3.connect(PLANTS_SQLITE_PATH) as _conn:
|
| 840 |
+
rows = _conn.execute(
|
| 841 |
+
"SELECT leafsnap_label, db_species_name FROM leafsnap_aliases"
|
| 842 |
+
).fetchall()
|
| 843 |
+
leafsnap_aliases = {r[0]: r[1] for r in rows}
|
| 844 |
+
except Exception:
|
| 845 |
+
pass # table may not exist yet; aliases simply won't be applied
|
| 846 |
+
|
| 847 |
index = PlentClefIndex(
|
| 848 |
model_name=MODEL_NAME,
|
| 849 |
index_path=INDEX_PATH,
|
| 850 |
index_cache=CACHE_PATH,
|
| 851 |
+
leafsnap_index_path=LEAFSNAP_INDEX_PATH,
|
| 852 |
+
leafsnap_cache_path=LEAFSNAP_CACHE_PATH,
|
| 853 |
+
leafsnap_aliases=leafsnap_aliases,
|
| 854 |
)
|
| 855 |
except Exception as e:
|
| 856 |
cause = f"{type(e).__name__}: {e}"
|
|
|
|
| 1236 |
|
| 1237 |
title = first_meta.get("species_name", name)
|
| 1238 |
common_name = first_meta.get("common_name", "")
|
| 1239 |
+
image_paths = _get_species_images_from_db(name)
|
| 1240 |
+
if not image_paths:
|
| 1241 |
+
# Backward compatibility with old RAG metadata layout.
|
| 1242 |
+
image_paths_json = first_meta.get("image_paths", "[]")
|
| 1243 |
+
try:
|
| 1244 |
+
image_paths = json.loads(image_paths_json)
|
| 1245 |
+
except (json.JSONDecodeError, TypeError):
|
| 1246 |
+
image_paths = []
|
| 1247 |
|
| 1248 |
# Combine chunks for OpenAI context (up to 6000 chars)
|
| 1249 |
documents = results.get("documents", [])
|
build_plant_rag.py
CHANGED
|
@@ -13,7 +13,8 @@ Notes
|
|
| 13 |
- Tries Italian Wikipedia first, falls back to English.
|
| 14 |
- Skips sections: Note, Bibliografia, Voci correlate, Altri progetti,
|
| 15 |
Collegamenti esterni (and their English equivalents).
|
| 16 |
-
- Metadata per chunk: species_name, common_name,
|
|
|
|
| 17 |
"""
|
| 18 |
|
| 19 |
import csv
|
|
@@ -25,6 +26,7 @@ import sqlite3
|
|
| 25 |
import sys
|
| 26 |
import time
|
| 27 |
import urllib.parse
|
|
|
|
| 28 |
from pathlib import Path
|
| 29 |
from typing import Optional
|
| 30 |
|
|
@@ -407,6 +409,46 @@ def collect_images(species_name: str, lang: str) -> list[str]:
|
|
| 407 |
return fetch_wiki_image_urls(species_name, lang)[:MAX_IMAGES]
|
| 408 |
|
| 409 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
# ---------------------------------------------------------------------------
|
| 411 |
# Core processing
|
| 412 |
# ---------------------------------------------------------------------------
|
|
@@ -419,6 +461,7 @@ def process_species(
|
|
| 419 |
translator_client: OpenAI | None,
|
| 420 |
translation_model: str,
|
| 421 |
translate_non_italian: bool,
|
|
|
|
| 422 |
) -> dict:
|
| 423 |
slug = slugify(species_name)
|
| 424 |
|
|
@@ -481,6 +524,7 @@ def process_species(
|
|
| 481 |
|
| 482 |
# --- Images ---
|
| 483 |
image_paths = collect_images(resolved_title, lang)
|
|
|
|
| 484 |
|
| 485 |
# --- Chunk & upsert into ChromaDB ---
|
| 486 |
chunks = chunk_by_words(full_text)
|
|
@@ -492,7 +536,6 @@ def process_species(
|
|
| 492 |
{
|
| 493 |
"species_name": species_name,
|
| 494 |
"common_name": common_name,
|
| 495 |
-
"image_paths": json.dumps(image_paths),
|
| 496 |
"chunk_index": i,
|
| 497 |
"lang": lang,
|
| 498 |
"source_lang": lang,
|
|
@@ -643,6 +686,7 @@ def main() -> None:
|
|
| 643 |
translator_client=translator_client,
|
| 644 |
translation_model=args.translation_model,
|
| 645 |
translate_non_italian=args.translate_non_italian,
|
|
|
|
| 646 |
)
|
| 647 |
progress[species] = result
|
| 648 |
except Exception as exc:
|
|
@@ -669,7 +713,7 @@ def main() -> None:
|
|
| 669 |
print(f" Errors : {errors}")
|
| 670 |
print(f" Total docs : {collection.count()}")
|
| 671 |
print(f" ChromaDB : {RAG_DIR}")
|
| 672 |
-
print(" Images : stored
|
| 673 |
|
| 674 |
|
| 675 |
if __name__ == "__main__":
|
|
|
|
| 13 |
- Tries Italian Wikipedia first, falls back to English.
|
| 14 |
- Skips sections: Note, Bibliografia, Voci correlate, Altri progetti,
|
| 15 |
Collegamenti esterni (and their English equivalents).
|
| 16 |
+
- Metadata per chunk: species_name, common_name, lang.
|
| 17 |
+
- Image paths per species are stored in plants.db (plants.image_paths JSON).
|
| 18 |
"""
|
| 19 |
|
| 20 |
import csv
|
|
|
|
| 26 |
import sys
|
| 27 |
import time
|
| 28 |
import urllib.parse
|
| 29 |
+
from datetime import datetime, timezone
|
| 30 |
from pathlib import Path
|
| 31 |
from typing import Optional
|
| 32 |
|
|
|
|
| 409 |
return fetch_wiki_image_urls(species_name, lang)[:MAX_IMAGES]
|
| 410 |
|
| 411 |
|
| 412 |
+
def save_species_images_to_sqlite(sqlite_path: Path, species_name: str, image_paths: list[str]) -> None:
    """Store per-species image paths in plants.db for API/UI consumption.

    The paths are written as a JSON string into ``plants.image_paths``. An
    existing row keeps all of its other columns; only ``image_paths`` and
    ``updated_at`` are overwritten. Nothing is written when the database file
    or the ``plants`` table does not exist.

    Args:
        sqlite_path: Location of the SQLite database file.
        species_name: Value of ``plants.species_name`` to insert or update.
        image_paths: Image paths or URLs to store for the species.

    Raises:
        sqlite3.Error: If the migration or the upsert fails for any reason
            other than the ``image_paths`` column already existing.
    """
    sqlite_path.parent.mkdir(parents=True, exist_ok=True)

    if not sqlite_path.exists():
        return

    conn = sqlite3.connect(sqlite_path)
    try:
        table_row = conn.execute(
            "SELECT 1 FROM sqlite_master WHERE type='table' AND name='plants' LIMIT 1"
        ).fetchone()
        if table_row is None:
            return

        # Migration for databases created before image_paths existed. Only the
        # expected "column already there" failure is ignored; any other error
        # (locked or read-only database, corruption) is raised to the caller.
        try:
            conn.execute("ALTER TABLE plants ADD COLUMN image_paths TEXT")
        except sqlite3.OperationalError as exc:
            if "duplicate column name" not in str(exc).lower():
                raise

        payload = json.dumps(image_paths, ensure_ascii=False)
        now_iso = datetime.now(timezone.utc).isoformat()
        conn.execute(
            """
            INSERT INTO plants (species_name, image_paths, updated_at)
            VALUES (?, ?, ?)
            ON CONFLICT(species_name) DO UPDATE SET
                image_paths=excluded.image_paths,
                updated_at=excluded.updated_at
            """,
            (species_name, payload, now_iso),
        )
        conn.commit()
    finally:
        conn.close()
|
| 450 |
+
|
| 451 |
+
|
| 452 |
# ---------------------------------------------------------------------------
|
| 453 |
# Core processing
|
| 454 |
# ---------------------------------------------------------------------------
|
|
|
|
| 461 |
translator_client: OpenAI | None,
|
| 462 |
translation_model: str,
|
| 463 |
translate_non_italian: bool,
|
| 464 |
+
sqlite_path: Path,
|
| 465 |
) -> dict:
|
| 466 |
slug = slugify(species_name)
|
| 467 |
|
|
|
|
| 524 |
|
| 525 |
# --- Images ---
|
| 526 |
image_paths = collect_images(resolved_title, lang)
|
| 527 |
+
save_species_images_to_sqlite(sqlite_path, species_name, image_paths)
|
| 528 |
|
| 529 |
# --- Chunk & upsert into ChromaDB ---
|
| 530 |
chunks = chunk_by_words(full_text)
|
|
|
|
| 536 |
{
|
| 537 |
"species_name": species_name,
|
| 538 |
"common_name": common_name,
|
|
|
|
| 539 |
"chunk_index": i,
|
| 540 |
"lang": lang,
|
| 541 |
"source_lang": lang,
|
|
|
|
| 686 |
translator_client=translator_client,
|
| 687 |
translation_model=args.translation_model,
|
| 688 |
translate_non_italian=args.translate_non_italian,
|
| 689 |
+
sqlite_path=Path(args.sqlite_path),
|
| 690 |
)
|
| 691 |
progress[species] = result
|
| 692 |
except Exception as exc:
|
|
|
|
| 713 |
print(f" Errors : {errors}")
|
| 714 |
print(f" Total docs : {collection.count()}")
|
| 715 |
print(f" ChromaDB : {RAG_DIR}")
|
| 716 |
+
print(" Images : stored in plants.db (plants.image_paths JSON)")
|
| 717 |
|
| 718 |
|
| 719 |
if __name__ == "__main__":
|
build_plants_sqlite.py
CHANGED
|
@@ -58,6 +58,7 @@ def init_db(conn: sqlite3.Connection) -> None:
|
|
| 58 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 59 |
species_name TEXT NOT NULL UNIQUE,
|
| 60 |
indexed INTEGER NOT NULL DEFAULT 0,
|
|
|
|
| 61 |
annaffiatura_gg INTEGER,
|
| 62 |
annaffiatura_time TEXT,
|
| 63 |
luce TEXT,
|
|
@@ -72,6 +73,20 @@ def init_db(conn: sqlite3.Connection) -> None:
|
|
| 72 |
)
|
| 73 |
"""
|
| 74 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
conn.commit()
|
| 76 |
|
| 77 |
|
|
@@ -324,6 +339,7 @@ def upsert_plant(
|
|
| 324 |
species_name: str,
|
| 325 |
indexed: bool,
|
| 326 |
profile: dict | None,
|
|
|
|
| 327 |
) -> None:
|
| 328 |
now_iso = datetime.now(timezone.utc).isoformat()
|
| 329 |
profile = profile or {}
|
|
@@ -333,6 +349,7 @@ def upsert_plant(
|
|
| 333 |
INSERT INTO plants (
|
| 334 |
species_name,
|
| 335 |
indexed,
|
|
|
|
| 336 |
annaffiatura_gg,
|
| 337 |
annaffiatura_time,
|
| 338 |
luce,
|
|
@@ -345,9 +362,10 @@ def upsert_plant(
|
|
| 345 |
prevenzione,
|
| 346 |
updated_at
|
| 347 |
)
|
| 348 |
-
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 349 |
ON CONFLICT(species_name) DO UPDATE SET
|
| 350 |
indexed=excluded.indexed,
|
|
|
|
| 351 |
annaffiatura_gg=excluded.annaffiatura_gg,
|
| 352 |
annaffiatura_time=excluded.annaffiatura_time,
|
| 353 |
luce=excluded.luce,
|
|
@@ -363,6 +381,7 @@ def upsert_plant(
|
|
| 363 |
(
|
| 364 |
species_name,
|
| 365 |
1 if indexed else 0,
|
|
|
|
| 366 |
profile.get("annaffiatura_gg"),
|
| 367 |
profile.get("annaffiatura_time"),
|
| 368 |
profile.get("luce"),
|
|
|
|
| 58 |
id INTEGER PRIMARY KEY AUTOINCREMENT,
|
| 59 |
species_name TEXT NOT NULL UNIQUE,
|
| 60 |
indexed INTEGER NOT NULL DEFAULT 0,
|
| 61 |
+
image_paths TEXT,
|
| 62 |
annaffiatura_gg INTEGER,
|
| 63 |
annaffiatura_time TEXT,
|
| 64 |
luce TEXT,
|
|
|
|
| 73 |
)
|
| 74 |
"""
|
| 75 |
)
|
| 76 |
+
# Migration for existing DBs created before image_paths support.
|
| 77 |
+
try:
|
| 78 |
+
conn.execute("ALTER TABLE plants ADD COLUMN image_paths TEXT")
|
| 79 |
+
conn.commit()
|
| 80 |
+
except Exception:
|
| 81 |
+
pass
|
| 82 |
+
conn.execute(
|
| 83 |
+
"""
|
| 84 |
+
CREATE TABLE IF NOT EXISTS leafsnap_aliases (
|
| 85 |
+
leafsnap_label TEXT PRIMARY KEY,
|
| 86 |
+
db_species_name TEXT NOT NULL
|
| 87 |
+
)
|
| 88 |
+
"""
|
| 89 |
+
)
|
| 90 |
conn.commit()
|
| 91 |
|
| 92 |
|
|
|
|
| 339 |
species_name: str,
|
| 340 |
indexed: bool,
|
| 341 |
profile: dict | None,
|
| 342 |
+
image_paths: str | None = None,
|
| 343 |
) -> None:
|
| 344 |
now_iso = datetime.now(timezone.utc).isoformat()
|
| 345 |
profile = profile or {}
|
|
|
|
| 349 |
INSERT INTO plants (
|
| 350 |
species_name,
|
| 351 |
indexed,
|
| 352 |
+
image_paths,
|
| 353 |
annaffiatura_gg,
|
| 354 |
annaffiatura_time,
|
| 355 |
luce,
|
|
|
|
| 362 |
prevenzione,
|
| 363 |
updated_at
|
| 364 |
)
|
| 365 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
|
| 366 |
ON CONFLICT(species_name) DO UPDATE SET
|
| 367 |
indexed=excluded.indexed,
|
| 368 |
+
image_paths=COALESCE(excluded.image_paths, plants.image_paths),
|
| 369 |
annaffiatura_gg=excluded.annaffiatura_gg,
|
| 370 |
annaffiatura_time=excluded.annaffiatura_time,
|
| 371 |
luce=excluded.luce,
|
|
|
|
| 381 |
(
|
| 382 |
species_name,
|
| 383 |
1 if indexed else 0,
|
| 384 |
+
image_paths,
|
| 385 |
profile.get("annaffiatura_gg"),
|
| 386 |
profile.get("annaffiatura_time"),
|
| 387 |
profile.get("luce"),
|
data/plants.db
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:be66035a5e96a147ec7c9859bdf91e40c8fe9e13b383d76b858b2c2a39b0f34e
|
| 3 |
+
size 2465792
|
plentclef.py
CHANGED
|
@@ -27,8 +27,28 @@ def _load_plantclef_cache(cache_path):
|
|
| 27 |
with open(cache_path, "rb") as f:
|
| 28 |
return pickle.load(f)
|
| 29 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
class PlentClefIndex():
|
| 31 |
-
def __init__(self, model_name, index_path,index_cache
|
|
|
|
|
|
|
| 32 |
self.model, self.preprocess, self.tokenizer = open_clip.create_model_and_transforms(
|
| 33 |
model_name=model_name,
|
| 34 |
pretrained="laion2b_s34b_b79k"
|
|
@@ -43,29 +63,48 @@ class PlentClefIndex():
|
|
| 43 |
raise KeyError("Missing 'labels' in PlantCLEF cache")
|
| 44 |
self.plantclef_labels = data["labels"]
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
with torch.no_grad():
|
| 49 |
e = self.model.encode_image(img)
|
| 50 |
e = e / e.norm(dim=-1, keepdim=True)
|
| 51 |
return e.cpu().numpy().astype("float32")
|
| 52 |
|
| 53 |
-
def
|
| 54 |
-
|
| 55 |
-
sims, idxs =
|
| 56 |
-
|
| 57 |
-
aggregated_results = defaultdict(lambda: {'score_sum': 0.0, 'image_paths': []})
|
| 58 |
-
|
| 59 |
for score, idx in zip(sims[0], idxs[0]):
|
| 60 |
species_label = labels[idx]
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
final_results = []
|
| 65 |
-
for category, data in aggregated_results.items():
|
| 66 |
-
final_results.append((category, data['score_sum'], data['image_paths']))
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
-
return
|
|
|
|
| 27 |
with open(cache_path, "rb") as f:
|
| 28 |
return pickle.load(f)
|
| 29 |
|
| 30 |
+
def _rrf_merge(results_list: list[list], k: int = 60) -> list[tuple]:
|
| 31 |
+
"""Reciprocal Rank Fusion across multiple ranked result lists.
|
| 32 |
+
|
| 33 |
+
Each element of *results_list* is a list of (species, score, image_paths)
|
| 34 |
+
tuples already sorted by descending score. Returns a merged list sorted
|
| 35 |
+
by descending RRF score, same tuple format (image_paths will be empty).
|
| 36 |
+
"""
|
| 37 |
+
combined: dict[str, float] = defaultdict(float)
|
| 38 |
+
for ranked in results_list:
|
| 39 |
+
for rank, (species, _score, _paths) in enumerate(ranked):
|
| 40 |
+
combined[species] += 1.0 / (k + rank + 1)
|
| 41 |
+
return sorted(
|
| 42 |
+
[(species, rrf_score, []) for species, rrf_score in combined.items()],
|
| 43 |
+
key=lambda x: x[1],
|
| 44 |
+
reverse=True,
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
+
|
| 48 |
class PlentClefIndex():
|
| 49 |
+
def __init__(self, model_name, index_path, index_cache,
|
| 50 |
+
leafsnap_index_path=None, leafsnap_cache_path=None,
|
| 51 |
+
leafsnap_aliases: dict | None = None):
|
| 52 |
self.model, self.preprocess, self.tokenizer = open_clip.create_model_and_transforms(
|
| 53 |
model_name=model_name,
|
| 54 |
pretrained="laion2b_s34b_b79k"
|
|
|
|
| 63 |
raise KeyError("Missing 'labels' in PlantCLEF cache")
|
| 64 |
self.plantclef_labels = data["labels"]
|
| 65 |
|
| 66 |
+
# Optional LeafSnap index (leaf-only images, same embedding space)
|
| 67 |
+
self.leafsnap_index = None
|
| 68 |
+
self.leafsnap_labels: list = []
|
| 69 |
+
if leafsnap_index_path and os.path.exists(leafsnap_index_path):
|
| 70 |
+
self.leafsnap_index = faiss.read_index(leafsnap_index_path)
|
| 71 |
+
if leafsnap_cache_path and os.path.exists(leafsnap_cache_path):
|
| 72 |
+
ls_data = _load_plantclef_cache(leafsnap_cache_path)
|
| 73 |
+
if not isinstance(ls_data, dict) or "labels" not in ls_data:
|
| 74 |
+
raise KeyError("Missing 'labels' in LeafSnap cache")
|
| 75 |
+
self.leafsnap_labels = ls_data["labels"]
|
| 76 |
+
# Dict mapping LeafSnap label -> canonical DB species name
|
| 77 |
+
self.leafsnap_aliases: dict[str, str] = leafsnap_aliases or {}
|
| 78 |
+
|
| 79 |
+
def embed_image(self, path):
    """Return the unit-normalised image embedding for the image at *path*.

    The image is converted to RGB, run through ``self.preprocess`` and
    ``self.model.encode_image`` without gradient tracking, and divided by its
    norm along the last dimension.

    Returns:
        A float32 NumPy array. A leading batch dimension of size 1 is added
        before encoding, so the result is presumably shaped ``(1, dim)``;
        confirm against the model in use.
    """
    # Image.open is lazy and may keep the underlying file open (for example
    # with multi-frame formats). The context manager releases the handle once
    # convert() has produced an independent in-memory copy.
    with Image.open(path) as source:
        rgb = source.convert("RGB")
    img = self.preprocess(rgb).unsqueeze(0)
    with torch.no_grad():
        e = self.model.encode_image(img)
        e = e / e.norm(dim=-1, keepdim=True)
    return e.cpu().numpy().astype("float32")
|
| 85 |
|
| 86 |
+
def _search_index(self, q, index, labels, k):
|
| 87 |
+
"""Search a single FAISS index and aggregate scores by species."""
|
| 88 |
+
sims, idxs = index.search(q, k)
|
| 89 |
+
aggregated: dict = defaultdict(lambda: {'score_sum': 0.0, 'image_paths': []})
|
|
|
|
|
|
|
| 90 |
for score, idx in zip(sims[0], idxs[0]):
|
| 91 |
species_label = labels[idx]
|
| 92 |
+
species_label = self.leafsnap_aliases.get(species_label, species_label)
|
| 93 |
+
aggregated[species_label]['score_sum'] += score
|
| 94 |
+
results = [
|
| 95 |
+
(cat, d['score_sum'], d['image_paths'])
|
| 96 |
+
for cat, d in aggregated.items()
|
| 97 |
+
]
|
| 98 |
+
results.sort(key=lambda x: x[1], reverse=True)
|
| 99 |
+
return results
|
| 100 |
+
|
| 101 |
+
def search(self, path, labels, k=5):
    """Return up to *k* species matches for the image at *path*.

    The image is embedded once. The main index (``self.index`` with
    *labels*) is always searched. When a LeafSnap index and its labels are
    loaded, it is searched with the same embedding and the two rankings are
    combined with ``_rrf_merge``.

    Returns:
        At most *k* ``(species, score, image_paths)`` tuples, best first.
    """
    query = self.embed_image(path)
    primary = self._search_index(query, self.index, labels, k)

    leafsnap_ready = self.leafsnap_index is not None and bool(self.leafsnap_labels)
    if not leafsnap_ready:
        return primary[:k]

    secondary = self._search_index(query, self.leafsnap_index, self.leafsnap_labels, k)
    fused = _rrf_merge([primary, secondary])
    return fused[:k]
|
pwa-app/src/App.jsx
CHANGED
|
@@ -16,6 +16,7 @@ import {
|
|
| 16 |
updateMyPlantFirstWaterDate,
|
| 17 |
uploadMyPlantPhoto,
|
| 18 |
verifyGoogleToken,
|
|
|
|
| 19 |
toAbsoluteImage,
|
| 20 |
toOptimizedImage
|
| 21 |
} from "./api";
|
|
@@ -111,6 +112,21 @@ function formatISOToInputDate(value) {
|
|
| 111 |
return `${year}-${month}-${day}`;
|
| 112 |
}
|
| 113 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
export default function App({ googleClientIdConfigured = false }) {
|
| 115 |
const [auth, setAuth] = useState(null);
|
| 116 |
const [authBusy, setAuthBusy] = useState(false);
|
|
@@ -149,6 +165,7 @@ export default function App({ googleClientIdConfigured = false }) {
|
|
| 149 |
const [uploadingPhotoId, setUploadingPhotoId] = useState(null);
|
| 150 |
const plantPhotoInputRef = useRef(null);
|
| 151 |
const plantPhotoTargetIdRef = useRef(null);
|
|
|
|
| 152 |
|
| 153 |
const [busy, setBusy] = useState({
|
| 154 |
search: false,
|
|
@@ -260,6 +277,83 @@ export default function App({ googleClientIdConfigured = false }) {
|
|
| 260 |
.filter((entry) => entry.value !== null && entry.value !== "");
|
| 261 |
}, [myPlantProfile]);
|
| 262 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 263 |
useEffect(() => {
|
| 264 |
const raw = window.localStorage.getItem(AUTH_STORAGE_KEY);
|
| 265 |
if (!raw) {
|
|
@@ -281,6 +375,37 @@ export default function App({ googleClientIdConfigured = false }) {
|
|
| 281 |
}
|
| 282 |
}, []);
|
| 283 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 284 |
useEffect(() => {
|
| 285 |
setExpandedProfileKey("");
|
| 286 |
}, [plantProfile]);
|
|
@@ -550,7 +675,25 @@ export default function App({ googleClientIdConfigured = false }) {
|
|
| 550 |
try {
|
| 551 |
const data = await saveMyPlant(selectedSpecies, trimmed);
|
| 552 |
const saved = data.saved || null;
|
| 553 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 554 |
setUserPlantName("");
|
| 555 |
await loadMyPlants();
|
| 556 |
} catch (err) {
|
|
|
|
| 16 |
updateMyPlantFirstWaterDate,
|
| 17 |
uploadMyPlantPhoto,
|
| 18 |
verifyGoogleToken,
|
| 19 |
+
setUnauthorizedHandler,
|
| 20 |
toAbsoluteImage,
|
| 21 |
toOptimizedImage
|
| 22 |
} from "./api";
|
|
|
|
| 112 |
return `${year}-${month}-${day}`;
|
| 113 |
}
|
| 114 |
|
| 115 |
+
/**
 * Decode the payload (second dot-separated segment) of a JWT without
 * verifying its signature.
 *
 * @param {string} token - Compact JWT string; falsy values are treated as "".
 * @returns {object|null} The parsed payload, or null when the token has fewer
 *   than two segments or the segment is not valid base64url-encoded JSON.
 */
function parseJwtPayload(token) {
  try {
    const parts = String(token || "").split(".");
    if (parts.length < 2) {
      return null;
    }
    // base64url -> base64, then restore the padding that JWTs omit.
    const base64 = parts[1].replace(/-/g, "+").replace(/_/g, "/");
    const padded = base64 + "=".repeat((4 - (base64.length % 4)) % 4);
    // atob yields one character per byte; the payload itself is UTF-8, so
    // decode the bytes explicitly or non-ASCII claims come out garbled.
    const binary = globalThis.atob(padded);
    const bytes = Uint8Array.from(binary, (ch) => ch.charCodeAt(0));
    return JSON.parse(new TextDecoder().decode(bytes));
  } catch {
    return null;
  }
}
|
| 129 |
+
|
| 130 |
export default function App({ googleClientIdConfigured = false }) {
|
| 131 |
const [auth, setAuth] = useState(null);
|
| 132 |
const [authBusy, setAuthBusy] = useState(false);
|
|
|
|
| 165 |
const [uploadingPhotoId, setUploadingPhotoId] = useState(null);
|
| 166 |
const plantPhotoInputRef = useRef(null);
|
| 167 |
const plantPhotoTargetIdRef = useRef(null);
|
| 168 |
+
const refreshPromiseRef = useRef(null);
|
| 169 |
|
| 170 |
const [busy, setBusy] = useState({
|
| 171 |
search: false,
|
|
|
|
| 277 |
.filter((entry) => entry.value !== null && entry.value !== "");
|
| 278 |
}, [myPlantProfile]);
|
| 279 |
|
| 280 |
+
// Try to obtain a fresh Google ID token through Google Identity Services
// using the client id found in the current token. Resolves to true when a
// new token was verified and stored, false otherwise.
async function refreshGoogleSessionSilently() {
  // Share a refresh that is already in flight instead of starting another.
  if (refreshPromiseRef.current) {
    return refreshPromiseRef.current;
  }

  // The client id is read from the "aud" claim of the token we currently hold.
  const currentToken = auth?.idToken || "";
  const payload = parseJwtPayload(currentToken);
  const clientId = String(payload?.aud || "").trim();

  // Without a client id or the Google Identity Services script there is
  // nothing to refresh with.
  if (!clientId || !window.google?.accounts?.id) {
    return false;
  }

  const refreshPromise = new Promise((resolve) => {
    // finish() resolves the promise at most once; later calls are ignored.
    let done = false;
    const finish = (ok) => {
      if (done) {
        return;
      }
      done = true;
      resolve(ok);
    };

    // Give up after 8 seconds if neither callback below reports a result.
    const timeoutId = window.setTimeout(() => finish(false), 8000);

    try {
      window.google.accounts.id.initialize({
        client_id: clientId,
        auto_select: true,
        cancel_on_tap_outside: false,
        callback: async (credentialResponse) => {
          const idToken = credentialResponse?.credential || "";
          if (!idToken) {
            window.clearTimeout(timeoutId);
            finish(false);
            return;
          }

          try {
            // Have the backend verify the new token before storing it in
            // state, the API client and localStorage.
            const data = await verifyGoogleToken(idToken);
            const nextAuth = {
              idToken,
              user: data.user || auth?.user || null,
              expiresAt: data.expires_at || null
            };
            setAuth(nextAuth);
            setAuthToken(idToken);
            window.localStorage.setItem(AUTH_STORAGE_KEY, JSON.stringify(nextAuth));
            window.clearTimeout(timeoutId);
            finish(true);
          } catch {
            window.clearTimeout(timeoutId);
            finish(false);
          }
        }
      });

      // Fail fast when the prompt reports it was not displayed or was skipped.
      window.google.accounts.id.prompt((notification) => {
        if (notification.isNotDisplayed() || notification.isSkippedMoment()) {
          window.clearTimeout(timeoutId);
          finish(false);
        }
      });
    } catch {
      window.clearTimeout(timeoutId);
      finish(false);
    }
  });

  // Publish the promise for concurrent callers and clear it once settled.
  refreshPromiseRef.current = refreshPromise;
  try {
    return await refreshPromise;
  } finally {
    refreshPromiseRef.current = null;
  }
}
|
| 356 |
+
|
| 357 |
useEffect(() => {
|
| 358 |
const raw = window.localStorage.getItem(AUTH_STORAGE_KEY);
|
| 359 |
if (!raw) {
|
|
|
|
| 375 |
}
|
| 376 |
}, []);
|
| 377 |
|
| 378 |
+
useEffect(() => {
|
| 379 |
+
setUnauthorizedHandler(async () => {
|
| 380 |
+
const refreshed = await refreshGoogleSessionSilently();
|
| 381 |
+
if (!refreshed) {
|
| 382 |
+
setError("Sessione Google scaduta. Tocca Accedi per rinnovarla.");
|
| 383 |
+
}
|
| 384 |
+
return refreshed;
|
| 385 |
+
});
|
| 386 |
+
|
| 387 |
+
return () => setUnauthorizedHandler(null);
|
| 388 |
+
}, [auth?.idToken]);
|
| 389 |
+
|
| 390 |
+
useEffect(() => {
|
| 391 |
+
const exp = Number(auth?.expiresAt || 0);
|
| 392 |
+
if (!exp) {
|
| 393 |
+
return undefined;
|
| 394 |
+
}
|
| 395 |
+
|
| 396 |
+
const refreshBeforeMs = 2 * 60 * 1000;
|
| 397 |
+
const delayMs = (exp * 1000) - Date.now() - refreshBeforeMs;
|
| 398 |
+
if (delayMs <= 0) {
|
| 399 |
+
return undefined;
|
| 400 |
+
}
|
| 401 |
+
|
| 402 |
+
const timerId = window.setTimeout(() => {
|
| 403 |
+
refreshGoogleSessionSilently();
|
| 404 |
+
}, delayMs);
|
| 405 |
+
|
| 406 |
+
return () => window.clearTimeout(timerId);
|
| 407 |
+
}, [auth?.expiresAt, auth?.idToken]);
|
| 408 |
+
|
| 409 |
useEffect(() => {
|
| 410 |
setExpandedProfileKey("");
|
| 411 |
}, [plantProfile]);
|
|
|
|
| 675 |
try {
|
| 676 |
const data = await saveMyPlant(selectedSpecies, trimmed);
|
| 677 |
const saved = data.saved || null;
|
| 678 |
+
let photoUploaded = false;
|
| 679 |
+
if (saved?.id && file) {
|
| 680 |
+
try {
|
| 681 |
+
await uploadMyPlantPhoto(saved.id, file);
|
| 682 |
+
photoUploaded = true;
|
| 683 |
+
} catch {
|
| 684 |
+
photoUploaded = false;
|
| 685 |
+
}
|
| 686 |
+
}
|
| 687 |
+
|
| 688 |
+
if (saved) {
|
| 689 |
+
setSaveStatus(
|
| 690 |
+
photoUploaded
|
| 691 |
+
? `Salvata: ${saved.user_given_name} (foto associata)`
|
| 692 |
+
: `Salvata: ${saved.user_given_name}`
|
| 693 |
+
);
|
| 694 |
+
} else {
|
| 695 |
+
setSaveStatus("Pianta salvata.");
|
| 696 |
+
}
|
| 697 |
setUserPlantName("");
|
| 698 |
await loadMyPlants();
|
| 699 |
} catch (err) {
|
pwa-app/src/api.js
CHANGED
|
@@ -10,6 +10,7 @@ function getApiBase() {
|
|
| 10 |
|
| 11 |
const API_BASE = getApiBase();
|
| 12 |
let authToken = "";
|
|
|
|
| 13 |
|
| 14 |
function buildUrl(path) {
|
| 15 |
return `${API_BASE}${path}`;
|
|
@@ -19,16 +20,37 @@ export function setAuthToken(token) {
|
|
| 19 |
authToken = token || "";
|
| 20 |
}
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
async function apiFetch(path, options = {}) {
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
}
|
| 27 |
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
}
|
| 33 |
|
| 34 |
async function parseResponse(response) {
|
|
|
|
| 10 |
|
| 11 |
const API_BASE = getApiBase();
|
| 12 |
let authToken = "";
|
| 13 |
+
let unauthorizedHandler = null;
|
| 14 |
|
| 15 |
function buildUrl(path) {
|
| 16 |
return `${API_BASE}${path}`;
|
|
|
|
| 20 |
authToken = token || "";
|
| 21 |
}
|
| 22 |
|
| 23 |
+
/**
 * Register the callback that apiFetch invokes when a request comes back 401.
 * Anything that is not a function clears the registration.
 */
export function setUnauthorizedHandler(handler) {
  if (typeof handler === "function") {
    unauthorizedHandler = handler;
    return;
  }
  unauthorizedHandler = null;
}
|
| 26 |
+
|
| 27 |
/**
 * Issue a fetch against the API base URL, attaching `Authorization: Bearer …`
 * when a token has been set via setAuthToken.
 *
 * If the response is a 401, a handler is registered, and the caller did not
 * pass `__skipAuthRefresh`, the handler is awaited; when it resolves truthy
 * the request is sent once more (picking up whatever token is current at
 * that point). In every other case the first response is returned as-is.
 */
async function apiFetch(path, options = {}) {
  // Headers are rebuilt on every attempt so a retry sees the current token.
  const send = async () => {
    const headers = new Headers(options.headers || {});
    if (authToken) {
      headers.set("Authorization", `Bearer ${authToken}`);
    }
    return fetch(buildUrl(path), { ...options, headers });
  };

  const response = await send();

  const mayRefresh =
    response.status === 401
    && !options.__skipAuthRefresh
    && unauthorizedHandler;
  if (!mayRefresh) {
    return response;
  }

  const refreshed = await unauthorizedHandler();
  return refreshed ? send() : response;
}
|
| 55 |
|
| 56 |
async function parseResponse(response) {
|
sync_leafsnap_aliases.py
ADDED
|
@@ -0,0 +1,181 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Populate leafsnap_aliases table in plants.db and update missing_species.csv.
|
| 4 |
+
|
| 5 |
+
For each LeafSnap species not in the DB:
|
| 6 |
+
- If a DB species with the same genus has an epithet edit-distance <= EDIT_THRESHOLD
|
| 7 |
+
(the --threshold value; the genus must match exactly), it is saved as an alias.
|
| 8 |
+
- Otherwise it is appended to missing_species.csv for manual review.
|
| 9 |
+
|
| 10 |
+
Run:
|
| 11 |
+
python sync_leafsnap_aliases.py [--db data/plants.db] [--cache data/leafsnap_cache.pt]
|
| 12 |
+
[--missing missing_species.csv] [--threshold 2] [--dry-run]
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import argparse
|
| 16 |
+
import csv
|
| 17 |
+
import os
|
| 18 |
+
import sqlite3
|
| 19 |
+
from collections import defaultdict
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
def _edit_distance(a: str, b: str) -> int:
|
| 24 |
+
"""Simple Levenshtein distance."""
|
| 25 |
+
a, b = a.lower(), b.lower()
|
| 26 |
+
if a == b:
|
| 27 |
+
return 0
|
| 28 |
+
m, n = len(a), len(b)
|
| 29 |
+
dp = list(range(n + 1))
|
| 30 |
+
for i in range(1, m + 1):
|
| 31 |
+
prev = dp[:]
|
| 32 |
+
dp[0] = i
|
| 33 |
+
for j in range(1, n + 1):
|
| 34 |
+
cost = 0 if a[i - 1] == b[j - 1] else 1
|
| 35 |
+
dp[j] = min(dp[j] + 1, dp[j - 1] + 1, prev[j - 1] + cost)
|
| 36 |
+
return dp[n]
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def find_best_alias(
|
| 40 |
+
leafsnap_label: str,
|
| 41 |
+
db_species: set[str],
|
| 42 |
+
db_by_genus: dict[str, list[str]],
|
| 43 |
+
threshold: int,
|
| 44 |
+
) -> str | None:
|
| 45 |
+
"""Return the best-matching DB species within *threshold* edit distance, or None."""
|
| 46 |
+
genus = leafsnap_label.split()[0]
|
| 47 |
+
epithet = " ".join(leafsnap_label.split()[1:])
|
| 48 |
+
|
| 49 |
+
candidates = db_by_genus.get(genus, [])
|
| 50 |
+
if not candidates:
|
| 51 |
+
return None
|
| 52 |
+
|
| 53 |
+
best_label: str | None = None
|
| 54 |
+
best_dist = threshold + 1
|
| 55 |
+
|
| 56 |
+
for db_sp in candidates:
|
| 57 |
+
db_epithet = " ".join(db_sp.split()[1:])
|
| 58 |
+
# Compare epithet only (genus already matches)
|
| 59 |
+
d = _edit_distance(epithet, db_epithet)
|
| 60 |
+
if d < best_dist:
|
| 61 |
+
best_dist = d
|
| 62 |
+
best_label = db_sp
|
| 63 |
+
|
| 64 |
+
return best_label if best_dist <= threshold else None
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def main() -> None:
    """Resolve LeafSnap labels against plants.db and record the outcome.

    Labels already present in the ``plants`` table or in ``leafsnap_aliases``
    are left alone. Every other label is either stored as an alias of a close
    DB species (hard-coded override first, then fuzzy epithet match) or
    appended to the missing-species CSV for manual review.

    With ``--dry-run`` the classification is printed and nothing is written:
    no table is created, no alias is inserted, and the CSV is not touched.
    """
    parser = argparse.ArgumentParser(description="Sync LeafSnap aliases into plants.db")
    parser.add_argument("--db", default=os.getenv("PLANTS_SQLITE_PATH", "data/plants.db"))
    parser.add_argument("--cache", default=os.getenv("LEAFSNAP_CACHE_PATH", "data/leafsnap_cache.pt"))
    parser.add_argument("--missing", default="missing_species.csv")
    parser.add_argument("--threshold", type=int, default=2,
                        help="Max Levenshtein distance on epithet to consider an alias (default: 2)")
    parser.add_argument("--dry-run", action="store_true",
                        help="Print results without writing to DB or CSV")
    args = parser.parse_args()

    # Imported lazily so `--help` and argument errors do not pay for torch.
    import torch

    print(f"Loading LeafSnap cache from {args.cache} ...")
    # SECURITY NOTE(review): weights_only=False unpickles arbitrary objects.
    # Only point --cache at files you produced yourself; consider switching to
    # weights_only=True if the cache holds nothing but plain containers/tensors.
    ls_data = torch.load(args.cache, map_location="cpu", weights_only=False)
    leafsnap_species = set(ls_data["labels"])

    # Hardcoded typo overrides (edit distance > threshold but unambiguously same species)
    TYPO_OVERRIDES: dict[str, str] = {
        "Aesculus hippocastamon": "Aesculus hippocastanum",
    }

    to_alias: list[tuple[str, str]] = []  # (leafsnap_label, db_species_name)
    to_missing: list[str] = []

    print(f"Loading DB species from {args.db} ...")
    conn = sqlite3.connect(args.db)
    try:
        if args.dry_run:
            # A dry run must not modify the DB, so only look for the table.
            has_alias_table = conn.execute(
                "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = 'leafsnap_aliases'"
            ).fetchone() is not None
        else:
            # Ensure table exists
            conn.execute(
                """
                CREATE TABLE IF NOT EXISTS leafsnap_aliases (
                    leafsnap_label TEXT PRIMARY KEY,
                    db_species_name TEXT NOT NULL
                )
                """
            )
            conn.commit()
            has_alias_table = True

        # NULL or blank names cannot be grouped by genus; ignore them.
        db_species = {
            row[0]
            for row in conn.execute("SELECT species_name FROM plants")
            if row[0] and row[0].strip()
        }
        existing_aliases = (
            dict(conn.execute("SELECT leafsnap_label, db_species_name FROM leafsnap_aliases"))
            if has_alias_table
            else {}
        )

        db_by_genus: dict[str, list[str]] = defaultdict(list)
        for s in db_species:
            db_by_genus[s.split()[0]].append(s)

        # Classify each LeafSnap species
        for sp in sorted(leafsnap_species):
            if not sp.strip():
                continue  # blank label: nothing to match or review
            if sp in db_species:
                continue  # already in DB, no alias needed
            if sp in existing_aliases:
                print(f"  [skip] {sp} -> {existing_aliases[sp]} (already in aliases)")
                continue

            # An override only applies when its target exists in the DB;
            # otherwise fall through to the fuzzy match.
            target = TYPO_OVERRIDES.get(sp)
            if target is not None and target in db_species:
                to_alias.append((sp, target))
                continue

            match = find_best_alias(sp, db_species, db_by_genus, args.threshold)
            if match:
                to_alias.append((sp, match))
            else:
                to_missing.append(sp)

        print(f"\nNew aliases found: {len(to_alias)}")
        for ls_lbl, db_lbl in to_alias:
            print(f"  {ls_lbl} -> {db_lbl}")

        print(f"\nUnresolvable (-> missing_species.csv): {len(to_missing)}")
        for sp in to_missing:
            print(f"  {sp}")

        if args.dry_run:
            print("\n[dry-run] No changes written.")
            return

        # Write aliases to DB
        if to_alias:
            conn.executemany(
                "INSERT OR REPLACE INTO leafsnap_aliases (leafsnap_label, db_species_name) VALUES (?, ?)",
                to_alias,
            )
            conn.commit()
            print(f"\nSaved {len(to_alias)} aliases to leafsnap_aliases table.")
    finally:
        # Release the connection even when a query or the matcher raises.
        conn.close()

    # Append to missing_species.csv (avoid duplicates)
    missing_path = Path(args.missing)
    existing_missing: set[str] = set()
    if missing_path.exists():
        with open(missing_path, newline="", encoding="utf-8") as f:
            for row in csv.DictReader(f):
                # Tolerate files without the expected column or with short rows.
                name = row.get("species_name")
                if name:
                    existing_missing.add(name)

    new_missing = [sp for sp in to_missing if sp not in existing_missing]
    if new_missing:
        write_header = not missing_path.exists() or missing_path.stat().st_size == 0
        with open(missing_path, "a", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=["species_name"])
            if write_header:
                writer.writeheader()
            writer.writerows({"species_name": sp} for sp in new_missing)
        print(f"Appended {len(new_missing)} species to {missing_path}.")
    else:
        print("No new missing species to append.")
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
if __name__ == "__main__":
|
| 181 |
+
main()
|