"""
matcher.py — Cosine similarity used to decide whether two sightings are of the
same animal.

Default threshold: 0.80 (adjustable via the MATCH_THRESHOLD env var).
"""
import os
from typing import Iterator, Optional, Tuple

import numpy as np

# Read once at import time; a non-numeric MATCH_THRESHOLD raises ValueError here.
THRESHOLD = float(os.environ.get("MATCH_THRESHOLD", "0.80"))


class AnimalMatcher:
    """Compare an embedding against candidate animals using cosine similarity."""

    def find_match(
        self,
        new_embedding: list,
        candidates: list[dict],
        *,
        threshold: Optional[float] = None,
    ) -> Optional[Tuple[int, float]]:
        """
        Compare new_embedding with the embeddings of candidates.

        Returns (animal_id, score) for the best match at or above the
        threshold, or None when no candidate qualifies.

        candidates: list of dicts with keys 'id' and 'embedding' (list[float]).
            Candidates whose embedding is missing or empty are skipped.
        threshold: optional per-call override of the module-level THRESHOLD.
        """
        if not candidates or not new_embedding:
            return None

        limit = THRESHOLD if threshold is None else threshold

        best_animal: Optional[dict] = None
        # Start below any possible cosine so that zero or negative scores can
        # still be selected when the threshold is configured at or below 0.
        best_score = float("-inf")
        for animal, score in self._scored_candidates(new_embedding, candidates):
            # Strict comparison: on ties the earliest candidate wins.
            if score > best_score:
                best_animal, best_score = animal, score

        # Require an actual candidate; otherwise a non-positive threshold would
        # report a "match" that has no animal id.
        if best_animal is not None and best_score >= limit:
            return best_animal["id"], best_score
        return None

    def find_top_matches(
        self,
        new_embedding: list,
        candidates: list[dict],
        top_n: int = 3,
    ) -> list[dict]:
        """
        Return the top_n most similar animals (no minimum threshold),
        ordered by descending score.

        Each item: {'id': int, 'score': float}. A non-positive top_n yields
        an empty list.
        """
        if not candidates or not new_embedding:
            return []
        # A negative top_n would otherwise slice as scores[:-k] and silently
        # drop only the last k entries.
        if top_n <= 0:
            return []

        scores = [
            {"id": animal["id"], "score": score}
            for animal, score in self._scored_candidates(new_embedding, candidates)
        ]
        # list.sort is stable, so equal scores keep their candidate order.
        scores.sort(key=lambda item: item["score"], reverse=True)
        return scores[:top_n]

    def _scored_candidates(
        self,
        new_embedding: list,
        candidates: list[dict],
    ) -> Iterator[Tuple[dict, float]]:
        """Yield (candidate, cosine score) for every candidate that has an embedding."""
        new_vec = np.array(new_embedding, dtype=np.float32)
        for animal in candidates:
            emb = animal.get("embedding")
            if not emb:
                continue
            yield animal, self._cosine(new_vec, np.array(emb, dtype=np.float32))

    @staticmethod
    def _cosine(a: np.ndarray, b: np.ndarray) -> float:
        """Return the cosine similarity of a and b, or 0.0 if either has zero norm."""
        norm_a = np.linalg.norm(a)
        norm_b = np.linalg.norm(b)
        # Guard against division by zero for all-zero vectors.
        if norm_a == 0.0 or norm_b == 0.0:
            return 0.0
        return float(np.dot(a, b) / (norm_a * norm_b))