# pawmap/core/ai.py
# sara.mesquita
# feat: improved breed prompt + dynamic breed dropdown by species
# 45b7139
"""
ai.py — Vision via the HuggingFace InferenceClient or NVIDIA NIM.
Environment variables:
HF_TOKEN — HuggingFace token (uses HF Serverless Inference)
NVIDIA_API_KEY — NVIDIA NIM key (free tier at build.nvidia.com); preferred over HF_TOKEN when both are set
HF model: meta-llama/Llama-3.2-11B-Vision-Instruct (available on HF Serverless)
NIM model: nvidia/nemotron-3-nano-omni-30b-a3b-reasoning (Nemotron Omni via NVIDIA)
"""
import base64
import io
import json
import logging
import os
import re
import numpy as np
# Module-level logger shared by everything in this file.
log = logging.getLogger(__name__)

# Default model identifiers for each backend.
_HF_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct"
_NIM_MODEL = "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning"  # Nemotron Omni VLM, NVIDIA NIM

# Closed vocabularies the model must choose breed_estimate from.
_DOG_BREEDS = (
    "SRD", "Labrador Retriever", "Golden Retriever", "Pitbull", "Poodle",
    "Shih Tzu", "Rottweiler", "German Shepherd", "Bulldog", "Dachshund",
    "Chihuahua", "Siberian Husky", "Border Collie", "Beagle", "Boxer",
    "Maltese", "Chow Chow", "Akita", "Dalmatian", "Doberman",
)
_CAT_BREEDS = (
    "Domestic Shorthair", "Domestic Longhair", "Siamese", "Persian",
    "Maine Coon", "Bengal", "British Shorthair", "Ragdoll", "Scottish Fold",
    "Turkish Angora", "Sphynx", "Abyssinian",
)

# Instruction sent alongside the image: first a dog/cat gate, then the breed
# vocabularies, then the JSON shape the reply must follow.
PROMPT = "".join((
    'Examine this image carefully.\n',
    'FIRST: Is there a dog or cat clearly visible?\n',
    'If NO dog or cat is present, respond with exactly: {"is_animal": false}\n\n',
    'If YES, respond with ONLY valid JSON (no markdown, no explanation).\n\n',
    'For breed_estimate, use visual cues (coat type, ear shape, body build, snout, tail) to pick the SINGLE best match.\n\n',
    'DOG breeds to choose from (use exact spelling):\n',
    ", ".join(_DOG_BREEDS),
    '\n\n',
    'CAT breeds to choose from (use exact spelling):\n',
    ", ".join(_CAT_BREEDS),
    '\n\n',
    'JSON format:\n',
    '{"is_animal": true,',
    ' "species": "dog or cat",',
    ' "breed_estimate": "exact name from the list above — SRD/Domestic Shorthair only if truly unidentifiable",',
    ' "size": "small, medium or large",',
    ' "primary_color": "main color: caramel, black, white, gray, brown, golden, orange, tabby, mixed",',
    ' "secondary_colors": ["other visible colors, or empty list"],',
    ' "distinctive_marks": ["notable features: e.g. white chest patch, red collar, scar, missing ear — or empty list"],',
    ' "condition": "healthy, thin or injured",',
    ' "description_text": "one concise English sentence describing this specific animal for identity matching"}',
))
class AnimalAI:
    """Describe dog/cat photos with a hosted vision model and embed the result.

    The vision backend is picked once, at construction time, from environment
    variables: ``NVIDIA_API_KEY`` selects NVIDIA NIM (through the ``openai``
    client) and ``HF_TOKEN`` selects the HuggingFace ``InferenceClient``.
    NIM is preferred when both are set; HF is used when NIM cannot be set up.
    When no backend is available ``client`` stays ``None`` and
    :meth:`analyze_image` returns the canned fallback description.
    """

    # Length of the random vectors returned when no real embedding can be
    # computed; matches the 384-dimensional output of all-MiniLM-L6-v2.
    _EMBEDDING_DIM = 384

    def __init__(self):
        """Select the vision backend from the environment and load the embedder.

        Environment variables read: ``NVIDIA_API_KEY``, ``NVIDIA_MODEL``
        (optional NIM model override), ``HF_TOKEN``, ``HF_MODEL`` (optional
        HF model override).
        """
        self.mode = None  # "hf" | "nim"
        self.model = None
        self.client = None  # OpenAI (NIM) or InferenceClient (HF)

        hf_token = os.environ.get("HF_TOKEN", "")
        nvidia_key = os.environ.get("NVIDIA_API_KEY", "")

        if not (nvidia_key or hf_token):
            log.warning("Sem chave de API — IA desabilitada. Configure HF_TOKEN nos Secrets.")
        else:
            if nvidia_key:
                self._init_nim(nvidia_key)
            # Fall back to HF when NIM was not requested or could not be set up.
            if self.client is None and hf_token:
                self._init_hf(hf_token)

        self.embedder = None
        try:
            from sentence_transformers import SentenceTransformer
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
            log.info("sentence-transformers: all-MiniLM-L6-v2")
        except Exception as e:
            # Best effort: without the embedder get_embedding() returns random vectors.
            log.warning("sentence-transformers nao carregou: %s", e)

    def _init_nim(self, api_key: str) -> None:
        """Configure the NVIDIA NIM client; leave ``client`` unset if ``openai`` is missing."""
        try:
            from openai import OpenAI
        except ImportError:
            log.warning("openai nao instalado")
            return
        self.mode = "nim"
        self.model = os.environ.get("NVIDIA_MODEL", _NIM_MODEL)
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key,
        )
        log.info("AI: Nemotron Omni via NVIDIA NIM (%s)", self.model)

    def _init_hf(self, token: str) -> None:
        """Configure the HuggingFace client; leave ``client`` unset if ``huggingface_hub`` is missing."""
        try:
            from huggingface_hub import InferenceClient
        except ImportError:
            log.warning("huggingface_hub nao instalado")
            return
        self.mode = "hf"
        # The HF override has its own variable; NVIDIA_MODEL names a NIM model.
        self.model = os.environ.get("HF_MODEL", _HF_MODEL)
        self.client = InferenceClient(model=self.model, token=token)
        log.info("AI: %s via HF InferenceClient", self.model)

    def analyze_image(self, image) -> dict:
        """Describe the animal in *image* using the configured vision model.

        Args:
            image: Image object exposing ``copy``, ``thumbnail``, ``mode``,
                ``convert`` and ``save`` (the PIL ``Image`` interface).

        Returns:
            * The parsed model answer with ``is_animal=True`` and
              ``_ai_success=True`` when the model described an animal.
            * ``{"is_animal": False, "_ai_success": True}`` when the model
              explicitly reported that no dog or cat is present.
            * The canned fallback (``_ai_success=False``) when no backend is
              configured, the request fails, or the reply contains no JSON.
        """
        if self.client is None:
            return self._fallback()
        try:
            raw = self._query_model(self._to_b64(image))
        except Exception as e:
            log.error("Erro na API de IA: %s", e)
            return self._fallback()

        log.info("AI resposta: %s", raw[:200])
        result = self._extract_json(raw)
        if result is None:
            # Keep _ai_success=False so get_embedding() does not embed the
            # canned description and create false matches between animals.
            log.warning("JSON nao parseado — fallback")
            return self._fallback()

        # Explicit rejection: the model saw no animal.
        if result.get("is_animal") is False:
            log.info("IA: nenhum animal detectado na imagem.")
            return {"is_animal": False, "_ai_success": True}

        result["is_animal"] = True
        result["_ai_success"] = True
        return result

    def _query_model(self, img_b64: str) -> str:
        """Send the prompt plus the base64 JPEG to the active backend and return the raw reply text."""
        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + img_b64}},
                {"type": "text", "text": PROMPT},
            ],
        }]
        if self.mode == "hf":
            # HuggingFace InferenceClient — native multimodal support.
            resp = self.client.chat_completion(
                messages=messages,
                max_tokens=400,
                temperature=0.1,
            )
            return resp.choices[0].message.content or ""

        # NVIDIA NIM — Nemotron Omni (reasoning model).
        resp = self.client.chat.completions.create(
            model=self.model,
            messages=messages,
            max_tokens=1024,
            temperature=0.6,
            top_p=0.95,
            extra_body={
                "chat_template_kwargs": {"enable_thinking": True},
                "reasoning_budget": 512,
            },
        )
        msg = resp.choices[0].message
        # Nemotron Omni: the answer may be in content or in reasoning_content.
        return (msg.content or "") or (getattr(msg, "reasoning_content", "") or "")

    def get_embedding(self, description: dict) -> list:
        """Return a unit-length embedding for *description*.

        A random unit vector is returned when the description came from the
        fallback path (``_ai_success`` false) or when the sentence embedder is
        unavailable, so that placeholder descriptions never match each other.
        Otherwise ``description_text`` (or a sentence assembled from the other
        fields) is encoded with the sentence embedder.
        """
        if not description.get("_ai_success", True):
            log.info("Fallback IA — embedding aleatorio")
            return self._random_embedding()
        if self.embedder is None:
            return self._random_embedding()
        text = description.get("description_text") or self._desc_text(description)
        return self.embedder.encode(text, normalize_embeddings=True).tolist()

    @staticmethod
    def _random_embedding() -> list:
        """Return a random float32 unit vector of length ``_EMBEDDING_DIM`` as a list."""
        v = np.random.randn(AnimalAI._EMBEDDING_DIM).astype(np.float32)
        v /= np.linalg.norm(v)
        return v.tolist()

    @staticmethod
    def _to_b64(image) -> str:
        """Return *image* as a base64 JPEG string, downscaled to fit 800x800.

        The caller's image is not modified; a copy is resized and encoded.
        """
        img = image.copy()
        img.thumbnail((800, 800))
        # JPEG cannot store alpha or palette modes (RGBA, LA, P, ...); saving
        # them raises, so normalise to RGB first.
        if img.mode not in ("RGB", "L"):
            img = img.convert("RGB")
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=80)
        return base64.b64encode(buf.getvalue()).decode()

    @staticmethod
    def _extract_json(raw: str):
        """Return the last JSON object embedded in *raw*, or ``None`` if there is none.

        Replies may wrap the answer in prose or reasoning text that itself
        contains braces, so every ``{`` is tried as a potential start and the
        last object that decodes cleanly is kept (the final answer normally
        comes last).
        """
        if not raw:
            return None
        decoder = json.JSONDecoder()
        found = None
        pos = raw.find("{")
        while pos != -1:
            try:
                obj, end = decoder.raw_decode(raw, pos)
            except json.JSONDecodeError:
                pos = raw.find("{", pos + 1)
                continue
            if isinstance(obj, dict):
                found = obj
            # Resume after the decoded object so nested objects are not revisited.
            pos = raw.find("{", end)
        return found

    @staticmethod
    def _parse(raw: str) -> dict:
        """Parse the model reply into a dict, or return the fallback description.

        The fallback carries ``_ai_success=False`` so callers can tell a parse
        failure from a real answer.
        """
        parsed = AnimalAI._extract_json(raw)
        if parsed is not None:
            return parsed
        log.warning("JSON nao parseado — fallback")
        return AnimalAI._fallback()

    @staticmethod
    def _desc_text(d: dict) -> str:
        """Build a short sentence from size, colour, species, breed and marks.

        Missing or empty fields are skipped. ``distinctive_marks`` may be a
        list or a single string; non-string entries are stringified.
        """
        parts = [d.get(key) for key in ("size", "primary_color", "species", "breed_estimate")]
        marks = d.get("distinctive_marks") or []
        if isinstance(marks, str):
            # A bare string would otherwise be joined character by character.
            marks = [marks]
        marks = [str(m) for m in marks if m]
        if marks:
            parts.append("with " + ", ".join(marks))
        return " ".join(str(p) for p in parts if p)

    @staticmethod
    def _fallback() -> dict:
        """Return the placeholder description used when the AI could not be used.

        ``_ai_success`` is ``False`` so downstream code can tell it apart from
        a real analysis.
        """
        # NOTE(review): these values are Portuguese while PROMPT asks the model
        # for English ones ("medium", "caramel", "healthy"); left unchanged in
        # case callers depend on them — confirm and align.
        return {
            "is_animal": True,
            "_ai_success": False,
            "species": "dog",
            "breed_estimate": "SRD",
            "size": "médio",
            "primary_color": "caramelo",
            "secondary_colors": [],
            "distinctive_marks": [],
            "condition": "saudável",
            "description_text": "stray dog of unknown breed",
        }