Spaces:
Running
Running
| """ | |
| ai.py — Visao via HuggingFace InferenceClient ou NVIDIA NIM. | |
| Variaveis de ambiente: | |
| HF_TOKEN — token HuggingFace (usa HF Serverless Inference) | |
| NVIDIA_API_KEY — chave NVIDIA NIM (free tier em build.nvidia.com) | |
| Modelo HF: meta-llama/Llama-3.2-11B-Vision-Instruct (disponivel no HF Serverless) | |
| Modelo NIM: nvidia/nemotron-3-nano-omni-30b-a3b-reasoning (Nemotron Omni via NVIDIA) | |
| """ | |
| import base64 | |
| import io | |
| import json | |
| import logging | |
| import os | |
| import re | |
| import numpy as np | |
# Module-wide logger.
log = logging.getLogger(__name__)

# Default model identifiers for each backend.
_HF_MODEL = "meta-llama/Llama-3.2-11B-Vision-Instruct"
_NIM_MODEL = "nvidia/nemotron-3-nano-omni-30b-a3b-reasoning"  # Nemotron Omni VLM, NVIDIA NIM

# Instruction sent together with the image. It is assembled from blank-line
# separated sections; the breed lists and the JSON template are joined from
# their individual entries so each entry sits on its own line here.
PROMPT = "\n\n".join((
    "Examine this image carefully.\n"
    "FIRST: Is there a dog or cat clearly visible?\n"
    'If NO dog or cat is present, respond with exactly: {"is_animal": false}',
    "If YES, respond with ONLY valid JSON (no markdown, no explanation).",
    "For breed_estimate, use visual cues (coat type, ear shape, body build, snout, tail)"
    " to pick the SINGLE best match.",
    "DOG breeds to choose from (use exact spelling):\n" + ", ".join((
        "SRD",
        "Labrador Retriever",
        "Golden Retriever",
        "Pitbull",
        "Poodle",
        "Shih Tzu",
        "Rottweiler",
        "German Shepherd",
        "Bulldog",
        "Dachshund",
        "Chihuahua",
        "Siberian Husky",
        "Border Collie",
        "Beagle",
        "Boxer",
        "Maltese",
        "Chow Chow",
        "Akita",
        "Dalmatian",
        "Doberman",
    )),
    "CAT breeds to choose from (use exact spelling):\n" + ", ".join((
        "Domestic Shorthair",
        "Domestic Longhair",
        "Siamese",
        "Persian",
        "Maine Coon",
        "Bengal",
        "British Shorthair",
        "Ragdoll",
        "Scottish Fold",
        "Turkish Angora",
        "Sphynx",
        "Abyssinian",
    )),
    "JSON format:\n{" + ", ".join((
        '"is_animal": true',
        '"species": "dog or cat"',
        '"breed_estimate": "exact name from the list above — SRD/Domestic Shorthair only if truly unidentifiable"',
        '"size": "small, medium or large"',
        '"primary_color": "main color: caramel, black, white, gray, brown, golden, orange, tabby, mixed"',
        '"secondary_colors": ["other visible colors, or empty list"]',
        '"distinctive_marks": ["notable features: e.g. white chest patch, red collar, scar, missing ear — or empty list"]',
        '"condition": "healthy, thin or injured"',
        '"description_text": "one concise English sentence describing this specific animal for identity matching"',
    )) + "}",
))
class AnimalAI:
    """Describe dog/cat photos with a vision LLM and embed the descriptions.

    Backend selection happens once, in ``__init__``, from environment
    variables: NVIDIA NIM is preferred when ``NVIDIA_API_KEY`` is set,
    otherwise the HuggingFace ``InferenceClient`` is used when ``HF_TOKEN``
    is set. With no usable backend every analysis returns a generic
    placeholder flagged with ``_ai_success=False``.

    Attributes:
        mode: ``"nim"``, ``"hf"`` or ``None`` when no backend is available.
        model: Model identifier sent to the backend.
        client: The backend client object, or ``None``.
        embedder: ``SentenceTransformer`` instance, or ``None`` when it
            could not be loaded.
    """

    # Length of the random vectors returned when no real embedding is made.
    _EMBEDDING_DIM = 384

    def __init__(self):
        """Pick a vision backend from the environment and load the embedder."""
        self.mode = None    # "hf" | "nim"
        self.model = None
        self.client = None  # OpenAI (NIM) or InferenceClient (HF)
        self.embedder = None

        hf_token = os.environ.get("HF_TOKEN", "")
        nvidia_key = os.environ.get("NVIDIA_API_KEY", "")

        if nvidia_key:
            self._init_nim(nvidia_key)
        # Also reached when the NIM client library is missing, so a configured
        # HF token is still honoured instead of leaving the AI disabled.
        if self.client is None and hf_token:
            self._init_hf(hf_token)
        if not nvidia_key and not hf_token:
            log.warning("Sem chave de API — IA desabilitada. Configure HF_TOKEN nos Secrets.")

        # Best effort: the embedder is optional, get_embedding() copes without it.
        try:
            from sentence_transformers import SentenceTransformer
            self.embedder = SentenceTransformer("all-MiniLM-L6-v2")
            log.info("sentence-transformers: all-MiniLM-L6-v2")
        except Exception as e:
            log.warning("sentence-transformers nao carregou: %s", e)

    def _init_nim(self, api_key: str) -> None:
        """Configure the NVIDIA NIM backend; leaves ``client`` unset if ``openai`` is missing."""
        try:
            from openai import OpenAI
        except ImportError:
            log.warning("openai nao instalado")
            return
        self.mode = "nim"
        self.model = os.environ.get("NVIDIA_MODEL", _NIM_MODEL)
        self.client = OpenAI(
            base_url="https://integrate.api.nvidia.com/v1",
            api_key=api_key,
        )
        log.info("AI: Nemotron Omni via NVIDIA NIM (%s)", self.model)

    def _init_hf(self, token: str) -> None:
        """Configure the HuggingFace backend; leaves ``client`` unset if the library is missing."""
        try:
            from huggingface_hub import InferenceClient
        except ImportError:
            log.warning("huggingface_hub nao instalado")
            return
        self.mode = "hf"
        # HF_MODEL is the intended override. NVIDIA_MODEL is still consulted
        # because earlier versions read it here, so existing setups keep working.
        self.model = (
            os.environ.get("HF_MODEL")
            or os.environ.get("NVIDIA_MODEL")
            or _HF_MODEL
        )
        self.client = InferenceClient(model=self.model, token=token)
        log.info("AI: %s via HF InferenceClient", self.model)

    def analyze_image(self, image) -> dict:
        """Describe the animal in ``image``.

        Args:
            image: Image object offering ``copy``/``convert``/``thumbnail``/
                ``save`` and a ``mode`` attribute (presumably a PIL image —
                confirm against the caller).

        Returns:
            The parsed model answer with ``is_animal=True`` and
            ``_ai_success=True``; ``{"is_animal": False, "_ai_success": True}``
            when the model saw no dog or cat; or the generic placeholder with
            ``_ai_success=False`` when no backend is configured, the request
            failed, or the answer could not be parsed.
        """
        if self.client is None:
            return self._fallback()
        try:
            messages = self._build_messages(self._to_b64(image))
            if self.mode == "hf":
                # HuggingFace InferenceClient: native multimodal chat support.
                resp = self.client.chat_completion(
                    messages=messages,
                    max_tokens=400,
                    temperature=0.1,
                )
                raw = resp.choices[0].message.content or ""
            else:
                # NVIDIA NIM: Nemotron Omni (reasoning model).
                resp = self.client.chat.completions.create(
                    model=self.model,
                    messages=messages,
                    max_tokens=1024,
                    temperature=0.6,
                    top_p=0.95,
                    extra_body={
                        "chat_template_kwargs": {"enable_thinking": True},
                        "reasoning_budget": 512,
                    },
                )
                msg = resp.choices[0].message
                # The answer may arrive in content or in reasoning_content.
                raw = (msg.content or "") or (getattr(msg, "reasoning_content", "") or "")
            log.info("AI resposta: %s", raw[:200])
            return self._finalize(self._parse(raw))
        except Exception as e:
            # Boundary with external services: never let a backend failure
            # propagate to the caller, but keep the traceback in the log.
            log.exception("Erro na API de IA: %s", e)
            return self._fallback()

    def get_embedding(self, description: dict) -> list:
        """Return an embedding vector for ``description``.

        A random unit vector is returned instead when the description is a
        placeholder (``_ai_success`` is falsy), when the embedder is not
        available, or when there is no text to embed, so that unrelated
        records do not end up looking identical.
        """
        if not description.get("_ai_success", True):
            log.info("Fallback IA — embedding aleatorio")
            return self._random_embedding()
        if self.embedder is None:
            return self._random_embedding()
        text = description.get("description_text") or self._desc_text(description)
        if not text:
            # Embedding "" would make every description-less record match.
            return self._random_embedding()
        return self.embedder.encode(text, normalize_embeddings=True).tolist()

    @staticmethod
    def _build_messages(img_b64: str) -> list:
        """Build the single-turn chat payload: the image followed by PROMPT."""
        return [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + img_b64}},
                {"type": "text", "text": PROMPT},
            ],
        }]

    @staticmethod
    def _finalize(result: dict) -> dict:
        """Stamp the success flags on a parsed answer.

        A placeholder coming from a failed parse keeps ``_ai_success=False``;
        an explicit "no animal" answer is reduced to the two flag keys.
        """
        if result.get("_ai_success") is False:
            return result
        flag = result.get("is_animal")
        if isinstance(flag, str):
            # Some models quote booleans; treat "false"/"no"/"0" as a rejection.
            flag = flag.strip().lower() not in ("false", "no", "0")
        if flag is False:
            log.info("IA: nenhum animal detectado na imagem.")
            return {"is_animal": False, "_ai_success": True}
        result["is_animal"] = True
        result["_ai_success"] = True
        return result

    @staticmethod
    def _random_embedding() -> list:
        """Return a random unit-length float32 vector as a list."""
        v = np.random.randn(AnimalAI._EMBEDDING_DIM).astype(np.float32)
        v /= np.linalg.norm(v)
        return v.tolist()

    @staticmethod
    def _to_b64(image) -> str:
        """Downscale ``image`` to fit 800x800 and return it as base64 JPEG text."""
        # JPEG cannot store alpha or palette modes; convert those first.
        # convert() already returns a new image, copy() protects the caller's
        # object from the in-place thumbnail() otherwise.
        if image.mode in ("RGB", "L"):
            img = image.copy()
        else:
            img = image.convert("RGB")
        img.thumbnail((800, 800))
        buf = io.BytesIO()
        img.save(buf, format="JPEG", quality=80)
        return base64.b64encode(buf.getvalue()).decode()

    @staticmethod
    def _parse(raw: str) -> dict:
        """Extract the JSON object from the model answer.

        First tries the span from the first ``{`` to the last ``}``; if that is
        not valid JSON (e.g. braces appear in surrounding text), tries to decode
        an object starting at each ``{`` in turn. Returns the placeholder from
        ``_fallback`` when nothing parses.
        """
        raw = raw or ""
        m = re.search(r"\{.*\}", raw, re.DOTALL)
        if m:
            try:
                parsed = json.loads(m.group())
            except json.JSONDecodeError:
                parsed = None
            if isinstance(parsed, dict):
                return parsed
            decoder = json.JSONDecoder()
            start = raw.find("{")
            while start != -1:
                try:
                    candidate, _ = decoder.raw_decode(raw, start)
                except json.JSONDecodeError:
                    candidate = None
                if isinstance(candidate, dict):
                    return candidate
                start = raw.find("{", start + 1)
        log.warning("JSON nao parseado — fallback")
        return AnimalAI._fallback()

    @staticmethod
    def _desc_text(d: dict) -> str:
        """Compose a short text from the structured fields of ``d``."""
        fields = ("size", "primary_color", "species", "breed_estimate")
        parts = [str(d[key]) for key in fields if d.get(key)]
        marks = d.get("distinctive_marks") or []
        if isinstance(marks, str):
            # A bare string would otherwise be joined character by character.
            marks = [marks]
        marks = [str(mark) for mark in marks if mark]
        if marks:
            parts.append("with " + ", ".join(marks))
        return " ".join(parts)

    @staticmethod
    def _fallback() -> dict:
        """Return the generic placeholder description, flagged ``_ai_success=False``."""
        return {
            "is_animal": True,
            "_ai_success": False,
            "species": "dog",
            "breed_estimate": "SRD",
            "size": "médio",
            "primary_color": "caramelo",
            "secondary_colors": [],
            "distinctive_marks": [],
            "condition": "saudável",
            "description_text": "stray dog of unknown breed",
        }