File size: 5,713 Bytes
4a6156f
d24ae2d
 
 
 
 
4a6156f
22e64d4
34aa0cd
c84e518
 
 
 
22e64d4
e4ed76e
 
 
b037393
22e64d4
 
b037393
 
 
 
 
 
 
 
 
 
c84e518
 
b037393
c84e518
4a6156f
22e64d4
e4ed76e
afcf209
e4ed76e
22e64d4
afcf209
 
 
 
 
 
 
 
 
 
 
 
22e64d4
c84e518
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22e64d4
e4ed76e
4a6156f
e4ed76e
4a6156f
22e64d4
82e76f6
 
afcf209
82e76f6
 
0687ea7
f39cbba
e4ed76e
f40cf99
6dbca3d
0687ea7
f40cf99
 
 
c84e518
6dbca3d
955743d
 
c84e518
955743d
 
 
 
b037393
955743d
b037393
955743d
 
 
 
 
b037393
955743d
b037393
 
 
 
 
955743d
22e64d4
f40cf99
 
c84e518
 
f40cf99
 
 
 
afcf209
f40cf99
 
 
 
 
 
 
f39cbba
22e64d4
f40cf99
 
f39cbba
 
 
d24ae2d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# app.py — TrashTrack Turbo (compatible with ESP32 + multipart/form-data)
import os
# Avoid the libgomp error and excessive CPU thread spawning; set before the
# heavyweight torch/transformers imports below so they pick these values up.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

from fastapi import FastAPI, File, UploadFile
from fastapi.responses import JSONResponse
from PIL import Image, ImageOps
import io, time, torch
import numpy as np

from transformers import AutoProcessor, AutoModel

# ==============================================
# ⚙️ Configuration
# ==============================================
MODEL_ID = "google/siglip-so400m-patch14-384"  # highest-accuracy model choice
device = "cuda" if torch.cuda.is_available() else "cpu"

# Automatically detect whether torchvision is installed; the "fast" image
# processor path in transformers requires it.
try:
    import torchvision  # noqa: F401
    USE_FAST = True
except Exception:
    USE_FAST = False

print(f"🚀 Carregando modelo {MODEL_ID} (use_fast={USE_FAST})...")
# Falls back to the "slow" processor path when torchvision is missing
processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=USE_FAST)
# NOTE(review): newer transformers accept dtype=; older releases expect
# torch_dtype= — verify against the pinned transformers version.
model = AutoModel.from_pretrained(
    MODEL_ID,
    dtype=torch.float16 if device == "cuda" else None
).to(device).eval()
print("✅ Modelo carregado com sucesso.")

# ==============================================
# 📋 Classes (PT + EN) — **WITHOUT GLASS**
# ==============================================
# Maps each API class name to the prompt terms scored for it. A class's
# confidence is the mean probability over its terms, and predict() relies
# on the insertion order of this dict to slice the flat probability list.
labels = {
    "plastico": [
        "plástico", "garrafa PET", "tampinha plástica",
        "sacola plástica", "plastic bottle"
    ],
    "papel": [
        "papel", "folha", "envelope de papel",
        "paper sheet", "paper wrapper"
    ],
    "metal": [
        "lata", "alumínio", "tampinha metálica",
        "metal cap", "can"
    ],
}

def _promptize(term: str) -> str:
    return f"centered {term} on a white background; ignore the background; classify only the object"

texts = [_promptize(t) for group in labels.values() for t in group]

# ==============================================
# 🔧 Util — crop the foreground, ignoring white background
# ==============================================
def crop_foreground_ignore_white(pil: Image.Image) -> Image.Image:
    """Crop *pil* to the bounding box of its non-white pixels.

    Pixels with all RGB channels above 230 are treated as background.
    A 3% margin is added around the detected box. If fewer than 500
    foreground pixels exist, falls back to a centered 80% crop.
    """
    rgb = pil.convert("RGB")
    pixels = np.asarray(rgb)
    # Foreground mask: any channel at or below the white threshold.
    foreground = ~np.all(pixels > 230, axis=-1)

    if foreground.sum() < 500:
        # Almost everything looks white — use a centered 80% crop instead.
        w, h = rgb.size
        cw, ch = int(w * 0.8), int(h * 0.8)
        x0, y0 = (w - cw) // 2, (h - ch) // 2
        return rgb.crop((x0, y0, x0 + cw, y0 + ch))

    rows, cols = np.where(foreground)
    top, bottom = rows.min(), rows.max()
    left, right = cols.min(), cols.max()

    # Pad the box by 3% of each dimension, clamped to the image bounds.
    pad_y, pad_x = int(0.03 * rgb.height), int(0.03 * rgb.width)
    top = max(0, top - pad_y)
    bottom = min(rgb.height - 1, bottom + pad_y)
    left = max(0, left - pad_x)
    right = min(rgb.width - 1, right + pad_x)

    return rgb.crop((left, top, right + 1, bottom + 1))

# ==============================================
# 🌐 FastAPI app
# ==============================================
app = FastAPI(title="TrashTrack Turbo — ESP32 Compatible")

@app.get("/")
def root():
    """Health-check endpoint: reports the loaded model and known classes."""
    payload = {
        "ok": True,
        "model": MODEL_ID,
        "mode": "multipart/files[]",
        "classes": list(labels.keys()),
    }
    return payload

@app.post("/predict")
async def predict(files_: list[UploadFile] = File(..., alias="files[]")):
    """Classify uploaded images into one of the classes in `labels`.

    Accepts one or more images as multipart/form-data under the field
    name "files[]" (the format the ESP32 firmware sends). Each image is
    EXIF-rotated, cropped to its foreground, embedded with the SigLIP
    model and scored against the text prompts; per-image winners are
    combined by averaging the confidences of images that voted for the
    same class.

    Returns JSON {"label", "conf", "latency_s"} on success,
    {"error": ...} with HTTP 400 for an empty upload or 500 on failure.
    """
    try:
        t0 = time.time()

        # Guard: an empty upload would otherwise crash max() below and
        # surface as an opaque 500.
        if not files_:
            return JSONResponse({"error": "no files received"}, status_code=400)

        # The text prompts are constant for the whole request, so embed
        # them once up front instead of once per image (the original
        # recomputed them inside the per-file loop).
        shared_txt_emb = None
        if hasattr(model, "get_text_features"):
            text_inputs = processor(text=texts, return_tensors="pt", padding=True).to(device)
            with torch.inference_mode():
                shared_txt_emb = model.get_text_features(**text_inputs)

        results = []  # (best_label, confidence) per image

        for f in files_:
            data = await f.read()
            image = Image.open(io.BytesIO(data))
            # Apply EXIF orientation before any pixel-based processing.
            image = ImageOps.exif_transpose(image).convert("RGB")
            image = crop_foreground_ignore_white(image)

            with torch.inference_mode():
                txt_emb = shared_txt_emb
                img_emb = None
                if hasattr(model, "get_image_features"):
                    image_inputs = processor(images=image, return_tensors="pt").to(device)
                    img_emb = model.get_image_features(**image_inputs)

                # Fallback for models without the separate feature APIs:
                # one joint forward pass yields both embedding sets.
                if txt_emb is None or img_emb is None:
                    joint = processor(text=texts, images=image, return_tensors="pt", padding=True).to(device)
                    out = model(**joint)
                    if txt_emb is None:
                        txt_emb = getattr(out, "text_embeds", getattr(out, "text_embeds_projected", None))
                    if img_emb is None:
                        img_emb = getattr(out, "image_embeds", getattr(out, "image_embeds_projected", None))

                # Cosine similarity between the image and every prompt,
                # softmaxed into a probability over all prompts.
                img_emb = torch.nn.functional.normalize(img_emb, dim=-1)
                txt_emb = torch.nn.functional.normalize(txt_emb, dim=-1)
                logits = (img_emb @ txt_emb.t()).squeeze(0)
                probs = torch.softmax(logits.float().cpu(), dim=-1).tolist()

            # Average the prompt probabilities within each class group;
            # slicing follows the insertion order of `labels` / `texts`.
            idx, scores = 0, {}
            for key, group in labels.items():
                g = probs[idx: idx + len(group)]
                scores[key] = sum(g) / len(g)
                idx += len(group)

            best = max(scores, key=scores.get)
            results.append((best, round(float(scores[best]), 3)))

        # Vote across images: the winning class is the one with the
        # highest mean confidence among the images that chose it.
        votos = {}
        for label, conf_i in results:
            votos.setdefault(label, []).append(conf_i)
        final = max(votos, key=lambda k: sum(votos[k]) / len(votos[k]))
        conf = round(sum(votos[final]) / len(votos[final]), 3)
        latency = round(time.time() - t0, 2)

        print(f"[OK] {final} ({conf}) em {latency}s")
        return JSONResponse({"label": final, "conf": conf, "latency_s": latency})

    except Exception as e:
        # Boundary handler: report the failure to the ESP32 client as
        # JSON instead of letting FastAPI emit an HTML error page.
        print(f"[ERRO] {e}")
        return JSONResponse({"error": str(e)}, status_code=500)