# Hugging Face Space: maylinejix/recognizer (status: Sleeping)
# File size: 13,583 bytes

import io
import base64
import json
import numpy as np
import onnxruntime as ort
from pathlib import Path
from PIL import Image, ImageFilter
from tokenizers import Tokenizer
from fastapi import FastAPI
from pydantic import BaseModel

# Directory holding the ONNX models and the tokenizer file. The path is
# relative, so it is resolved against the process's working directory.
MODELS_DIR = Path("models")

# FastAPI application object that the route decorators in this file register on.
app = FastAPI()

def make_session(path):
    """Create a CPU-only ONNX Runtime inference session for the model at *path*.

    The session runs with 4 intra-op threads and all graph optimizations
    enabled. *path* is converted with ``str()`` before being handed to
    ONNX Runtime, so a ``pathlib.Path`` is accepted.
    """
    session_options = ort.SessionOptions()
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    session_options.intra_op_num_threads = 4
    return ort.InferenceSession(
        str(path),
        sess_options=session_options,
        providers=["CPUExecutionProvider"],
    )

# Models and tokenizer are loaded once, at import time, and stored in
# module-level names that the encoding and scoring functions read.
# Image encoder session.
vis      = make_session(MODELS_DIR / "clip_visual.onnx")
# Text encoder session.
txt_sess = make_session(MODELS_DIR / "clip_text.onnx")
# Tokenizer used by encode_txt to turn prompt strings into token ids.
tok      = Tokenizer.from_file(str(MODELS_DIR / "tokenizer.json"))

def preprocess(img):
    """Turn a PIL image into a normalized float32 array of shape (1, 3, 224, 224).

    Steps: convert to RGB, apply a 3x3 median filter, resize to 224x224 with
    bicubic interpolation, scale pixel values to [0, 1], normalize each
    channel with a fixed mean/std, and reorder from HWC to a single-item
    NCHW batch.
    """
    # Per-channel (R, G, B) normalization constants; the values match the
    # published OpenAI CLIP preprocessing statistics.
    channel_mean = [0.48145466, 0.4578275, 0.40821073]
    channel_std = [0.26862954, 0.26130258, 0.27577711]

    denoised = img.convert("RGB").filter(ImageFilter.MedianFilter(size=3))
    resized = denoised.resize((224, 224), Image.BICUBIC)
    pixels = np.array(resized, dtype=np.float32) / 255.0
    normalized = (pixels - channel_mean) / channel_std
    # Arithmetic with the Python float lists promotes to float64, hence the
    # final cast back to float32.
    channels_first = normalized.transpose(2, 0, 1)
    return channels_first[np.newaxis].astype(np.float32)

def norm(x):
    """Scale the vectors along the last axis of *x* to (approximately) unit L2 length."""
    lengths = np.linalg.norm(x, axis=-1, keepdims=True)
    # The small epsilon keeps an all-zero vector from causing a division by zero.
    return x / (lengths + 1e-8)

def encode_txt(texts):
    """Encode prompt strings into L2-normalized text features.

    Each text is lower-cased, tokenized with the module-level tokenizer,
    wrapped in start/end-of-text tokens and written into a zero-padded row
    of 77 token ids. The batch is run through the text encoder session and
    the first output is normalized along the last axis.

    Args:
        texts: sequence of prompt strings.

    Returns:
        Array with one normalized feature row per input text.
    """
    SOT, EOT, CTX = 49406, 49407, 77
    ids = np.zeros((len(texts), CTX), dtype=np.int64)
    for i, t in enumerate(texts):
        row = [SOT] + tok.encode(t.lower()).ids + [EOT]
        if len(row) > CTX:
            # Plain slicing would cut off the end-of-text token. Keep it in
            # the last slot, as the reference CLIP tokenizer does when it
            # truncates; CLIP-style text encoders typically pool at the EOT
            # position, so losing it would yield a meaningless embedding.
            row = row[:CTX - 1] + [EOT]
        ids[i, :len(row)] = row
    return norm(txt_sess.run(None, {txt_sess.get_inputs()[0].name: ids})[0])

# Curated prompt sets, keyed by label. Each value is a
# (positive_prompts, negative_prompts) tuple of strings. Lookups use the
# lower-cased, stripped request label, so keys are lower-case; singular and
# plural spellings are separate entries. Labels without an entry fall back
# to the generic prompts built in get_txt_feats.
PROMPTS = {
    "bicycles": (
        ["a bicycle parked on the street", "a bicycle wheel close up", "bicycle frame and handlebars",
         "people riding bicycles on road", "a mountain bike", "a road bicycle", "bicycle rack with bikes",
         "a bike leaning against wall", "bicycle tires on pavement"],
        ["grass only", "a flower garden", "a plain building wall", "empty road no vehicle",
         "sky and clouds", "a car on road", "a motorcycle", "a tree trunk"]
    ),
    "bicycle": (
        ["a bicycle", "bicycle wheel", "bicycle handlebar", "a parked bike",
         "bicycle frame", "a person riding a bike", "bicycle seat and pedals"],
        ["grass", "a flower", "a building wall", "empty ground", "a car", "a motorcycle"]
    ),
    "cars": (
        ["a car on the road", "a parked car", "car headlights at night", "car door and window",
         "a sedan car", "an SUV on the street", "car bumper and grille", "car hood and windshield",
         "a vehicle driving on highway", "cars in traffic", "car rear with taillights"],
        ["a bicycle", "grass field", "a building facade", "sky only", "a tree",
         "a bus", "a truck", "a motorcycle", "sidewalk with no cars"]
    ),
    "car": (
        ["a car", "a vehicle on road", "car headlights", "car door",
         "car windshield", "a parked automobile", "car body metal"],
        ["a bicycle", "grass", "a building", "sky", "a bus", "a truck"]
    ),
    "traffic lights": (
        ["a traffic light pole on street", "red traffic light signal", "green traffic light signal",
         "yellow traffic light", "traffic signal at intersection", "traffic light hanging above road",
         "a stoplight on pole", "pedestrian traffic signal light"],
        ["a car", "grass", "a building wall", "sky without lights", "a tree",
         "a street lamp", "a billboard", "a road sign"]
    ),
    "traffic light": (
        ["a traffic light", "traffic signal pole", "red green traffic light",
         "stoplight at intersection", "a traffic signal"],
        ["a car", "grass", "a building", "sky", "a street lamp", "a road sign"]
    ),
    "fire hydrants": (
        ["a fire hydrant on sidewalk", "a red fire hydrant", "a yellow fire hydrant",
         "fire hydrant near curb", "a standpipe hydrant on street",
         "a short red cylinder hydrant", "fire hydrant bolts on top"],
        ["a car", "grass", "a building wall", "sky", "a tree",
         "a parking meter", "a trash can", "a mailbox"]
    ),
    "fire hydrant": (
        ["a fire hydrant", "a red hydrant", "fire hydrant on sidewalk",
         "a short red yellow cylinder on street"],
        ["a car", "grass", "a building", "sky", "a parking meter"]
    ),
    "buses": (
        ["a city bus on the road", "a public transit bus", "a large passenger bus",
         "a school bus", "a double decker bus", "bus exterior side view",
         "a bus at a bus stop", "bus windows in a row", "a coach bus on highway"],
        ["a car", "a bicycle", "grass", "a building", "sky",
         "a truck", "a van", "a train"]
    ),
    "bus": (
        ["a bus", "a public bus", "large bus vehicle", "a city bus",
         "bus exterior", "a school bus"],
        ["a car", "a bicycle", "grass", "a building", "a truck"]
    ),
    "motorcycles": (
        ["a motorcycle on the road", "a person riding a motorcycle", "motorcycle wheel and engine",
         "a parked motorcycle", "motorcycle handlebars and fuel tank",
         "a motorbike on street", "a scooter motorcycle", "motorcycle exhaust pipe"],
        ["grass", "a flower", "a building wall", "sky", "a tree",
         "a bicycle", "a car", "a truck"]
    ),
    "motorcycle": (
        ["a motorcycle", "motorcycle wheel", "riding a motorcycle",
         "a motorbike", "motorcycle engine", "a scooter"],
        ["grass", "a flower", "a building", "sky", "a bicycle", "a car"]
    ),
    "crosswalks": (
        ["a crosswalk on the road", "zebra crossing white stripes", "pedestrian crossing painted lines",
         "white parallel lines on road", "a marked crosswalk at intersection",
         "crosswalk stripes on asphalt", "pedestrian walkway markings"],
        ["a car", "grass", "a building wall", "sky", "a tree",
         "a solid road surface", "a sidewalk", "a driveway"]
    ),
    "crosswalk": (
        ["a crosswalk", "zebra crossing", "pedestrian crossing",
         "white stripes on road", "crosswalk lines painted on asphalt"],
        ["a car", "grass", "a building", "sky", "plain road no markings"]
    ),
    "stairs": (
        ["stairs going up outdoors", "concrete staircase steps", "outdoor stone steps",
         "a staircase with railing", "steps leading to building entrance",
         "stair steps close up", "wooden staircase interior"],
        ["grass", "a tree", "sky", "a car", "a window", "flat ground", "a ramp"]
    ),
    "staircase": (
        ["a staircase", "stairs", "steps going up", "stair railing and steps"],
        ["grass", "a tree", "sky", "a car", "flat surface"]
    ),
    "chimneys": (
        ["a chimney on a rooftop", "brick chimney stack", "chimney on top of building",
         "a tall chimney pipe", "industrial chimney", "multiple chimneys on roof"],
        ["grass", "a car", "sky only", "a tree", "a road", "a wall", "a window"]
    ),
    "bridges": (
        ["a bridge over water", "a road bridge spanning river", "bridge structure with supports",
         "a suspension bridge", "a concrete bridge", "bridge arch over water",
         "a pedestrian bridge", "bridge girders and cables"],
        ["grass", "a car", "a building", "a tree", "a road without bridge"]
    ),
    "boats": (
        ["a boat on water", "a sailing boat", "a motorboat", "a ship at sea",
         "a rowboat on lake", "a fishing boat", "boat hull in water",
         "a yacht on ocean", "a ferry boat"],
        ["grass", "a car", "a building", "a tree", "a road", "empty water no boat"]
    ),
    "mountains": (
        ["a mountain landscape", "mountain peak with snow", "rocky mountain scenery",
         "a mountain range in background", "mountain slope with trees",
         "high altitude mountain view", "mountain ridge and valley"],
        ["a car", "a building", "a road", "a bicycle", "flat ground", "a city skyline"]
    ),
    "tractors": (
        ["a farm tractor", "a tractor in a field", "agricultural tractor working",
         "tractor large rear wheels", "a green farm tractor", "tractor on farmland"],
        ["a car", "grass without tractor", "a building", "sky", "a bicycle", "a truck"]
    ),
    "parking meters": (
        ["a parking meter on sidewalk", "coin operated parking meter",
         "a metal parking meter pole", "parking pay station on street",
         "a single post parking meter"],
        ["a car", "grass", "a building", "sky", "a tree", "a fire hydrant", "a trash can"]
    ),
    "trucks": (
        ["a large truck on the road", "a delivery truck", "a semi truck with trailer",
         "a cargo truck", "truck cab and body", "a pickup truck",
         "a freight truck on highway", "truck wheels and axle"],
        ["a car", "a bicycle", "grass", "a building", "sky", "a bus"]
    ),
    "truck": (
        ["a truck", "a delivery truck", "a pickup truck", "cargo truck body"],
        ["a car", "a bicycle", "grass", "a building", "a bus"]
    ),
    "palm trees": (
        ["a palm tree", "tropical palm tree leaves", "a tall palm trunk",
         "coconut palm tree", "palm fronds at top of tree", "a palm tree on beach"],
        ["a car", "a building", "grass", "a pine tree", "a leafy tree", "a cactus"]
    ),
    "traffic signs": (
        ["a traffic sign on pole", "a road sign", "a stop sign", "a yield sign",
         "speed limit sign on road", "a warning road sign", "directional traffic sign"],
        ["a car", "grass", "a building", "sky", "a tree", "a traffic light"]
    ),
    "vehicles": (
        ["a motor vehicle on road", "a car driving", "a bus on street",
         "a truck on highway", "a motorcycle", "a vehicle in traffic"],
        ["grass", "a building", "sky", "a tree", "a bicycle", "a person walking"]
    ),
    "airplanes": (
        ["an airplane in the sky", "a commercial aircraft", "airplane wings in flight",
         "a plane on runway", "aircraft fuselage", "a jet plane taking off"],
        ["a car", "a bird", "a building", "grass", "a boat", "clouds only"]
    ),
    "train": (
        ["a train on tracks", "a locomotive", "train cars on railway",
         "a passenger train", "train wheels on rails"],
        ["a car", "a bus", "a truck", "grass", "a building", "a road"]
    ),
    "taxicabs": (
        ["a yellow taxi cab", "a taxicab on road", "a taxi car with sign on top",
         "a cab vehicle for hire", "taxi with yellow paint"],
        ["a private car", "a bus", "a police car", "grass", "a building"]
    ),
    "store fronts": (
        ["a store front with windows", "a shop entrance facade",
         "retail store exterior", "a business storefront with sign",
         "shop window display on street"],
        ["a car", "grass", "sky", "a tree", "a house", "a warehouse"]
    ),
    "taxis": (
        ["a taxi cab", "a yellow taxi", "a cab with taxi sign",
         "a taxi vehicle on street"],
        ["a private car", "a bus", "grass", "a building"]
    ),
}

# label -> (normalized text features for positive + negative prompts, number of positive prompts)
_txt_cache = {}

# Labels come straight from request bodies, so the cache must not grow without
# limit. When it reaches this many entries it is emptied and refilled on demand.
_TXT_CACHE_MAX = 512

def get_txt_feats(label):
    """Return the cached text features for *label*, computing them on first use.

    Labels with an entry in ``PROMPTS`` use its curated positive/negative
    prompts; any other label gets generic prompts built from the label text.

    Args:
        label: label string used as the cache and ``PROMPTS`` key.

    Returns:
        ``(features, n_pos)``: ``features`` is the ``encode_txt`` result for
        the positive prompts followed by the negative prompts, and ``n_pos``
        is the number of positive prompts (the first ``n_pos`` rows).
    """
    cached = _txt_cache.get(label)
    if cached is not None:
        return cached

    if label in PROMPTS:
        pos, neg = PROMPTS[label]
    else:
        # richer generic fallback
        pos = [
            f"a photo of {label}",
            f"{label} close up",
            f"an image clearly showing {label}",
            f"{label} on the street",
            f"a clear view of {label}",
        ]
        neg = [
            "grass and dirt",
            "a plain building facade",
            "sky and clouds only",
            "a tree with leaves",
            "an empty road surface",
            "blurry background texture",
        ]

    feats = (encode_txt(pos + neg), len(pos))
    if len(_txt_cache) >= _TXT_CACHE_MAX:
        # Crude but safe eviction: drop everything rather than track usage.
        _txt_cache.clear()
    _txt_cache[label] = feats
    # Return the local value so a concurrent clear() cannot cause a KeyError.
    return feats

def adaptive_threshold(scores: list[float], n_tiles: int) -> float:
    """Pick a score cut-off from the distribution of *scores*.

    Args:
        scores: per-tile scores; must be non-empty.
        n_tiles: number of tiles, used to size the top-N selection when the
            scores are nearly identical.

    Returns:
        The threshold as a Python float:
        * standard deviation below 0.005: the N-th highest score, where
          N = max(1, min(3, n_tiles // 3));
        * max - min above 0.15: mean + 0.5 * std;
        * otherwise: mean + 0.25 * std.
    """
    values = np.array(scores)
    center = float(values.mean())
    deviation = float(values.std())
    score_range = float(values.max()) - float(values.min())

    # All scores look alike: fall back to taking the top-N highest.
    if deviation < 0.005:
        top_n = max(1, min(3, n_tiles // 3))
        return float(np.sort(values)[-top_n])

    # A wide spread means there is a clear gap, so only take tiles that are
    # clearly above the rest; otherwise stay somewhat conservative.
    std_weight = 0.5 if score_range > 0.15 else 0.25
    return center + std_weight * deviation

# Request body for POST /score.
class ScoreRequest(BaseModel):
    # Target label; the handler lower-cases and strips it before prompt lookup.
    label: str
    # One base64-encoded image file per tile.
    tiles: list[str]

# Response body for POST /score.
class ScoreResponse(BaseModel):
    # One score per tile, in request order.
    scores: list[float]
    # Cut-off that was applied to the scores.
    threshold: float
    # Indices (into the request's tiles) whose score is >= threshold.
    to_click: list[int]

# Index route: answers with a fixed status payload.
@app.get("/")
def root():
    return dict(status="ok")

# Health-check route: answers with the same fixed status payload as "/".
@app.get("/health")
def health():
    return dict(status="ok")

@app.post("/score", response_model=ScoreResponse)
def score_tiles(req: ScoreRequest):
    """Score every tile against the label's prompts and choose which to click.

    A tile's score is its best similarity to a positive prompt minus its best
    similarity to a negative prompt. Tiles scoring at or above an adaptive
    threshold are returned in ``to_click``.

    Returns an empty result when no tiles are sent, and responds with
    HTTP 400 when a tile is not a valid base64-encoded image.
    """
    # Function-scope import keeps this block self-contained; fastapi is
    # already a dependency of this module.
    from fastapi import HTTPException

    if not req.tiles:
        # np.concatenate would raise on an empty list; nothing to score.
        return ScoreResponse(scores=[], threshold=0.0, to_click=[])

    label = req.label.lower().strip()
    t_feat, n_pos = get_txt_feats(label)

    imgs = []
    for idx, b64 in enumerate(req.tiles):
        try:
            raw = base64.b64decode(b64)
            with Image.open(io.BytesIO(raw)) as img:
                imgs.append(preprocess(img))
        except (ValueError, OSError) as err:
            # binascii.Error is a ValueError; Pillow's unreadable/truncated
            # image errors are OSError subclasses. Both are client errors.
            raise HTTPException(
                status_code=400,
                detail=f"tile {idx} is not a valid base64-encoded image",
            ) from err

    batch  = np.concatenate(imgs, axis=0)
    i_feat = norm(vis.run(None, {vis.get_inputs()[0].name: batch})[0])
    sims   = i_feat @ t_feat.T

    # Columns [0, n_pos) are positive prompts, the rest negative.
    scores = (sims[:, :n_pos].max(axis=1) - sims[:, n_pos:].max(axis=1)).tolist()

    threshold = adaptive_threshold(scores, len(imgs))
    to_click  = [i for i, s in enumerate(scores) if s >= threshold]

    # Safety: if every tile would be clicked, the threshold is probably too
    # low, so raise it to 95% of the best score.
    # NOTE(review): when the best score is negative, 0.95 * max lies above
    # max and no tile is selected — confirm this is the intended outcome.
    if len(to_click) >= len(scores):
        threshold = float(np.max(scores)) * 0.95
        to_click  = [i for i, s in enumerate(scores) if s >= threshold]

    return ScoreResponse(scores=scores, threshold=threshold, to_click=to_click)