maylinejix committed on
Commit
422860c
·
verified ·
1 Parent(s): 6d1c0ce

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -66
app.py CHANGED
@@ -13,8 +13,6 @@ MODELS_DIR = Path("models")
13
 
14
  app = FastAPI()
15
 
16
- # ── Load models ───────────────────────────────────────────────────────────────
17
-
18
  def make_session(path):
19
  opts = ort.SessionOptions()
20
  opts.intra_op_num_threads = 4
@@ -25,8 +23,6 @@ vis = make_session(MODELS_DIR / "clip_visual.onnx")
25
  txt_sess = make_session(MODELS_DIR / "clip_text.onnx")
26
  tok = Tokenizer.from_file(str(MODELS_DIR / "tokenizer.json"))
27
 
28
- # ── CLIP helpers ──────────────────────────────────────────────────────────────
29
-
30
  def preprocess(img):
31
  img = img.convert("RGB").filter(ImageFilter.MedianFilter(size=3))
32
  img = img.resize((224, 224), Image.BICUBIC)
@@ -47,50 +43,185 @@ def encode_txt(texts):
47
  return norm(txt_sess.run(None, {txt_sess.get_inputs()[0].name: ids})[0])
48
 
49
  PROMPTS = {
50
- "bicycles": (["a bicycle parked","a bicycle wheel","a bicycle frame","people riding bicycles"],
51
- ["grass","a flower","a building wall","dirt ground","sky"]),
52
- "bicycle": (["a bicycle","a bicycle wheel","bicycle handlebar"],
53
- ["grass","a flower","a building wall","dirt ground"]),
54
- "cars": (["a car on the road","a car parked","car headlights","car door"],
55
- ["a bicycle","grass","a building","sky","a tree"]),
56
- "car": (["a car","a vehicle","car headlights"],
57
- ["a bicycle","grass","a building","sky"]),
58
- "traffic lights": (["a traffic light pole","red traffic light","green traffic light","traffic signal"],
59
- ["a car","grass","a building","sky","a tree"]),
60
- "traffic light": (["a traffic light","traffic signal"],
61
- ["a car","grass","a building","sky"]),
62
- "fire hydrants": (["a fire hydrant on the sidewalk","a red fire hydrant"],
63
- ["a car","grass","a building","sky","a tree"]),
64
- "fire hydrant": (["a fire hydrant","a red hydrant"],
65
- ["a car","grass","a building","sky"]),
66
- "buses": (["a bus on the road","a public bus","a large bus","school bus"],
67
- ["a car","a bicycle","grass","a building","sky"]),
68
- "bus": (["a bus","a public bus","large bus vehicle"],
69
- ["a car","a bicycle","grass","a building"]),
70
- "motorcycles": (["a motorcycle on the road","a person riding a motorcycle","motorcycle wheel"],
71
- ["grass","a flower","a building","sky","a tree"]),
72
- "motorcycle": (["a motorcycle","motorcycle wheel","riding a motorcycle"],
73
- ["grass","a flower","a building","sky"]),
74
- "crosswalks": (["a crosswalk on the road","zebra crossing","pedestrian crossing","white stripes on road"],
75
- ["a car","grass","a building","sky","a tree"]),
76
- "crosswalk": (["a crosswalk","zebra crossing","pedestrian crossing"],
77
- ["a car","grass","a building","sky"]),
78
- "stairs": (["stairs going up","staircase steps","outdoor stairs","concrete steps"],
79
- ["grass","a tree","sky","a car","a window"]),
80
- "staircase": (["a staircase","stairs","steps going up"],
81
- ["grass","a tree","sky","a car"]),
82
- "chimneys": (["a chimney on a rooftop","brick chimney","chimney stack"],
83
- ["grass","a car","sky","a tree","a road"]),
84
- "bridges": (["a bridge over water","a road bridge","bridge structure"],
85
- ["grass","a car","a building","a tree"]),
86
- "boats": (["a boat on water","a sailing boat"],
87
- ["grass","a car","a building","a tree","a road"]),
88
- "mountains": (["a mountain landscape","mountain peak","rocky mountain"],
89
- ["a car","a building","a road","a bicycle"]),
90
- "tractors": (["a farm tractor","a tractor in a field"],
91
- ["a car","grass","a building","sky","a bicycle"]),
92
- "parking meters": (["a parking meter on sidewalk","coin parking meter"],
93
- ["a car","grass","a building","sky","a tree"]),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  }
95
 
96
  _txt_cache = {}
@@ -100,21 +231,53 @@ def get_txt_feats(label):
100
  if label in PROMPTS:
101
  pos, neg = PROMPTS[label]
102
  else:
103
- pos = [f"a photo of {label}", f"{label}", f"an image of {label}"]
104
- neg = ["grass", "a building", "sky", "a tree", "a road"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  _txt_cache[label] = (encode_txt(pos + neg), len(pos))
106
  return _txt_cache[label]
107
 
108
- # ── API ───────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
  class ScoreRequest(BaseModel):
111
  label: str
112
- tiles: list[str] # list of base64 PNG images
113
 
114
  class ScoreResponse(BaseModel):
115
  scores: list[float]
116
  threshold: float
117
- to_click: list[int] # indices yang harus diklik
118
 
119
  @app.get("/")
120
  def root():
@@ -126,7 +289,7 @@ def health():
126
 
127
  @app.post("/score", response_model=ScoreResponse)
128
  def score_tiles(req: ScoreRequest):
129
- label = req.label.lower().strip()
130
  t_feat, n_pos = get_txt_feats(label)
131
 
132
  imgs = []
@@ -135,20 +298,18 @@ def score_tiles(req: ScoreRequest):
135
  img = Image.open(io.BytesIO(raw))
136
  imgs.append(preprocess(img))
137
 
138
- batch = np.concatenate(imgs, axis=0)
139
- i_feat = norm(vis.run(None, {vis.get_inputs()[0].name: batch})[0])
140
- sims = i_feat @ t_feat.T
141
- scores = [float(sims[i, :n_pos].max() - sims[i, n_pos:].max()) for i in range(len(imgs))]
142
 
143
- vals = scores
144
- mean_s = float(np.mean(vals))
145
- std_s = float(np.std(vals))
146
 
147
- if std_s < 0.005:
148
- threshold = sorted(vals)[-min(3, len(vals))]
149
- else:
150
- threshold = mean_s + 0.1 * std_s
151
 
152
- to_click = [i for i, s in enumerate(scores) if s >= threshold]
 
 
 
153
 
154
  return ScoreResponse(scores=scores, threshold=threshold, to_click=to_click)
 
13
 
14
  app = FastAPI()
15
 
 
 
16
  def make_session(path):
17
  opts = ort.SessionOptions()
18
  opts.intra_op_num_threads = 4
 
23
  txt_sess = make_session(MODELS_DIR / "clip_text.onnx")
24
  tok = Tokenizer.from_file(str(MODELS_DIR / "tokenizer.json"))
25
 
 
 
26
  def preprocess(img):
27
  img = img.convert("RGB").filter(ImageFilter.MedianFilter(size=3))
28
  img = img.resize((224, 224), Image.BICUBIC)
 
43
  return norm(txt_sess.run(None, {txt_sess.get_inputs()[0].name: ids})[0])
44
 
45
  PROMPTS = {
46
+ "bicycles": (
47
+ ["a bicycle parked on the street", "a bicycle wheel close up", "bicycle frame and handlebars",
48
+ "people riding bicycles on road", "a mountain bike", "a road bicycle", "bicycle rack with bikes",
49
+ "a bike leaning against wall", "bicycle tires on pavement"],
50
+ ["grass only", "a flower garden", "a plain building wall", "empty road no vehicle",
51
+ "sky and clouds", "a car on road", "a motorcycle", "a tree trunk"]
52
+ ),
53
+ "bicycle": (
54
+ ["a bicycle", "bicycle wheel", "bicycle handlebar", "a parked bike",
55
+ "bicycle frame", "a person riding a bike", "bicycle seat and pedals"],
56
+ ["grass", "a flower", "a building wall", "empty ground", "a car", "a motorcycle"]
57
+ ),
58
+ "cars": (
59
+ ["a car on the road", "a parked car", "car headlights at night", "car door and window",
60
+ "a sedan car", "an SUV on the street", "car bumper and grille", "car hood and windshield",
61
+ "a vehicle driving on highway", "cars in traffic", "car rear with taillights"],
62
+ ["a bicycle", "grass field", "a building facade", "sky only", "a tree",
63
+ "a bus", "a truck", "a motorcycle", "sidewalk with no cars"]
64
+ ),
65
+ "car": (
66
+ ["a car", "a vehicle on road", "car headlights", "car door",
67
+ "car windshield", "a parked automobile", "car body metal"],
68
+ ["a bicycle", "grass", "a building", "sky", "a bus", "a truck"]
69
+ ),
70
+ "traffic lights": (
71
+ ["a traffic light pole on street", "red traffic light signal", "green traffic light signal",
72
+ "yellow traffic light", "traffic signal at intersection", "traffic light hanging above road",
73
+ "a stoplight on pole", "pedestrian traffic signal light"],
74
+ ["a car", "grass", "a building wall", "sky without lights", "a tree",
75
+ "a street lamp", "a billboard", "a road sign"]
76
+ ),
77
+ "traffic light": (
78
+ ["a traffic light", "traffic signal pole", "red green traffic light",
79
+ "stoplight at intersection", "a traffic signal"],
80
+ ["a car", "grass", "a building", "sky", "a street lamp", "a road sign"]
81
+ ),
82
+ "fire hydrants": (
83
+ ["a fire hydrant on sidewalk", "a red fire hydrant", "a yellow fire hydrant",
84
+ "fire hydrant near curb", "a standpipe hydrant on street",
85
+ "a short red cylinder hydrant", "fire hydrant bolts on top"],
86
+ ["a car", "grass", "a building wall", "sky", "a tree",
87
+ "a parking meter", "a trash can", "a mailbox"]
88
+ ),
89
+ "fire hydrant": (
90
+ ["a fire hydrant", "a red hydrant", "fire hydrant on sidewalk",
91
+ "a short red yellow cylinder on street"],
92
+ ["a car", "grass", "a building", "sky", "a parking meter"]
93
+ ),
94
+ "buses": (
95
+ ["a city bus on the road", "a public transit bus", "a large passenger bus",
96
+ "a school bus", "a double decker bus", "bus exterior side view",
97
+ "a bus at a bus stop", "bus windows in a row", "a coach bus on highway"],
98
+ ["a car", "a bicycle", "grass", "a building", "sky",
99
+ "a truck", "a van", "a train"]
100
+ ),
101
+ "bus": (
102
+ ["a bus", "a public bus", "large bus vehicle", "a city bus",
103
+ "bus exterior", "a school bus"],
104
+ ["a car", "a bicycle", "grass", "a building", "a truck"]
105
+ ),
106
+ "motorcycles": (
107
+ ["a motorcycle on the road", "a person riding a motorcycle", "motorcycle wheel and engine",
108
+ "a parked motorcycle", "motorcycle handlebars and fuel tank",
109
+ "a motorbike on street", "a scooter motorcycle", "motorcycle exhaust pipe"],
110
+ ["grass", "a flower", "a building wall", "sky", "a tree",
111
+ "a bicycle", "a car", "a truck"]
112
+ ),
113
+ "motorcycle": (
114
+ ["a motorcycle", "motorcycle wheel", "riding a motorcycle",
115
+ "a motorbike", "motorcycle engine", "a scooter"],
116
+ ["grass", "a flower", "a building", "sky", "a bicycle", "a car"]
117
+ ),
118
+ "crosswalks": (
119
+ ["a crosswalk on the road", "zebra crossing white stripes", "pedestrian crossing painted lines",
120
+ "white parallel lines on road", "a marked crosswalk at intersection",
121
+ "crosswalk stripes on asphalt", "pedestrian walkway markings"],
122
+ ["a car", "grass", "a building wall", "sky", "a tree",
123
+ "a solid road surface", "a sidewalk", "a driveway"]
124
+ ),
125
+ "crosswalk": (
126
+ ["a crosswalk", "zebra crossing", "pedestrian crossing",
127
+ "white stripes on road", "crosswalk lines painted on asphalt"],
128
+ ["a car", "grass", "a building", "sky", "plain road no markings"]
129
+ ),
130
+ "stairs": (
131
+ ["stairs going up outdoors", "concrete staircase steps", "outdoor stone steps",
132
+ "a staircase with railing", "steps leading to building entrance",
133
+ "stair steps close up", "wooden staircase interior"],
134
+ ["grass", "a tree", "sky", "a car", "a window", "flat ground", "a ramp"]
135
+ ),
136
+ "staircase": (
137
+ ["a staircase", "stairs", "steps going up", "stair railing and steps"],
138
+ ["grass", "a tree", "sky", "a car", "flat surface"]
139
+ ),
140
+ "chimneys": (
141
+ ["a chimney on a rooftop", "brick chimney stack", "chimney on top of building",
142
+ "a tall chimney pipe", "industrial chimney", "multiple chimneys on roof"],
143
+ ["grass", "a car", "sky only", "a tree", "a road", "a wall", "a window"]
144
+ ),
145
+ "bridges": (
146
+ ["a bridge over water", "a road bridge spanning river", "bridge structure with supports",
147
+ "a suspension bridge", "a concrete bridge", "bridge arch over water",
148
+ "a pedestrian bridge", "bridge girders and cables"],
149
+ ["grass", "a car", "a building", "a tree", "a road without bridge"]
150
+ ),
151
+ "boats": (
152
+ ["a boat on water", "a sailing boat", "a motorboat", "a ship at sea",
153
+ "a rowboat on lake", "a fishing boat", "boat hull in water",
154
+ "a yacht on ocean", "a ferry boat"],
155
+ ["grass", "a car", "a building", "a tree", "a road", "empty water no boat"]
156
+ ),
157
+ "mountains": (
158
+ ["a mountain landscape", "mountain peak with snow", "rocky mountain scenery",
159
+ "a mountain range in background", "mountain slope with trees",
160
+ "high altitude mountain view", "mountain ridge and valley"],
161
+ ["a car", "a building", "a road", "a bicycle", "flat ground", "a city skyline"]
162
+ ),
163
+ "tractors": (
164
+ ["a farm tractor", "a tractor in a field", "agricultural tractor working",
165
+ "tractor large rear wheels", "a green farm tractor", "tractor on farmland"],
166
+ ["a car", "grass without tractor", "a building", "sky", "a bicycle", "a truck"]
167
+ ),
168
+ "parking meters": (
169
+ ["a parking meter on sidewalk", "coin operated parking meter",
170
+ "a metal parking meter pole", "parking pay station on street",
171
+ "a single post parking meter"],
172
+ ["a car", "grass", "a building", "sky", "a tree", "a fire hydrant", "a trash can"]
173
+ ),
174
+ "trucks": (
175
+ ["a large truck on the road", "a delivery truck", "a semi truck with trailer",
176
+ "a cargo truck", "truck cab and body", "a pickup truck",
177
+ "a freight truck on highway", "truck wheels and axle"],
178
+ ["a car", "a bicycle", "grass", "a building", "sky", "a bus"]
179
+ ),
180
+ "truck": (
181
+ ["a truck", "a delivery truck", "a pickup truck", "cargo truck body"],
182
+ ["a car", "a bicycle", "grass", "a building", "a bus"]
183
+ ),
184
+ "palm trees": (
185
+ ["a palm tree", "tropical palm tree leaves", "a tall palm trunk",
186
+ "coconut palm tree", "palm fronds at top of tree", "a palm tree on beach"],
187
+ ["a car", "a building", "grass", "a pine tree", "a leafy tree", "a cactus"]
188
+ ),
189
+ "traffic signs": (
190
+ ["a traffic sign on pole", "a road sign", "a stop sign", "a yield sign",
191
+ "speed limit sign on road", "a warning road sign", "directional traffic sign"],
192
+ ["a car", "grass", "a building", "sky", "a tree", "a traffic light"]
193
+ ),
194
+ "vehicles": (
195
+ ["a motor vehicle on road", "a car driving", "a bus on street",
196
+ "a truck on highway", "a motorcycle", "a vehicle in traffic"],
197
+ ["grass", "a building", "sky", "a tree", "a bicycle", "a person walking"]
198
+ ),
199
+ "airplanes": (
200
+ ["an airplane in the sky", "a commercial aircraft", "airplane wings in flight",
201
+ "a plane on runway", "aircraft fuselage", "a jet plane taking off"],
202
+ ["a car", "a bird", "a building", "grass", "a boat", "clouds only"]
203
+ ),
204
+ "train": (
205
+ ["a train on tracks", "a locomotive", "train cars on railway",
206
+ "a passenger train", "train wheels on rails"],
207
+ ["a car", "a bus", "a truck", "grass", "a building", "a road"]
208
+ ),
209
+ "taxicabs": (
210
+ ["a yellow taxi cab", "a taxicab on road", "a taxi car with sign on top",
211
+ "a cab vehicle for hire", "taxi with yellow paint"],
212
+ ["a private car", "a bus", "a police car", "grass", "a building"]
213
+ ),
214
+ "store fronts": (
215
+ ["a store front with windows", "a shop entrance facade",
216
+ "retail store exterior", "a business storefront with sign",
217
+ "shop window display on street"],
218
+ ["a car", "grass", "sky", "a tree", "a house", "a warehouse"]
219
+ ),
220
+ "taxis": (
221
+ ["a taxi cab", "a yellow taxi", "a cab with taxi sign",
222
+ "a taxi vehicle on street"],
223
+ ["a private car", "a bus", "grass", "a building"]
224
+ ),
225
  }
226
 
227
  _txt_cache = {}
 
231
  if label in PROMPTS:
232
  pos, neg = PROMPTS[label]
233
  else:
234
+ # generic fallback lebih kaya
235
+ pos = [
236
+ f"a photo of {label}",
237
+ f"{label} close up",
238
+ f"an image clearly showing {label}",
239
+ f"{label} on the street",
240
+ f"a clear view of {label}",
241
+ ]
242
+ neg = [
243
+ "grass and dirt",
244
+ "a plain building facade",
245
+ "sky and clouds only",
246
+ "a tree with leaves",
247
+ "an empty road surface",
248
+ "blurry background texture",
249
+ ]
250
  _txt_cache[label] = (encode_txt(pos + neg), len(pos))
251
  return _txt_cache[label]
252
 
253
def adaptive_threshold(scores: list[float], n_tiles: int) -> float:
    """Pick a selection threshold from the distribution of per-tile scores.

    One of three regimes is chosen from the population standard deviation
    and the max-min spread of ``scores``:

    * std < 0.005 (all scores nearly identical): return the k-th highest
      score, with k = max(1, min(3, n_tiles // 3)), so a
      ``score >= threshold`` filter keeps at least the top k tiles.
    * spread > 0.15 (a large gap exists): return ``mean + 0.5 * std`` so
      only the clearly higher scores pass.
    * otherwise: return the slightly more permissive ``mean + 0.25 * std``.

    Args:
        scores: One score per tile; must be non-empty.
        n_tiles: Tile count used to size the top-k pick in the
            near-uniform regime. It is clamped to ``len(scores)`` so a
            mismatched value cannot index out of range.

    Returns:
        The threshold as a plain Python float.

    Raises:
        ValueError: If ``scores`` is empty.
    """
    arr = np.asarray(scores, dtype=float)
    if arr.size == 0:
        # Fail fast with a clear message instead of NumPy warnings followed
        # by an opaque reduction error on the empty array.
        raise ValueError("scores must contain at least one value")

    mean_s = float(arr.mean())
    std_s = float(arr.std())
    spread = float(arr.max()) - float(arr.min())

    if std_s < 0.005:
        # All scores are similar: take the k-th highest value as the cut.
        n_take = max(1, min(3, n_tiles // 3))
        # Guard against n_tiles disagreeing with the number of scores.
        n_take = min(n_take, int(arr.size))
        return float(np.sort(arr)[-n_take])

    if spread > 0.15:
        # Large gap between best and worst: keep only the clear winners.
        return mean_s + 0.5 * std_s

    # Normal case: somewhat conservative cut just above the mean.
    return mean_s + 0.25 * std_s
272
 
273
  class ScoreRequest(BaseModel):
274
  label: str
275
+ tiles: list[str]
276
 
277
  class ScoreResponse(BaseModel):
278
  scores: list[float]
279
  threshold: float
280
+ to_click: list[int]
281
 
282
  @app.get("/")
283
  def root():
 
289
 
290
  @app.post("/score", response_model=ScoreResponse)
291
  def score_tiles(req: ScoreRequest):
292
+ label = req.label.lower().strip()
293
  t_feat, n_pos = get_txt_feats(label)
294
 
295
  imgs = []
 
298
  img = Image.open(io.BytesIO(raw))
299
  imgs.append(preprocess(img))
300
 
301
+ batch = np.concatenate(imgs, axis=0)
302
+ i_feat = norm(vis.run(None, {vis.get_inputs()[0].name: batch})[0])
303
+ sims = i_feat @ t_feat.T
 
304
 
305
+ scores = [float(sims[i, :n_pos].max() - sims[i, n_pos:].max()) for i in range(len(imgs))]
 
 
306
 
307
+ threshold = adaptive_threshold(scores, len(imgs))
308
+ to_click = [i for i, s in enumerate(scores) if s >= threshold]
 
 
309
 
310
+ # safety: kalau terlalu banyak klik (>= semua tile) mungkin threshold terlalu rendah, naikkan
311
+ if len(to_click) >= len(scores):
312
+ threshold = float(np.max(scores)) * 0.95
313
+ to_click = [i for i, s in enumerate(scores) if s >= threshold]
314
 
315
  return ScoreResponse(scores=scores, threshold=threshold, to_click=to_click)