AceXRoux committed on
Commit
6a47d15
·
verified ·
1 Parent(s): 61494a5

Update app.py

Browse files

Rolling back to original code

Files changed (1) hide show
  1. app.py +143 -362
app.py CHANGED
@@ -1,7 +1,7 @@
1
  #!/usr/bin/env python3
2
  """
3
- GeoVLM with 3D Globe Visualization
4
- Interactive 3D globe that flies to predicted locations
5
  """
6
 
7
  import gradio as gr
@@ -9,20 +9,22 @@ from PIL import Image
9
  from transformers import AutoProcessor, AutoModelForImageTextToText
10
  import torch
11
  import re
12
- import json
13
  from dataclasses import dataclass
14
 
15
  # ============================================================================
16
- # Geolocation Parser
17
  # ============================================================================
18
 
19
  @dataclass(frozen=True)
20
  class Coords:
 
21
  lat: float
22
  lon: float
23
 
24
  @dataclass(frozen=True)
25
  class ParsedResponse:
 
26
  city: str | None
27
  region: str | None
28
  country: str | None
@@ -41,18 +43,25 @@ PROMPT_TEMPLATE = (
41
  )
42
 
43
  KEY_ALIASES = {
44
- "city": "city", "country": "country", "region": "region",
45
- "state": "region", "province": "region",
46
- "latitude": "lat", "lat": "lat",
47
- "longitude": "lon", "lon": "lon",
 
 
 
 
 
48
  }
49
 
50
  def parse_response(text: str) -> ParsedResponse:
51
  """Parse structured 5-line format"""
52
  parsed = {}
 
53
  if not text:
54
  return ParsedResponse(None, None, None, None, text, False)
55
 
 
56
  key_pattern = re.compile(
57
  r'^\s*(?:[-*+\u2022]\s*)?(?P<key>[A-Za-z][A-Za-z0-9\s\-/_.]*?)\s*:\s*(?P<value>.+)$'
58
  )
@@ -63,14 +72,18 @@ def parse_response(text: str) -> ParsedResponse:
63
  continue
64
 
65
  key_raw = match.group("key").strip().lower()
66
- key_raw = re.sub(r"\s+", " ", key_raw.strip("*_`\"' "))
 
67
  canonical = KEY_ALIASES.get(key_raw)
68
 
69
  if canonical is None:
70
  continue
71
 
72
- value_raw = match.group("value").strip().strip("`\"' \t")
73
- value_raw = re.sub(r"^[*_`]+|[*_`]+$", "", value_raw).strip()
 
 
 
74
 
75
  if canonical in {"city", "region", "country"}:
76
  if value_raw and canonical not in parsed:
@@ -84,22 +97,26 @@ def parse_response(text: str) -> ParsedResponse:
84
  except ValueError:
85
  pass
86
 
 
87
  coords = None
88
  if "lat" in parsed and "lon" in parsed:
89
  try:
90
- lat, lon = parsed["lat"], parsed["lon"]
 
91
  if -90 <= lat <= 90 and -180 <= lon <= 180:
92
  coords = Coords(lat=lat, lon=lon)
93
  except (ValueError, TypeError):
94
  pass
95
 
 
 
96
  return ParsedResponse(
97
  city=parsed.get("city"),
98
  region=parsed.get("region"),
99
  country=parsed.get("country"),
100
  coords=coords,
101
  raw_text=text,
102
- format_valid=bool(len(parsed) >= 2),
103
  )
104
 
105
  # ============================================================================
@@ -111,6 +128,7 @@ processor = None
111
  MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
112
 
113
  def load_model():
 
114
  global model, processor
115
  if model is None:
116
  print(f"Loading model: {MODEL_NAME}")
@@ -120,331 +138,123 @@ def load_model():
120
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
121
  device_map="auto" if torch.cuda.is_available() else "cpu"
122
  )
123
- print("Model loaded!")
124
 
125
  def predict_location(image):
126
- """Predict geolocation and return globe visualization data"""
127
  if image is None:
128
- return "Please upload an image.", "", ""
129
 
 
130
  load_model()
131
 
 
132
  if not isinstance(image, Image.Image):
133
  image = Image.fromarray(image).convert("RGB")
134
  else:
135
  image = image.convert("RGB")
136
 
137
- messages = [{
138
- "role": "user",
139
- "content": [
140
- {"type": "image"},
141
- {"type": "text", "text": PROMPT_TEMPLATE}
142
- ]
143
- }]
 
 
 
144
 
 
145
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
146
  inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
 
 
147
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
148
 
 
149
  with torch.no_grad():
150
- output_ids = model.generate(**inputs, max_new_tokens=256, do_sample=False)
 
 
 
 
151
 
 
152
  generated_ids = output_ids[0][inputs['input_ids'].shape[1]:]
153
  response = processor.decode(generated_ids, skip_special_tokens=True).strip()
 
 
154
  parsed = parse_response(response)
155
 
156
  # Format output
157
  output = f"""
158
- ## 🤖 AI Prediction
159
-
160
- **📍 Location Details:**
161
- - **City:** {parsed.city or "Unknown"}
162
- - **Region:** {parsed.region or "Unknown"}
163
- - **Country:** {parsed.country or "Unknown"}
164
- - **Coordinates:** {f"{parsed.coords.lat:.6f}°, {parsed.coords.lon:.6f}°" if parsed.coords else "Not found"}
165
-
166
- ---
167
-
168
- ## 🔍 Raw Response:
169
  ```
170
  {response}
171
  ```
172
- """
173
-
174
- # Create globe HTML
175
- globe_html = create_globe_html(parsed) if parsed.coords else "<div style='text-align:center; padding:50px; color:#666;'>No coordinates found</div>"
176
-
177
- # Create info card
178
- info_html = create_info_card(parsed)
179
-
180
- return output, globe_html, info_html
181
-
182
- def create_globe_html(parsed: ParsedResponse) -> str:
183
- """Create Three.js globe visualization with day/night toggle and country borders"""
184
- if not parsed.coords:
185
- return ""
186
-
187
- lat, lon = parsed.coords.lat, parsed.coords.lon
188
-
189
- html = f"""
190
- <!DOCTYPE html>
191
- <html>
192
- <head>
193
- <style>
194
- body {{ margin: 0; padding: 0; overflow: hidden; background: #000; position: relative; }}
195
- #globeViz {{ width: 100%; height: 600px; }}
196
- .location-label {{
197
- color: white;
198
- font-size: 16px;
199
- font-family: Arial, sans-serif;
200
- background: rgba(0,0,0,0.7);
201
- padding: 8px 12px;
202
- border-radius: 4px;
203
- pointer-events: none;
204
- }}
205
- .controls {{
206
- position: absolute;
207
- top: 20px;
208
- right: 20px;
209
- z-index: 100;
210
- display: flex;
211
- gap: 10px;
212
- }}
213
- .control-btn {{
214
- background: rgba(255,255,255,0.9);
215
- border: none;
216
- padding: 10px 16px;
217
- border-radius: 6px;
218
- cursor: pointer;
219
- font-weight: bold;
220
- font-size: 14px;
221
- transition: all 0.3s;
222
- box-shadow: 0 2px 8px rgba(0,0,0,0.3);
223
- }}
224
- .control-btn:hover {{
225
- background: white;
226
- transform: translateY(-2px);
227
- box-shadow: 0 4px 12px rgba(0,0,0,0.4);
228
- }}
229
- .control-btn.active {{
230
- background: #667eea;
231
- color: white;
232
- }}
233
- </style>
234
- </head>
235
- <body>
236
- <div class="controls">
237
- <button class="control-btn active" id="dayBtn" onclick="setDayMode()">☀️ Day</button>
238
- <button class="control-btn" id="nightBtn" onclick="setNightMode()">🌙 Night</button>
239
- <button class="control-btn" id="bordersBtn" onclick="toggleBorders()">🗺️ Borders</button>
240
- </div>
241
- <div id="globeViz"></div>
242
-
243
- <script src="//unpkg.com/globe.gl"></script>
244
- <script>
245
- let showBorders = false;
246
- let currentMode = 'day';
247
-
248
- const myGlobe = Globe()
249
- .globeImageUrl('//unpkg.com/three-globe/example/img/earth-blue-marble.jpg')
250
- .bumpImageUrl('//unpkg.com/three-globe/example/img/earth-topology.png')
251
- .backgroundImageUrl('//unpkg.com/three-globe/example/img/night-sky.png')
252
- .pointOfView({{ lat: {lat}, lng: {lon}, altitude: 2.5 }}, 0)
253
- .atmosphereColor('lightskyblue')
254
- .atmosphereAltitude(0.15)
255
- (document.getElementById('globeViz'));
256
-
257
- // Load country borders
258
- fetch('//unpkg.com/world-atlas/countries-50m.json')
259
- .then(res => res.json())
260
- .then(countries => {{
261
- window.countriesData = countries;
262
- }});
263
-
264
- // Add marker point
265
- const markerData = [{{
266
- lat: {lat},
267
- lng: {lon},
268
- size: 0.5,
269
- color: '#ff4444',
270
- label: '{parsed.city or "Location"}',
271
- city: '{parsed.city or "Unknown"}',
272
- region: '{parsed.region or "Unknown"}',
273
- country: '{parsed.country or "Unknown"}'
274
- }}];
275
-
276
- myGlobe
277
- .pointsData(markerData)
278
- .pointAltitude('size')
279
- .pointColor('color')
280
- .pointRadius(0.6)
281
- .pointLabel(d => `
282
- <div class="location-label">
283
- <b>${{d.city}}</b><br/>
284
- ${{d.region}}, ${{d.country}}<br/>
285
- ${{d.lat.toFixed(4)}}°, ${{d.lng.toFixed(4)}}°
286
- </div>
287
- `);
288
-
289
- // Animate to location
290
- myGlobe.pointOfView({{ lat: {lat}, lng: {lon}, altitude: 1.5 }}, 3000);
291
 
292
- // Auto-rotate
293
- myGlobe.controls().autoRotate = true;
294
- myGlobe.controls().autoRotateSpeed = 0.3;
295
-
296
- // Add pulsing ring animation
297
- const ringData = [{{
298
- lat: {lat},
299
- lng: {lon},
300
- maxR: 10,
301
- propagationSpeed: 2,
302
- repeatPeriod: 1500
303
- }}];
304
-
305
- myGlobe
306
- .ringsData(ringData)
307
- .ringColor(() => 'rgba(255,68,68,0.5)')
308
- .ringMaxRadius('maxR')
309
- .ringPropagationSpeed('propagationSpeed')
310
- .ringRepeatPeriod('repeatPeriod');
311
-
312
- // Add arcs for visual effect
313
- const arcData = [{{
314
- startLat: {lat},
315
- startLng: {lon},
316
- endLat: {lat + 10},
317
- endLng: {lon + 10},
318
- color: ['rgba(255,68,68,0.4)', 'rgba(255,68,68,0.1)']
319
- }}];
320
-
321
- myGlobe
322
- .arcsData(arcData)
323
- .arcColor('color')
324
- .arcDashLength(0.4)
325
- .arcDashGap(0.2)
326
- .arcDashAnimateTime(2000)
327
- .arcStroke(0.5);
328
-
329
- // Mode switching functions
330
- function setDayMode() {{
331
- currentMode = 'day';
332
- myGlobe
333
- .globeImageUrl('//unpkg.com/three-globe/example/img/earth-blue-marble.jpg')
334
- .bumpImageUrl('//unpkg.com/three-globe/example/img/earth-topology.png');
335
-
336
- document.getElementById('dayBtn').classList.add('active');
337
- document.getElementById('nightBtn').classList.remove('active');
338
- }}
339
-
340
- function setNightMode() {{
341
- currentMode = 'night';
342
- myGlobe
343
- .globeImageUrl('//unpkg.com/three-globe/example/img/earth-night.jpg')
344
- .bumpImageUrl('//unpkg.com/three-globe/example/img/earth-topology.png');
345
-
346
- document.getElementById('nightBtn').classList.add('active');
347
- document.getElementById('dayBtn').classList.remove('active');
348
- }}
349
 
350
- function toggleBorders() {{
351
- showBorders = !showBorders;
352
- const btn = document.getElementById('bordersBtn');
353
-
354
- if (showBorders && window.countriesData) {{
355
- const countries = topojson.feature(window.countriesData, window.countriesData.objects.countries);
356
- myGlobe
357
- .polygonsData(countries.features)
358
- .polygonAltitude(0.01)
359
- .polygonCapColor(() => 'rgba(200, 200, 200, 0.1)')
360
- .polygonSideColor(() => 'rgba(200, 200, 200, 0.05)')
361
- .polygonStrokeColor(() => '#ffffff')
362
- .polygonLabel(({{ properties: d }}) => `
363
- <div class="location-label">
364
- <b>${{d.name}}</b>
365
- </div>
366
- `);
367
- btn.classList.add('active');
368
- }} else {{
369
- myGlobe.polygonsData([]);
370
- btn.classList.remove('active');
371
- }}
372
- }}
373
- </script>
374
- <script src="//unpkg.com/topojson-client"></script>
375
- </body>
376
- </html>
377
- """
378
- return html
379
 
380
- def create_info_card(parsed: ParsedResponse) -> str:
381
- """Create information card with details"""
382
- if not parsed.coords:
383
- return ""
384
-
385
- lat, lon = parsed.coords.lat, parsed.coords.lon
386
 
387
- html = f"""
388
- <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
389
- border-radius: 12px; padding: 24px; color: white; margin-top: 20px;">
390
- <h2 style="margin: 0 0 16px 0; font-size: 24px;">📍 Predicted Location</h2>
391
-
392
- <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 16px; margin-bottom: 20px;">
393
- <div style="background: rgba(255,255,255,0.1); padding: 12px; border-radius: 8px;">
394
- <div style="font-size: 12px; opacity: 0.8;">City</div>
395
- <div style="font-size: 18px; font-weight: bold;">{parsed.city or "Unknown"}</div>
396
- </div>
397
- <div style="background: rgba(255,255,255,0.1); padding: 12px; border-radius: 8px;">
398
- <div style="font-size: 12px; opacity: 0.8;">Region</div>
399
- <div style="font-size: 18px; font-weight: bold;">{parsed.region or "Unknown"}</div>
 
 
 
 
 
 
 
 
 
 
400
  </div>
401
- <div style="background: rgba(255,255,255,0.1); padding: 12px; border-radius: 8px;">
402
- <div style="font-size: 12px; opacity: 0.8;">Country</div>
403
- <div style="font-size: 18px; font-weight: bold;">{parsed.country or "Unknown"}</div>
404
- </div>
405
- <div style="background: rgba(255,255,255,0.1); padding: 12px; border-radius: 8px;">
406
- <div style="font-size: 12px; opacity: 0.8;">Coordinates</div>
407
- <div style="font-size: 14px; font-weight: bold;">{lat:.4f}°, {lon:.4f}°</div>
408
- </div>
409
- </div>
410
-
411
- <div style="display: flex; gap: 12px; flex-wrap: wrap;">
412
- <a href="https://www.google.com/maps?q={lat},{lon}" target="_blank"
413
- style="background: #4285f4; color: white; padding: 10px 20px;
414
- border-radius: 6px; text-decoration: none; font-weight: bold;">
415
- 🗺️ Google Maps
416
- </a>
417
- <a href="https://www.openstreetmap.org/?mlat={lat}&mlon={lon}#map=12/{lat}/{lon}" target="_blank"
418
- style="background: #7ebc6f; color: white; padding: 10px 20px;
419
- border-radius: 6px; text-decoration: none; font-weight: bold;">
420
- 🌍 OpenStreetMap
421
- </a>
422
- <a href="https://www.google.com/search?q={parsed.city}+{parsed.country}" target="_blank"
423
- style="background: #ea4335; color: white; padding: 10px 20px;
424
- border-radius: 6px; text-decoration: none; font-weight: bold;">
425
- 🔍 Learn More
426
- </a>
427
  </div>
428
- </div>
429
- """
430
- return html
 
 
431
 
432
  # ============================================================================
433
  # Gradio Interface
434
  # ============================================================================
435
 
436
- with gr.Blocks(title="GeoVLM - 3D Globe", theme=gr.themes.Soft(), css="""
437
- .gradio-container {max-width: 1400px !important;}
438
- .globe-container {height: 600px !important;}
439
- """) as demo:
440
-
441
- gr.Markdown("""
442
- # 🌍 GeoVLM - AI Geolocation with 3D Globe
443
-
444
- Upload any image and watch the AI predict its location on an interactive 3D globe!
445
-
446
- **Powered by:** [vlm-gym](https://github.com/sdan/vlm-gym) | Vision-Language Models | Three.js Globe
447
- """)
 
 
 
448
 
449
  with gr.Row():
450
  with gr.Column(scale=1):
@@ -453,77 +263,48 @@ with gr.Blocks(title="GeoVLM - 3D Globe", theme=gr.themes.Soft(), css="""
453
  label="📸 Upload Image",
454
  height=400
455
  )
 
456
 
457
- predict_btn = gr.Button(
458
- "🔍 Analyze & Locate",
459
- variant="primary",
460
- size="lg"
 
 
 
 
461
  )
462
-
463
- gr.Markdown("""
464
- ### 💡 Tips:
465
- - Outdoor images work best
466
- - Street views are ideal
467
- - Landmarks help accuracy
468
- - Clear, well-lit photos
469
-
470
- ### 🎯 Features:
471
- - 3D interactive globe
472
- - Flies to predicted location
473
- - Pulsing marker animation
474
- - Auto-rotating globe
475
- """)
476
 
477
- with gr.Column(scale=2):
478
- with gr.Tabs():
479
- with gr.Tab("🌐 3D Globe"):
480
- globe_output = gr.HTML(
481
- label="Interactive Globe",
482
- elem_classes=["globe-container"]
483
- )
484
-
485
- with gr.Tab("📊 Details"):
486
- info_output = gr.HTML(label="Location Info")
487
- output_text = gr.Markdown(label="Analysis")
488
-
489
- gr.Markdown("""
490
- ---
491
-
492
- ### 🎮 How It Works:
493
-
494
- 1. **Upload** any image with visible location clues
495
- 2. **AI analyzes** architecture, vegetation, signs, landscape
496
- 3. **Globe flies** to the predicted location in 3D
497
- 4. **Explore** the area with interactive controls
498
-
499
- ### 🔬 Technology:
500
- - **Vision Model:** Qwen2-VL-2B-Instruct
501
- - **Training:** Reinforcement learning on 5M geotagged images
502
- - **Visualization:** Three.js Globe.GL
503
- - **Dataset:** OSV5M (OpenStreetView 5M)
504
-
505
- ### 🚀 Use Cases:
506
- - **OSINT Research** - Verify photo locations
507
- - **Education** - Learn world geography
508
- - **Travel** - Discover new places
509
- - **Training** - Practice geolocation skills
510
-
511
- ---
512
 
513
- Built with ❤️ by AceXRoux | [GitHub](https://github.com/axroux) | [LinkedIn](https://linkedin.com/in/vance-poitier)
514
- """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
515
 
 
516
  predict_btn.click(
517
  fn=predict_location,
518
  inputs=image_input,
519
- outputs=[output_text, globe_output, info_output]
520
  )
521
 
522
  if __name__ == "__main__":
523
- print("🌍 Starting GeoVLM with 3D Globe...")
524
  load_model()
525
- demo.launch(
526
- server_name="0.0.0.0",
527
- server_port=7860,
528
- share=False
529
- )
 
1
  #!/usr/bin/env python3
2
  """
3
+ GeoVLM - AI-Powered Geolocation
4
+ Upload any image and predict where it was taken using Vision-Language Models
5
  """
6
 
7
  import gradio as gr
 
9
  from transformers import AutoProcessor, AutoModelForImageTextToText
10
  import torch
11
  import re
12
+ import math
13
  from dataclasses import dataclass
14
 
15
  # ============================================================================
16
+ # Simplified Geolocation Parser (from vlm-gym)
17
  # ============================================================================
18
 
19
  @dataclass(frozen=True)
20
  class Coords:
21
+ """Geographic coordinates"""
22
  lat: float
23
  lon: float
24
 
25
  @dataclass(frozen=True)
26
  class ParsedResponse:
27
+ """Structured model output"""
28
  city: str | None
29
  region: str | None
30
  country: str | None
 
43
  )
44
 
45
  KEY_ALIASES = {
46
+ "city": "city",
47
+ "country": "country",
48
+ "region": "region",
49
+ "state": "region",
50
+ "province": "region",
51
+ "latitude": "lat",
52
+ "lat": "lat",
53
+ "longitude": "lon",
54
+ "lon": "lon",
55
  }
56
 
57
  def parse_response(text: str) -> ParsedResponse:
58
  """Parse structured 5-line format"""
59
  parsed = {}
60
+
61
  if not text:
62
  return ParsedResponse(None, None, None, None, text, False)
63
 
64
+ # Parse key-value lines
65
  key_pattern = re.compile(
66
  r'^\s*(?:[-*+\u2022]\s*)?(?P<key>[A-Za-z][A-Za-z0-9\s\-/_.]*?)\s*:\s*(?P<value>.+)$'
67
  )
 
72
  continue
73
 
74
  key_raw = match.group("key").strip().lower()
75
+ key_raw = key_raw.strip("*_`\"' ")
76
+ key_raw = re.sub(r"\s+", " ", key_raw)
77
  canonical = KEY_ALIASES.get(key_raw)
78
 
79
  if canonical is None:
80
  continue
81
 
82
+ value_raw = match.group("value").strip()
83
+ value_raw = value_raw.strip("`\"' \t")
84
+ value_raw = re.sub(r"^[*_`]+", "", value_raw)
85
+ value_raw = re.sub(r"[*_`]+$", "", value_raw)
86
+ value_raw = value_raw.strip()
87
 
88
  if canonical in {"city", "region", "country"}:
89
  if value_raw and canonical not in parsed:
 
97
  except ValueError:
98
  pass
99
 
100
+ # Build coords if available
101
  coords = None
102
  if "lat" in parsed and "lon" in parsed:
103
  try:
104
+ lat = parsed["lat"]
105
+ lon = parsed["lon"]
106
  if -90 <= lat <= 90 and -180 <= lon <= 180:
107
  coords = Coords(lat=lat, lon=lon)
108
  except (ValueError, TypeError):
109
  pass
110
 
111
+ format_valid = bool(len(parsed) >= 2)
112
+
113
  return ParsedResponse(
114
  city=parsed.get("city"),
115
  region=parsed.get("region"),
116
  country=parsed.get("country"),
117
  coords=coords,
118
  raw_text=text,
119
+ format_valid=format_valid,
120
  )
121
 
122
  # ============================================================================
 
128
  MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
129
 
130
  def load_model():
131
+ """Load model once on startup"""
132
  global model, processor
133
  if model is None:
134
  print(f"Loading model: {MODEL_NAME}")
 
138
  torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
139
  device_map="auto" if torch.cuda.is_available() else "cpu"
140
  )
141
+ print("Model loaded successfully!")
142
 
143
  def predict_location(image):
144
+ """Predict geolocation from an image"""
145
  if image is None:
146
+ return "Please upload an image.", ""
147
 
148
+ # Ensure model is loaded
149
  load_model()
150
 
151
+ # Convert to PIL if needed
152
  if not isinstance(image, Image.Image):
153
  image = Image.fromarray(image).convert("RGB")
154
  else:
155
  image = image.convert("RGB")
156
 
157
+ # Prepare prompt
158
+ messages = [
159
+ {
160
+ "role": "user",
161
+ "content": [
162
+ {"type": "image"},
163
+ {"type": "text", "text": PROMPT_TEMPLATE}
164
+ ]
165
+ }
166
+ ]
167
 
168
+ # Process inputs
169
  text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
170
  inputs = processor(text=[text], images=[image], return_tensors="pt", padding=True)
171
+
172
+ # Move to device
173
  inputs = {k: v.to(model.device) for k, v in inputs.items()}
174
 
175
+ # Generate
176
  with torch.no_grad():
177
+ output_ids = model.generate(
178
+ **inputs,
179
+ max_new_tokens=256,
180
+ do_sample=False,
181
+ )
182
 
183
+ # Decode
184
  generated_ids = output_ids[0][inputs['input_ids'].shape[1]:]
185
  response = processor.decode(generated_ids, skip_special_tokens=True).strip()
186
+
187
+ # Parse
188
  parsed = parse_response(response)
189
 
190
  # Format output
191
  output = f"""
192
+ ## 🤖 Raw Model Response:
 
 
 
 
 
 
 
 
 
 
193
  ```
194
  {response}
195
  ```
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
 
197
+ ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
 
199
+ ## 📍 Parsed Prediction:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
+ **City:** {parsed.city or "Not provided"}
202
+ **Region:** {parsed.region or "Not provided"}
203
+ **Country:** {parsed.country or "Not provided"}
204
+ **Coordinates:** {f"{parsed.coords.lat:.6f}, {parsed.coords.lon:.6f}" if parsed.coords else "Not provided"}
205
+ **Format Valid:** {"✅ Yes" if parsed.format_valid else "❌ No"}
206
+ """
207
 
208
+ # Create map embed
209
+ map_html = ""
210
+ if parsed.coords:
211
+ map_html = f"""
212
+ <div style="margin-top: 20px;">
213
+ <iframe
214
+ width="100%"
215
+ height="450"
216
+ frameborder="0"
217
+ scrolling="no"
218
+ marginheight="0"
219
+ marginwidth="0"
220
+ src="https://www.openstreetmap.org/export/embed.html?bbox={parsed.coords.lon-0.1},{parsed.coords.lat-0.1},{parsed.coords.lon+0.1},{parsed.coords.lat+0.1}&marker={parsed.coords.lat},{parsed.coords.lon}"
221
+ style="border: 2px solid #ddd; border-radius: 8px;">
222
+ </iframe>
223
+ <div style="margin-top: 10px; text-align: center;">
224
+ <a href="https://www.google.com/maps?q={parsed.coords.lat},{parsed.coords.lon}" target="_blank" style="margin: 0 10px; color: #4285f4; text-decoration: none; font-weight: bold;">
225
+ 🗺️ View on Google Maps
226
+ </a>
227
+ <span style="color: #666;">|</span>
228
+ <a href="https://www.openstreetmap.org/?mlat={parsed.coords.lat}&mlon={parsed.coords.lon}#map=12/{parsed.coords.lat}/{parsed.coords.lon}" target="_blank" style="margin: 0 10px; color: #7ebc6f; text-decoration: none; font-weight: bold;">
229
+ 🌍 View on OpenStreetMap
230
+ </a>
231
  </div>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
232
  </div>
233
+ """
234
+ else:
235
+ map_html = "<div style='text-align: center; padding: 20px; color: #666;'>No valid coordinates found</div>"
236
+
237
+ return output, map_html
238
 
239
  # ============================================================================
240
  # Gradio Interface
241
  # ============================================================================
242
 
243
+ with gr.Blocks(title="GeoVLM - AI Geolocation", theme=gr.themes.Soft()) as demo:
244
+ gr.Markdown(
245
+ """
246
+ # 🌍 GeoVLM - AI-Powered Geolocation
247
+
248
+ Upload any image and let AI predict where it was taken using vision-language models!
249
+
250
+ ### How it works:
251
+ - Analyzes visual features: architecture, vegetation, road signs, landscape
252
+ - Uses state-of-the-art vision-language models (Qwen2-VL)
253
+ - Predicts city, region, country, and GPS coordinates
254
+
255
+ **Powered by [vlm-gym](https://github.com/sdan/vlm-gym)** | Model: Qwen2-VL-2B-Instruct
256
+ """
257
+ )
258
 
259
  with gr.Row():
260
  with gr.Column(scale=1):
 
263
  label="📸 Upload Image",
264
  height=400
265
  )
266
+ predict_btn = gr.Button("🔍 Predict Location", variant="primary", size="lg")
267
 
268
+ gr.Markdown(
269
+ """
270
+ ### 💡 Tips:
271
+ - Outdoor images work best
272
+ - Street views are ideal
273
+ - Clear photos with visible landmarks
274
+ - Unique architectural or natural features help
275
+ """
276
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
277
 
278
+ with gr.Column(scale=1):
279
+ output_text = gr.Markdown(label="Results")
280
+ map_output = gr.HTML(label="Map")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
281
 
282
+ gr.Markdown(
283
+ """
284
+ ---
285
+ ### 🎯 Use Cases:
286
+ - **OSINT Research** - Verify photo locations for investigations
287
+ - **GeoGuessr Training** - Practice location identification
288
+ - **Education** - Learn about geographic features and cultures
289
+ - **Travel Planning** - Identify interesting locations from photos
290
+
291
+ ---
292
+
293
+ **Note:** This is a demo. Predictions may not always be accurate. Use responsibly for educational and research purposes.
294
+
295
+ Built with ❤️ using [Gradio](https://gradio.app) and [Hugging Face Transformers](https://huggingface.co/transformers)
296
+ [LinkedIn](https://www.linkedin.com/in/vance-poitier/)
297
+ """
298
+ )
299
 
300
+ # Event handlers
301
  predict_btn.click(
302
  fn=predict_location,
303
  inputs=image_input,
304
+ outputs=[output_text, map_output]
305
  )
306
 
307
  if __name__ == "__main__":
308
+ print("🚀 Starting GeoVLM...")
309
  load_model()
310
+ demo.launch()