kobiakor15 commited on
Commit
61cc71c
·
verified ·
1 Parent(s): 8d515d0

Upload demo_oculus_unified.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. demo_oculus_unified.py +263 -0
demo_oculus_unified.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Oculus 0.2 Unified Demo
4
+
5
+ Demonstrates all features of the unified Oculus model:
6
+ - Text mode (captioning, VQA)
7
+ - Point mode (counting objects)
8
+ - Box mode (detection with bounding boxes)
9
+ - Polygon mode (segmentation)
10
+ - Optional reasoning with thinking traces
11
+ - Focus system for fine-grained perception
12
+ """
13
+
14
+ import os
15
+ import sys
16
+ import requests
17
+ from pathlib import Path
18
+ from io import BytesIO
19
+
20
+ from PIL import Image
21
+ import torch
22
+
23
+ # Add parent to path
24
+ sys.path.insert(0, str(Path(__file__).parent))
25
+
26
+ from oculus_unified_model import OculusForConditionalGeneration, OculusConfig
27
+
28
+
29
def download_image(url: str) -> Image.Image:
    """Fetch *url* over HTTP and return the decoded image in RGB mode.

    Sends a browser-like User-Agent (some image hosts reject the default
    one) and raises ``requests.HTTPError`` on a non-2xx response.
    """
    resp = requests.get(
        url,
        headers={'User-Agent': 'Mozilla/5.0'},
        timeout=10,
    )
    resp.raise_for_status()
    buffer = BytesIO(resp.content)
    return Image.open(buffer).convert('RGB')
35
+
36
+
37
def print_header(title: str):
    """Print *title* as a banner framed by 70-character '=' rules."""
    rule = "=" * 70
    print(f"\n{rule}")
    print(f"🔮 {title}")
    print(rule)
41
+
42
+
43
def print_section(title: str):
    """Print *title* between two 70-character '─' divider lines."""
    divider = "─" * 70
    print("\n" + divider)
    print(f" {title}")
    print(divider)
47
+
48
+
49
def demo():
    """Run the end-to-end Oculus demo over a small set of web images.

    Loads the model (a trained checkpoint when one exists under
    ``./checkpoints/oculus_coco/final``, otherwise a fresh default config),
    then for each test image exercises every generation mode: text
    captioning, text with reasoning traces, VQA, point counting, box
    detection and polygon segmentation. A failure on one image is caught
    and reported so the remaining images still run. Finishes by printing
    a usage summary.
    """
    print_header("OCULUS 0.2 UNIFIED MODEL DEMO")

    # ================================================================
    # Load Model
    # ================================================================
    print("\n[1] Loading Oculus Model...")

    # Check if we have trained weights
    weights_path = Path(__file__).parent / "checkpoints" / "oculus_coco" / "final"

    if weights_path.exists():
        print(f" Found trained weights at: {weights_path}")
        model = OculusForConditionalGeneration.from_pretrained(weights_path)
    else:
        # No checkpoint on disk: build an untrained model from defaults.
        print(" Using default configuration")
        config = OculusConfig(
            reasoning_enabled=True,
            enable_focus=True,
        )
        model = OculusForConditionalGeneration(config)

    print(" ✓ Model loaded!")

    # ================================================================
    # Test Images
    # ================================================================
    # Publicly hosted images used as demo inputs; downloaded at runtime.
    test_images = [
        {
            "name": "Cat on Couch",
            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/3/3a/Cat03.jpg/1200px-Cat03.jpg"
        },
        {
            "name": "Golden Gate Bridge",
            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/0/0c/GoldenGateBridge-001.jpg/1200px-GoldenGateBridge-001.jpg"
        },
    ]

    for test in test_images:
        print_header(f"Testing: {test['name']}")

        try:
            print("\n[Downloading image...]")
            image = download_image(test["url"])
            print(f" Image size: {image.size}")

            # ========================================================
            # Mode 1: TEXT (Captioning)
            # ========================================================
            print_section("📝 TEXT MODE - Captioning")

            output = model.generate(
                image=image,
                prompt="Describe this image in detail",
                mode="text",
                think=False
            )

            print(f" Caption: \"{output.text}\"")

            # ========================================================
            # Mode 2: TEXT with Reasoning
            # ========================================================
            print_section("🧠 TEXT MODE - With Reasoning")

            output = model.generate(
                image=image,
                prompt="What is the main subject of this image?",
                mode="text",
                think=True  # Enable thinking traces
            )

            # NOTE(review): assumes thinking_trace is falsy when no trace
            # was produced — confirm against the model's output type.
            if output.thinking_trace:
                print(f" 💭 Thinking: {output.thinking_trace[:200]}...")
            print(f" Answer: \"{output.text}\"")

            # ========================================================
            # Mode 3: TEXT (VQA)
            # ========================================================
            print_section("❓ TEXT MODE - VQA")

            questions = [
                "What colors are visible in this image?",
                "Is this indoors or outdoors?",
            ]

            for q in questions:
                output = model.generate(
                    image=image,
                    prompt=q,
                    mode="text"
                )
                print(f" Q: {q}")
                print(f" A: {output.text}")

            # ========================================================
            # Mode 4: POINT (Counting)
            # ========================================================
            print_section("📍 POINT MODE - Object Counting")

            output = model.generate(
                image=image,
                prompt="Find objects",
                mode="point"
            )

            # Report the total, then show at most the first five points.
            print(f" Detected {len(output.points)} points")
            for i, (pt, label, conf) in enumerate(zip(
                output.points[:5],
                output.labels[:5],
                output.confidences[:5]
            )):
                print(f" Point {i+1}: {pt} (class={label}, conf={conf:.2f})")

            # ========================================================
            # Mode 5: BOX (Detection)
            # ========================================================
            print_section("📦 BOX MODE - Object Detection")

            output = model.generate(
                image=image,
                prompt="Detect all objects",
                mode="box"
            )

            # Show at most five boxes, coordinates formatted to 2 decimals.
            print(f" Detected {len(output.boxes)} boxes")
            for i, (box, label, conf) in enumerate(zip(
                output.boxes[:5],
                output.labels[:5],
                output.confidences[:5]
            )):
                print(f" Box {i+1}: {[f'{b:.2f}' for b in box]} (class={label}, conf={conf:.2f})")

            # ========================================================
            # Mode 6: POLYGON (Segmentation)
            # ========================================================
            print_section("🔷 POLYGON MODE - Segmentation")

            output = model.generate(
                image=image,
                prompt="Segment the scene",
                mode="polygon"
            )

            # mask may be None when no segmentation was produced.
            print(f" Segmentation mask shape: {output.mask.shape if output.mask is not None else 'N/A'}")
            print(f" Detected {len(output.polygons)} regions")
            for i, (poly, label) in enumerate(zip(
                output.polygons[:3],
                output.labels[:3]
            )):
                print(f" Region {i+1}: class={label}, vertices={len(poly)}")

            print("\n ✅ All modes successful!")

        except Exception as e:
            # Broad catch is deliberate for a demo: report the failure for
            # this image and continue with the next one.
            print(f"\n ❌ Error: {e}")
            import traceback
            traceback.print_exc()

    # ================================================================
    # Summary
    # ================================================================
    print_header("DEMO COMPLETE")

    print("""
 Oculus 0.2 supports:

 📝 TEXT MODE
 - Image captioning
 - Visual question answering
 - With optional reasoning traces

 📍 POINT MODE
 - Object counting
 - Point localization

 📦 BOX MODE
 - Object detection
 - Bounding box prediction

 🔷 POLYGON MODE
 - Semantic segmentation
 - Instance segmentation

 🧠 REASONING
 - Optional thinking traces
 - Multi-step reasoning

 🔍 FOCUS SYSTEM
 - Zoom & crop for fine-grained perception
 - Automatic region detection

 Usage:
 ```python
 from oculus_unified_model import OculusForConditionalGeneration

 model = OculusForConditionalGeneration.from_pretrained("./checkpoints/oculus_coco/final")

 # Caption
 output = model.generate(image, mode="text", prompt="Describe this")

 # VQA with reasoning
 output = model.generate(image, mode="text", prompt="What color is it?", think=True)

 # Detection
 output = model.generate(image, mode="box", prompt="Find cars")

 # Segmentation
 output = model.generate(image, mode="polygon")
 ```
 """)
260
+
261
+
262
if __name__ == "__main__":
    # Run the full demo when executed as a script (not on import).
    demo()