Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,9 @@ from io import BytesIO
|
|
| 7 |
import base64
|
| 8 |
from transformers import AutoProcessor, AutoModelForCausalLM
|
| 9 |
import os
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
# Attempt to install flash-attn
|
| 12 |
try:
|
|
@@ -132,6 +135,185 @@ def describe_image_from_url(image_url, model_choice):
|
|
| 132 |
except Exception as e:
|
| 133 |
return {"error": f"Error processing image: {str(e)}"}
|
| 134 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 135 |
# Description for the interface
|
| 136 |
description = "> Select the model to use for generating the image description. 'Base' is smaller and faster, while 'Large' is more accurate but slower."
|
| 137 |
if device == "cpu":
|
|
|
|
| 7 |
import base64
|
| 8 |
from transformers import AutoProcessor, AutoModelForCausalLM
|
| 9 |
import os
|
| 10 |
+
import threading
|
| 11 |
+
import time
|
| 12 |
+
import urllib.parse
|
| 13 |
|
| 14 |
# Attempt to install flash-attn
|
| 15 |
try:
|
|
|
|
| 135 |
except Exception as e:
|
| 136 |
return {"error": f"Error processing image: {str(e)}"}
|
| 137 |
|
| 138 |
+
|
| 139 |
+
# ---- Background captioning worker -------------------------------------------------
|
| 140 |
+
# This worker will start in a daemon thread before Gradio launches. It polls the
|
| 141 |
+
# image middleware on IMAGE_SERVER_BASE, downloads frames, captions them using
|
| 142 |
+
# the already-loaded Florence models, posts results to DATA_COLLECTION_BASE/submit,
|
| 143 |
+
# then releases frames and courses. It uses blocking requests so it runs in a
|
| 144 |
+
# separate thread and will not interfere with the UI thread.
|
| 145 |
+
|
| 146 |
+
# Endpoints and identity used by the background worker. Each value can be
# overridden through the environment. An unset OR empty variable falls back to
# the default: a blank IMAGE_SERVER_BASE / DATA_COLLECTION_BASE would otherwise
# produce scheme-less URLs, and a blank FLO_MODEL_CHOICE would not equal
# "Florence-2-base" and therefore select the large model by accident.
IMAGE_SERVER_BASE = os.getenv("IMAGE_SERVER_BASE") or "https://fred808-vssee.hf.space"
DATA_COLLECTION_BASE = os.getenv("DATA_COLLECTION_BASE") or "https://fred808-flow.hf.space"
# Default requester id embeds the process id of this interpreter.
REQUESTER_ID = os.getenv("FLO_REQUESTER_ID") or f"florence-2-{os.getpid()}"
MODEL_CHOICE = os.getenv("FLO_MODEL_CHOICE") or "Florence-2-base"
|
| 150 |
+
|
| 151 |
+
|
| 152 |
+
def _build_download_url(course: str, video: str, frame: str) -> str:
    """Return the image-server ``/download`` URL for a single frame.

    The ``file`` query value has the form ``frame:<course>/<video>/<frame>``.
    Both query values are percent-encoded with no safe characters, so ``/``
    and ``:`` inside them are escaped as well.
    """
    base = IMAGE_SERVER_BASE.rstrip('/')
    quoted_course = urllib.parse.quote(course, safe='')
    quoted_file = urllib.parse.quote(f"frame:{course}/{video}/{frame}", safe='')
    return f"{base}/download?course={quoted_course}&file={quoted_file}"
|
| 155 |
+
|
| 156 |
+
|
| 157 |
+
def _download_bytes(url: str, timeout: int = 30):
    """Fetch *url* with a blocking GET and return ``(body, content_type)``.

    Any failure (connection error, timeout, non-2xx status) is logged and
    reported to the caller as ``(None, None)`` instead of being raised.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        payload = response.content
        mime = response.headers.get('content-type')
    except Exception as exc:
        print(f"[BACKGROUND] download failed {url}: {exc}")
        return None, None
    return payload, mime
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
def _post_submit(caption: str, image_name: str, course: str, image_url: str, image_bytes: bytes):
    """Upload one captioned image to ``DATA_COLLECTION_BASE/submit``.

    The image goes up as a multipart file part named ``image``; the caption
    and identifying metadata go in ordinary form fields.

    Returns ``(status_code, body)`` where *body* is the decoded JSON when the
    response parses as JSON and the raw response text otherwise. The status
    code is not checked here. If the request itself fails, the error is logged
    and ``(None, None)`` is returned.
    """
    endpoint = f"{DATA_COLLECTION_BASE.rstrip('/')}/submit"
    upload = {'image': (image_name, image_bytes, 'application/octet-stream')}
    form_fields = {
        'caption': caption,
        'image_name': image_name,
        'course': course,
        'image_url': image_url,
    }
    try:
        response = requests.post(endpoint, data=form_fields, files=upload, timeout=30)
        code = response.status_code
        try:
            body = response.json()
        except Exception:
            # Non-JSON reply: hand back the raw text instead.
            body = response.text
        return code, body
    except Exception as exc:
        print(f"[BACKGROUND] submit POST failed: {exc}")
        return None, None
|
| 180 |
+
|
| 181 |
+
|
| 182 |
+
def _release_frame(course: str, video: str, frame: str):
    """Best-effort POST to the middleware's frame-release endpoint.

    Each path component is percent-encoded with no safe characters and
    ``REQUESTER_ID`` is sent as the ``requester_id`` query parameter.
    This never raises: transport errors and HTTP error statuses are only
    logged, so callers can invoke it unconditionally during cleanup.
    """
    try:
        quoted_path = "/".join(
            urllib.parse.quote(part, safe='') for part in (course, video, frame)
        )
        release_url = f"{IMAGE_SERVER_BASE.rstrip('/')}/middleware/release/frame/{quoted_path}"
        response = requests.post(release_url, params={"requester_id": REQUESTER_ID}, timeout=10)
        # A 4xx/5xx reply means the release did not go through; surface it in
        # the log instead of silently treating it as success.
        response.raise_for_status()
    except Exception as e:
        print(f"[BACKGROUND] release frame failed: {e}")
|
| 188 |
+
|
| 189 |
+
|
| 190 |
+
def _release_course(course: str):
    """Best-effort POST to the middleware's course-release endpoint.

    The course name is percent-encoded with no safe characters and
    ``REQUESTER_ID`` is sent as the ``requester_id`` query parameter.
    This never raises: transport errors and HTTP error statuses are only
    logged, so callers can invoke it unconditionally during cleanup.
    """
    try:
        release_url = f"{IMAGE_SERVER_BASE.rstrip('/')}/middleware/release/course/{urllib.parse.quote(course, safe='')}"
        response = requests.post(release_url, params={"requester_id": REQUESTER_ID}, timeout=10)
        # A 4xx/5xx reply means the release did not go through; surface it in
        # the log instead of silently treating it as success.
        response.raise_for_status()
    except Exception as e:
        print(f"[BACKGROUND] release course failed: {e}")
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
def _bg_selected_model():
    """Return the ``(model, processor)`` globals selected by ``MODEL_CHOICE``.

    ``"Florence-2-base"`` picks the base pair; any other value picks the large
    pair. The globals are read on every call, so values assigned later by the
    loading code are observed.
    """
    if MODEL_CHOICE == "Florence-2-base":
        return vision_language_model_base, vision_language_processor_base
    return vision_language_model_large, vision_language_processor_large


def _bg_wait_for_model(max_checks: int = 120) -> bool:
    """Poll once per second until the selected model and processor are non-None.

    Returns ``True`` as soon as both are set, ``False`` after *max_checks*
    unsuccessful checks.
    """
    for _ in range(max_checks):
        model, processor = _bg_selected_model()
        if model is not None and processor is not None:
            return True
        time.sleep(1)
    return False


def _bg_caption_frame(course: str, video: str, frame: str, file_id) -> bool:
    """Download, caption and submit one frame; the caller releases the frame.

    Returns ``False`` when the image could not be downloaded or decoded (nothing
    is submitted in that case) and ``True`` once a submit has been attempted.
    """
    download_url = _build_download_url(course, video, frame)
    print(f"[BACKGROUND] downloading {download_url}")
    img_bytes, _ = _download_bytes(download_url)
    if not img_bytes:
        print(f"[BACKGROUND] failed to download image, releasing frame {file_id}")
        return False

    try:
        pil_img = Image.open(BytesIO(img_bytes)).convert('RGB')
    except Exception as e:
        print(f"[BACKGROUND] failed to open image bytes: {e}")
        return False

    model, processor = _bg_selected_model()

    # A captioning failure is logged and the frame is still submitted, with an
    # empty caption (behaviour kept from the original implementation).
    caption = ""
    try:
        caption = process_image_description(model, processor, pil_img)
    except Exception as e:
        print(f"[BACKGROUND] captioning failed: {e}")

    status, _ = _post_submit(caption, frame, course, download_url, img_bytes)
    print(f"[BACKGROUND] submitted caption for {frame}: status={status}")
    return True


def _bg_process_course(course: str) -> None:
    """Pull and caption frames for *course* until the middleware answers 404.

    Request failures on the next-image endpoint are logged and retried after
    one second. Every frame that was handed out is released, including when
    its processing raises unexpectedly.
    """
    next_image_url = (
        f"{IMAGE_SERVER_BASE.rstrip('/')}/middleware/next/image/"
        f"{urllib.parse.quote(course, safe='')}"
    )
    while True:
        try:
            rimg = requests.get(next_image_url, params={"requester_id": REQUESTER_ID}, timeout=15)
            if rimg.status_code == 404:
                print(f"[BACKGROUND] no images for course {course}")
                return
            rimg.raise_for_status()
            img_json = rimg.json()
        except Exception as e:
            print(f"[BACKGROUND] failed to get next image: {e}")
            time.sleep(1)
            continue

        video = img_json.get('video')
        frame = img_json.get('frame')
        file_id = img_json.get('file_id')
        if not (video and frame and file_id):
            # Without video/frame there is nothing that can be released.
            print(f"[BACKGROUND] unexpected image entry: {img_json}")
            time.sleep(0.5)
            continue

        submitted = False
        try:
            submitted = _bg_caption_frame(course, video, frame, file_id)
        finally:
            # Release on every path so an unexpected error cannot leave the
            # frame held under this requester id.
            _release_frame(course, video, frame)
        # Short pause after a submit, longer back-off after a failed frame.
        time.sleep(0.2 if submitted else 1)


def background_worker():
    """Run the background captioning loop; intended as a thread target.

    Waits up to 120 one-second checks for the model chosen by ``MODEL_CHOICE``
    to be loaded and returns if it never becomes available. Afterwards it loops
    forever: request the next course from the image middleware, caption all of
    its frames, release the course, repeat. No exception escapes the loop;
    unexpected errors are logged and followed by a five-second pause.
    """
    print("[BACKGROUND] Worker waiting for model to be available...")
    # wait for model(s) to load (respect existing loading logic)
    if not _bg_wait_for_model():
        print("[BACKGROUND] Model not available after timeout; background worker exiting.")
        return

    print("[BACKGROUND] Model loaded; starting polling loop")

    next_course_url = f"{IMAGE_SERVER_BASE.rstrip('/')}/middleware/next/course"
    while True:
        try:
            # Acquire next course
            try:
                r = requests.get(next_course_url, params={"requester_id": REQUESTER_ID}, timeout=15)
                if r.status_code == 404:
                    # Nothing available right now; poll again shortly.
                    time.sleep(3)
                    continue
                r.raise_for_status()
                course_json = r.json()
            except Exception as e:
                print(f"[BACKGROUND] failed to get next course: {e}")
                time.sleep(3)
                continue

            course = course_json.get('course_id') or course_json.get('course')
            if not course:
                print(f"[BACKGROUND] invalid course response: {course_json}")
                time.sleep(2)
                continue

            print(f"[BACKGROUND] processing course: {course}")

            try:
                _bg_process_course(course)
            finally:
                # Release the course even when frame processing blew up;
                # previously an unexpected error skipped this call.
                _release_course(course)
            time.sleep(1)

        except Exception as e:
            print(f"[BACKGROUND] unexpected loop error: {e}")
            time.sleep(5)
|
| 310 |
+
|
| 311 |
+
# Start background worker thread (daemon) so it doesn't block shutdown
|
| 312 |
+
def _start_worker_thread():
    """Spawn ``background_worker`` on a daemon thread and return immediately."""
    threading.Thread(target=background_worker, daemon=True).start()
|
| 315 |
+
|
| 316 |
+
|
| 317 |
# Description for the interface
|
| 318 |
description = "> Select the model to use for generating the image description. 'Base' is smaller and faster, while 'Large' is more accurate but slower."
|
| 319 |
if device == "cpu":
|