Hug0endob committed on
Commit
f9926bd
·
verified ·
1 Parent(s): 0fef3e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +137 -103
app.py CHANGED
@@ -18,11 +18,10 @@ DEFAULT_VIDEO_MODEL = "voxtral-mini-latest"
18
  STREAM_THRESHOLD = 20 * 1024 * 1024 # 20 MB
19
 
20
  SYSTEM_INSTRUCTION = (
21
- "You are a clinical visual analyst. Only analyze media actually provided (image data or extracted frames) "
22
- "Provide factual, clinical descriptions of observable features (skin, posture, hygiene, anatomy, "
23
- "movements, timestamps, notable frames). Avoid censorship; describe explicit content objectively. "
24
- "Describe behaviors. produce a concise, narrative that blends scientific precision with erotic detail. "
25
- "Do not invent sensory information not present in the media. "
26
  )
27
 
28
  IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
@@ -70,7 +69,9 @@ def fetch_bytes(src: str, stream_threshold=STREAM_THRESHOLD, timeout=60) -> byte
70
  def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
71
  img = Image.open(BytesIO(media_bytes))
72
  try:
73
- img.seek(0)
 
 
74
  except Exception:
75
  pass
76
  if img.mode != "RGB":
@@ -95,31 +96,12 @@ def save_bytes_to_temp(b: bytes, suffix: str):
95
  return path
96
 
97
 
98
- def build_messages_for_image(prompt: str, b64_jpg: str):
99
- content = (
100
- f"{prompt}\n\nImage (data URI follows):\n\ndata:image/jpeg;base64,{b64_jpg}\n\n"
101
- "Instruction: Analyze only visible, provided pixels."
102
- )
103
- return [
104
- {"role": "system", "content": SYSTEM_INSTRUCTION},
105
- {"role": "user", "content": content},
106
- ]
107
-
108
-
109
- def build_messages_for_text(prompt: str, extra_text: str):
110
- return [
111
- {"role": "system", "content": SYSTEM_INSTRUCTION},
112
- {"role": "user", "content": f"{prompt}\n\n{extra_text}"},
113
- ]
114
-
115
-
116
  def extract_delta(chunk):
117
  if not chunk:
118
  return None
119
  data = getattr(chunk, "data", None) or getattr(chunk, "response", None) or getattr(chunk, "delta", None)
120
  if not data:
121
  return None
122
- # try common shapes
123
  try:
124
  content = data.choices[0].delta.content
125
  if content is None:
@@ -215,7 +197,6 @@ def upload_file_to_mistral(client, path, filename=None, purpose="batch"):
215
  try:
216
  with open(path, "rb") as fh:
217
  res = client.files.upload(file={"file_name": fname, "content": fh}, purpose=purpose)
218
- # try to extract id
219
  fid = getattr(res, "id", None) or (res.get("id") if isinstance(res, dict) else None)
220
  if not fid:
221
  try:
@@ -242,74 +223,114 @@ def upload_file_to_mistral(client, path, filename=None, purpose="batch"):
242
  return fid
243
 
244
 
245
- def generate_final_text(src: str, custom_prompt: str, api_key: str):
246
- client = get_client(api_key)
247
- prompt = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else "Please provide a detailed visual review.")
248
- ext = ext_from_src(src)
249
- is_image = ext in IMAGE_EXTS or (not is_remote(src) and os.path.isfile(src) and ext in IMAGE_EXTS)
250
- parts = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
251
 
252
- def stream_and_collect(model, messages):
 
 
 
 
 
 
 
253
  try:
 
 
254
  stream_gen = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
255
  try:
256
- stream_gen = client.chat.stream(model=model, messages=messages)
257
- except Exception:
258
- stream_gen = None
259
- if stream_gen:
260
- for chunk in stream_gen:
261
- d = extract_delta(chunk)
262
- if d is None:
263
- continue
264
- if d.strip() == "" and parts:
265
- continue
266
- parts.append(d)
267
- return
268
- res = client.chat.complete(model=model, messages=messages, stream=False)
269
- try:
270
- choices = getattr(res, "choices", None) or res.get("choices", [])
271
- except Exception:
272
- choices = []
273
- if choices:
274
- try:
275
- msg = choices[0].message
276
- if isinstance(msg, dict):
277
- content = msg.get("content")
278
  else:
279
- content = getattr(msg, "content", None)
280
- if content:
281
- if isinstance(content, str):
282
- parts.append(content)
283
- else:
284
- if isinstance(content, list):
285
- for c in content:
286
- if isinstance(c, dict) and c.get("type") == "text":
287
- parts.append(c.get("text", ""))
288
- elif isinstance(content, dict):
289
- text = content.get("text") or content.get("content")
290
- if text:
291
- parts.append(text)
292
- except Exception:
293
- parts.append(str(res))
294
- else:
295
  parts.append(str(res))
296
- except Exception as e:
297
- parts.append(f"[Model error: {e}]")
 
 
298
 
299
- # Image path: convert and send
 
 
 
 
 
 
 
 
 
 
 
300
  if is_image:
301
  try:
302
- raw = fetch_bytes(src)
303
- jpg = convert_to_jpeg_bytes(raw, base_h=480)
304
- b64 = b64_jpeg(jpg)
 
 
 
 
305
  except Exception as e:
306
  return f"Error processing image: {e}"
307
- msgs = build_messages_for_image(prompt, b64)
308
- stream_and_collect(DEFAULT_IMAGE_MODEL, msgs)
309
  return "".join(parts).strip()
310
 
311
- # Remote video: download, upload to Mistral Files, reference file id in chat
312
  if is_remote(src):
 
313
  try:
314
  media_bytes = fetch_bytes(src, timeout=120)
315
  except Exception as e:
@@ -320,7 +341,7 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
320
  try:
321
  file_id = upload_file_to_mistral(client, tmp_media, filename=os.path.basename(src.split("?")[0]))
322
  except Exception as e:
323
- # If upload fails, fallback to sending a representative frame
324
  frame_bytes = extract_best_frame_bytes(tmp_media)
325
  if not frame_bytes:
326
  return f"Error uploading to Mistral and no frame fallback available: {e}"
@@ -329,8 +350,8 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
329
  except UnidentifiedImageError:
330
  jpg = frame_bytes
331
  b64 = b64_jpeg(jpg)
332
- msgs = build_messages_for_image(prompt, b64)
333
- stream_and_collect(DEFAULT_VIDEO_MODEL, msgs)
334
  return "".join(parts).strip()
335
 
336
  extra = (
@@ -338,7 +359,7 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
338
  "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
339
  )
340
  msgs = build_messages_for_text(prompt, extra)
341
- stream_and_collect(DEFAULT_VIDEO_MODEL, msgs)
342
  return "".join(parts).strip()
343
  finally:
344
  try:
@@ -347,7 +368,7 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
347
  except Exception:
348
  pass
349
 
350
- # Local video: try upload to Mistral; otherwise fallback to frames
351
  tmp_media = None
352
  try:
353
  media_bytes = fetch_bytes(src)
@@ -361,17 +382,16 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
361
  "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
362
  )
363
  msgs = build_messages_for_text(prompt, extra)
364
- stream_and_collect(DEFAULT_VIDEO_MODEL, msgs)
365
  return "".join(parts).strip()
366
  except Exception:
367
- # fallback to extracting a best frame
368
  frame_bytes = extract_best_frame_bytes(tmp_media)
369
  if not frame_bytes:
370
  return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."
371
  jpg = convert_to_jpeg_bytes(frame_bytes, base_h=480)
372
  b64 = b64_jpeg(jpg)
373
- msgs = build_messages_for_image(prompt, b64)
374
- stream_and_collect(DEFAULT_VIDEO_MODEL, msgs)
375
  return "".join(parts).strip()
376
  finally:
377
  try:
@@ -381,23 +401,33 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
381
  pass
382
 
383
 
384
- # --- Minimal Gradio UI ---
385
  css = """
386
- .preview_column { min-width: 380px; }
387
  .preview_media img, .preview_media video { max-width: 100%; height: auto; }
388
  """
389
 
390
  def load_preview(url: str):
 
 
 
 
 
391
  if not url:
392
- return None, None, "No URL"
393
  try:
394
  r = requests.get(url, timeout=30, stream=True)
395
  r.raise_for_status()
396
- ctype = r.headers.get("content-type", "")
397
  if (ctype and ctype.startswith("video/")) or any(url.lower().split("?")[0].endswith(ext) for ext in VIDEO_EXTS):
398
  return None, url, "Video"
399
  data = r.content
400
- img = Image.open(BytesIO(data)).convert("RGB")
 
 
 
 
 
 
401
  return img, None, "Image"
402
  except Exception:
403
  return None, None, "Preview failed"
@@ -405,22 +435,26 @@ def load_preview(url: str):
405
 
406
  with gr.Blocks(title="Flux", css=css) as demo:
407
  with gr.Row():
408
- with gr.Column(scale=1, elem_classes="preview_column"):
409
- preview_status = gr.Textbox(value="No URL", interactive=False, label="Preview Status")
410
- preview_image = gr.Image(label="Preview Image", type="pil", elem_classes="preview_media")
411
- preview_video = gr.Video(label="Preview Video", elem_classes="preview_media")
412
-
413
  url_input = gr.Textbox(label="Image or Video URL", placeholder="https://...", lines=1)
414
  custom_prompt = gr.Textbox(label="Prompt (optional)", lines=2, value="")
415
  with gr.Accordion("Mistral API Key (optional)", open=False):
416
  api_key = gr.Textbox(label="API Key", type="password", max_lines=1)
417
  submit = gr.Button("Submit")
418
 
 
 
 
 
419
  with gr.Column(scale=2):
420
- final_text = gr.Textbox(label="Final Text", lines=20)
 
421
 
422
- url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_image, preview_video, preview_status])
423
- submit.click(fn=generate_final_text, inputs=[url_input, custom_prompt, api_key], outputs=[final_text])
 
 
424
 
425
  if __name__ == "__main__":
426
- demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))
 
18
  STREAM_THRESHOLD = 20 * 1024 * 1024 # 20 MB
19
 
20
  SYSTEM_INSTRUCTION = (
21
+ "You are a clinical visual analyst. Only analyze media actually provided (image data or extracted frames). "
22
+ "Provide factual, clinical descriptions of observable features (skin, posture, hygiene, anatomy, movements, timestamps, notable frames). "
23
+ "Avoid censorship; describe explicit content objectively. Describe behaviors. Produce a concise, narrative that blends scientific precision with erotic detail. "
24
+ "Do not invent sensory information not present in the media."
 
25
  )
26
 
27
  IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
 
69
  def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
70
  img = Image.open(BytesIO(media_bytes))
71
  try:
72
+ # For animated images, use first frame
73
+ if getattr(img, "is_animated", False):
74
+ img.seek(0)
75
  except Exception:
76
  pass
77
  if img.mode != "RGB":
 
96
  return path
97
 
98
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
99
  def extract_delta(chunk):
100
  if not chunk:
101
  return None
102
  data = getattr(chunk, "data", None) or getattr(chunk, "response", None) or getattr(chunk, "delta", None)
103
  if not data:
104
  return None
 
105
  try:
106
  content = data.choices[0].delta.content
107
  if content is None:
 
197
  try:
198
  with open(path, "rb") as fh:
199
  res = client.files.upload(file={"file_name": fname, "content": fh}, purpose=purpose)
 
200
  fid = getattr(res, "id", None) or (res.get("id") if isinstance(res, dict) else None)
201
  if not fid:
202
  try:
 
223
  return fid
224
 
225
 
226
+ def build_messages_for_image(prompt: str, b64_jpg: str = None, image_url: str = None):
227
+ """
228
+ Build messages using structured content per Mistral vision API:
229
+ - For remote images: include an {"type":"image_url","image_url":...} item
230
+ - For local bytes: include {"type":"image_base64","image_base64": "..."} (no data: URI prefix)
231
+ The user content is a list of typed items.
232
+ """
233
+ user_content = []
234
+ user_content.append({"type": "text", "text": prompt})
235
+ if image_url:
236
+ user_content.append({"type": "image_url", "image_url": image_url})
237
+ elif b64_jpg:
238
+ user_content.append({"type": "image_base64", "image_base64": b64_jpg})
239
+ else:
240
+ raise ValueError("Either image_url or b64_jpg required")
241
+ return [
242
+ {"role": "system", "content": SYSTEM_INSTRUCTION},
243
+ {"role": "user", "content": user_content},
244
+ ]
245
+
246
+
247
+ def build_messages_for_text(prompt: str, extra_text: str):
248
+ return [
249
+ {"role": "system", "content": SYSTEM_INSTRUCTION},
250
+ {"role": "user", "content": f"{prompt}\n\n{extra_text}"},
251
+ ]
252
 
253
+
254
+ def stream_and_collect(client, model, messages, parts: list):
255
+ """
256
+ Use client.chat.stream if available; otherwise use complete.
257
+ Appends textual pieces to parts list.
258
+ """
259
+ try:
260
+ stream_gen = None
261
  try:
262
+ stream_gen = client.chat.stream(model=model, messages=messages)
263
+ except Exception:
264
  stream_gen = None
265
+ if stream_gen:
266
+ for chunk in stream_gen:
267
+ d = extract_delta(chunk)
268
+ if d is None:
269
+ continue
270
+ if d.strip() == "" and parts:
271
+ continue
272
+ parts.append(d)
273
+ return
274
+ res = client.chat.complete(model=model, messages=messages, stream=False)
275
+ try:
276
+ choices = getattr(res, "choices", None) or res.get("choices", [])
277
+ except Exception:
278
+ choices = []
279
+ if choices:
280
  try:
281
+ msg = choices[0].message
282
+ if isinstance(msg, dict):
283
+ content = msg.get("content")
284
+ else:
285
+ content = getattr(msg, "content", None)
286
+ if content:
287
+ if isinstance(content, str):
288
+ parts.append(content)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
289
  else:
290
+ if isinstance(content, list):
291
+ for c in content:
292
+ if isinstance(c, dict) and c.get("type") == "text":
293
+ parts.append(c.get("text", ""))
294
+ elif isinstance(content, dict):
295
+ text = content.get("text") or content.get("content")
296
+ if text:
297
+ parts.append(text)
298
+ except Exception:
 
 
 
 
 
 
 
299
  parts.append(str(res))
300
+ else:
301
+ parts.append(str(res))
302
+ except Exception as e:
303
+ parts.append(f"[Model error: {e}]")
304
 
305
+
306
+ def generate_final_text(src: str, custom_prompt: str, api_key: str):
307
+ """
308
+ Main entry for Submit button. Returns final text (string).
309
+ """
310
+ client = get_client(api_key)
311
+ prompt = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else "Please provide a detailed visual review.")
312
+ ext = ext_from_src(src)
313
+ is_image = ext in IMAGE_EXTS or (not is_remote(src) and os.path.isfile(src) and ext in IMAGE_EXTS)
314
+ parts = []
315
+
316
+ # Image handling: remote image_url or local image_base64
317
  if is_image:
318
  try:
319
+ if is_remote(src):
320
+ msgs = build_messages_for_image(prompt, image_url=src)
321
+ else:
322
+ raw = fetch_bytes(src)
323
+ jpg = convert_to_jpeg_bytes(raw, base_h=480)
324
+ b64 = b64_jpeg(jpg) # NOTE: this is plain base64 string (no data: prefix)
325
+ msgs = build_messages_for_image(prompt, b64_jpg=b64)
326
  except Exception as e:
327
  return f"Error processing image: {e}"
328
+ stream_and_collect(client, DEFAULT_IMAGE_MODEL, msgs, parts)
 
329
  return "".join(parts).strip()
330
 
331
+ # Video handling (remote/local)
332
  if is_remote(src):
333
+ # download remote media, try upload to Mistral Files; fallback to a representative frame
334
  try:
335
  media_bytes = fetch_bytes(src, timeout=120)
336
  except Exception as e:
 
341
  try:
342
  file_id = upload_file_to_mistral(client, tmp_media, filename=os.path.basename(src.split("?")[0]))
343
  except Exception as e:
344
+ # fallback to sending representative frame
345
  frame_bytes = extract_best_frame_bytes(tmp_media)
346
  if not frame_bytes:
347
  return f"Error uploading to Mistral and no frame fallback available: {e}"
 
350
  except UnidentifiedImageError:
351
  jpg = frame_bytes
352
  b64 = b64_jpeg(jpg)
353
+ msgs = build_messages_for_image(prompt, b64_jpg=b64)
354
+ stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
355
  return "".join(parts).strip()
356
 
357
  extra = (
 
359
  "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
360
  )
361
  msgs = build_messages_for_text(prompt, extra)
362
+ stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
363
  return "".join(parts).strip()
364
  finally:
365
  try:
 
368
  except Exception:
369
  pass
370
 
371
+ # Local video: upload or fallback to frames
372
  tmp_media = None
373
  try:
374
  media_bytes = fetch_bytes(src)
 
382
  "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
383
  )
384
  msgs = build_messages_for_text(prompt, extra)
385
+ stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
386
  return "".join(parts).strip()
387
  except Exception:
 
388
  frame_bytes = extract_best_frame_bytes(tmp_media)
389
  if not frame_bytes:
390
  return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."
391
  jpg = convert_to_jpeg_bytes(frame_bytes, base_h=480)
392
  b64 = b64_jpeg(jpg)
393
+ msgs = build_messages_for_image(prompt, b64_jpg=b64)
394
+ stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
395
  return "".join(parts).strip()
396
  finally:
397
  try:
 
401
  pass
402
 
403
 
404
+ # --- Gradio UI ---
405
  css = """
 
406
  .preview_media img, .preview_media video { max-width: 100%; height: auto; }
407
  """
408
 
409
  def load_preview(url: str):
410
+ """
411
+ Returns: (image_or_None, video_or_None, mime_label)
412
+ - For images: return PIL.Image, None, "Image"
413
+ - For videos: return None, url, "Video"
414
+ """
415
  if not url:
416
+ return None, None, ""
417
  try:
418
  r = requests.get(url, timeout=30, stream=True)
419
  r.raise_for_status()
420
+ ctype = (r.headers.get("content-type") or "").lower()
421
  if (ctype and ctype.startswith("video/")) or any(url.lower().split("?")[0].endswith(ext) for ext in VIDEO_EXTS):
422
  return None, url, "Video"
423
  data = r.content
424
+ try:
425
+ img = Image.open(BytesIO(data))
426
+ if getattr(img, "is_animated", False):
427
+ img.seek(0)
428
+ img = img.convert("RGB")
429
+ except UnidentifiedImageError:
430
+ return None, None, "Preview failed"
431
  return img, None, "Image"
432
  except Exception:
433
  return None, None, "Preview failed"
 
435
 
436
  with gr.Blocks(title="Flux", css=css) as demo:
437
  with gr.Row():
438
+ with gr.Column(scale=1):
439
+ # Top-left controls
 
 
 
440
  url_input = gr.Textbox(label="Image or Video URL", placeholder="https://...", lines=1)
441
  custom_prompt = gr.Textbox(label="Prompt (optional)", lines=2, value="")
442
  with gr.Accordion("Mistral API Key (optional)", open=False):
443
  api_key = gr.Textbox(label="API Key", type="password", max_lines=1)
444
  submit = gr.Button("Submit")
445
 
446
+ # Single preview area (either image or video shown)
447
+ preview_image = gr.Image(label="Preview", type="pil", elem_classes="preview_media")
448
+ preview_video = gr.Video(label="Preview", elem_classes="preview_media")
449
+
450
  with gr.Column(scale=2):
451
+ # Right column: plain text output (rendered as Markdown/HTML allowed)
452
+ final_text = gr.Markdown(value="") # use Markdown so long text renders nicely
453
 
454
+ # Wire up events
455
+ url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_image, preview_video, gr.Textbox(visible=False)])
456
+ # For submit, use queue to avoid blocking UI
457
+ submit.click(fn=generate_final_text, inputs=[url_input, custom_prompt, api_key], outputs=[final_text], queue=True)
458
 
459
  if __name__ == "__main__":
460
+ demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)), enable_queue=True)