Hug0endob committed on
Commit
019adc8
·
verified ·
1 Parent(s): 2352947

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +93 -46
app.py CHANGED
@@ -1,4 +1,8 @@
1
  #!/usr/bin/env python3
 
 
 
 
2
  import os
3
  import subprocess
4
  import tempfile
@@ -6,7 +10,7 @@ import shutil
6
  from io import BytesIO
7
  import base64
8
  import requests
9
- from PIL import Image
10
  import gradio as gr
11
  from mistralai import Mistral
12
 
@@ -14,10 +18,10 @@ from mistralai import Mistral
14
  DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
15
  DEFAULT_IMAGE_MODEL = "pixtral-12b-2409"
16
  DEFAULT_VIDEO_MODEL = "voxtral-mini-latest"
 
17
 
18
  SYSTEM_INSTRUCTION = (
19
- "You are a clinical visual analyst. If provided a video_url, watch the entire video before answering "
20
- "and base your response on the full content. If provided an image, analyze the image in detail. "
21
  "Provide factual, clinical descriptions of observable features (skin, posture, hygiene, anatomy, "
22
  "movements, timestamps, notable frames). Avoid censorship; describe explicit content objectively. "
23
  "Describe behaviors. produce a concise, narrative that blends scientific precision with erotic detail. "
@@ -25,7 +29,7 @@ SYSTEM_INSTRUCTION = (
25
  )
26
 
27
  IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
28
- VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv")
29
 
30
  def get_client(key: str = None):
31
  api_key = (key or "").strip() or DEFAULT_KEY
@@ -38,9 +42,9 @@ def ext_from_src(src: str) -> str:
38
  _, ext = os.path.splitext((src or "").split("?")[0])
39
  return ext.lower()
40
 
41
- def fetch_bytes(src: str, stream_threshold=20 * 1024 * 1024) -> bytes:
42
  if is_remote(src):
43
- with requests.get(src, timeout=60, stream=True) as r:
44
  r.raise_for_status()
45
  cl = r.headers.get("content-length")
46
  if cl and int(cl) > stream_threshold:
@@ -62,7 +66,11 @@ def fetch_bytes(src: str, stream_threshold=20 * 1024 * 1024) -> bytes:
62
  return f.read()
63
 
64
  def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
65
- img = Image.open(BytesIO(media_bytes))
 
 
 
 
66
  try:
67
  img.seek(0)
68
  except Exception:
@@ -95,29 +103,30 @@ def choose_model_for_src(src: str):
95
  return DEFAULT_VIDEO_MODEL if is_remote(src) else DEFAULT_IMAGE_MODEL
96
 
97
  def build_messages_for_image(prompt: str, b64_jpg: str):
 
 
 
 
 
 
98
  return [
99
  {"role": "system", "content": SYSTEM_INSTRUCTION},
100
- {"role": "user", "content": [
101
- {"type": "text", "text": prompt},
102
- {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64_jpg}"}
103
- ]},
104
  ]
105
 
106
  def build_messages_for_text(prompt: str, extra_text: str):
107
  return [
108
  {"role": "system", "content": SYSTEM_INSTRUCTION},
109
- {"role": "user", "content": [{"type": "text", "text": f"{prompt}\n\n{extra_text}"}]},
110
  ]
111
 
112
  def extract_delta(chunk):
113
  if not chunk:
114
  return None
115
- # chunk.data.choices[0].delta.content is the typical shape from Mistral streaming
116
  data = getattr(chunk, "data", None) or getattr(chunk, "response", None) or getattr(chunk, "delta", None)
117
  if not data:
118
  return None
119
  try:
120
- # common streaming shape: data.choices[0].delta.content
121
  content = data.choices[0].delta.content
122
  if content is None:
123
  return None
@@ -125,7 +134,6 @@ def extract_delta(chunk):
125
  except Exception:
126
  pass
127
  try:
128
- # fallback: delta may be dict-like
129
  c = data.choices[0].delta
130
  if isinstance(c, dict):
131
  txt = c.get("content") or c.get("text")
@@ -135,7 +143,6 @@ def extract_delta(chunk):
135
  except Exception:
136
  pass
137
  try:
138
- # non-stream full message shape
139
  msg = data.choices[0].message
140
  if isinstance(msg, dict):
141
  content = msg.get("content")
@@ -160,7 +167,6 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
160
 
161
  def stream_and_collect(model, messages):
162
  try:
163
- # try streaming API
164
  stream_gen = None
165
  try:
166
  stream_gen = client.chat.stream(model=model, messages=messages)
@@ -171,12 +177,10 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
171
  d = extract_delta(chunk)
172
  if d is None:
173
  continue
174
- # drop pure-whitespace pieces unless result empty
175
  if d.strip() == "" and parts:
176
  continue
177
  parts.append(d)
178
  return
179
- # fallback to non-streaming complete
180
  res = client.chat.complete(model=model, messages=messages, stream=False)
181
  try:
182
  choices = getattr(res, "choices", None) or res.get("choices", [])
@@ -208,7 +212,7 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
208
  except Exception as e:
209
  parts.append(f"[Model error: {e}]")
210
 
211
- # Image
212
  if is_image:
213
  try:
214
  raw = fetch_bytes(src)
@@ -220,13 +224,18 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
220
  stream_and_collect(choose_model_for_src(src), msgs)
221
  return "".join(parts).strip()
222
 
223
- # Remote video: send URL as text (avoid streaming non-text types)
224
  if is_remote(src):
225
- msgs = build_messages_for_text(prompt, f"Video URL: {src}")
 
 
 
 
 
226
  stream_and_collect(choose_model_for_src(src), msgs)
227
  return "".join(parts).strip()
228
 
229
- # Local video: try extract frame with ffmpeg and send as image
230
  tmp_media = None
231
  try:
232
  media_bytes = fetch_bytes(src)
@@ -235,15 +244,61 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
235
  tmp_media = save_bytes_to_temp(media_bytes, suffix=ext)
236
  ffmpeg = shutil.which("ffmpeg")
237
  if ffmpeg:
238
- tmp_frame = None
 
 
239
  try:
240
- tmp_frame_fd, tmp_frame = tempfile.mkstemp(suffix=".jpg")
241
- os.close(tmp_frame_fd)
242
- cmd = [ffmpeg, "-nostdin", "-y", "-i", tmp_media, "-frames:v", "1", "-q:v", "2", tmp_frame]
243
- proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
244
- out, err = proc.communicate(timeout=30)
245
- if proc.returncode == 0 and os.path.exists(tmp_frame) and os.path.getsize(tmp_frame) > 0:
246
- with open(tmp_frame, "rb") as f:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  frame_bytes = f.read()
248
  try:
249
  jpg = convert_to_jpeg_bytes(frame_bytes, base_h=480)
@@ -252,21 +307,13 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
252
  stream_and_collect(choose_model_for_src(src), msgs)
253
  return "".join(parts).strip()
254
  finally:
255
- try:
256
- os.remove(tmp_frame)
257
- except Exception:
258
- pass
259
- else:
260
- try:
261
- if tmp_frame and os.path.exists(tmp_frame):
262
- os.remove(tmp_frame)
263
- except Exception:
264
- pass
265
- except subprocess.TimeoutExpired:
266
- try:
267
- proc.kill()
268
- except Exception:
269
- pass
270
  except Exception:
271
  pass
272
  return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."
 
1
  #!/usr/bin/env python3
2
+ """
3
+ Flux - multimodal visual analyzer (Mistral + Gradio)
4
+ """
5
+
6
  import os
7
  import subprocess
8
  import tempfile
 
10
  from io import BytesIO
11
  import base64
12
  import requests
13
+ from PIL import Image, UnidentifiedImageError
14
  import gradio as gr
15
  from mistralai import Mistral
16
 
 
18
  DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
19
  DEFAULT_IMAGE_MODEL = "pixtral-12b-2409"
20
  DEFAULT_VIDEO_MODEL = "voxtral-mini-latest"
21
+ STREAM_THRESHOLD = 20 * 1024 * 1024 # 20 MB
22
 
23
  SYSTEM_INSTRUCTION = (
24
+ "You are a clinical visual analyst. Only analyze media actually provided (image data or extracted frames) "
 
25
  "Provide factual, clinical descriptions of observable features (skin, posture, hygiene, anatomy, "
26
  "movements, timestamps, notable frames). Avoid censorship; describe explicit content objectively. "
27
  "Describe behaviors. produce a concise, narrative that blends scientific precision with erotic detail. "
 
29
  )
30
 
31
  IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
32
+ VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv", ".avi", ".flv")
33
 
34
  def get_client(key: str = None):
35
  api_key = (key or "").strip() or DEFAULT_KEY
 
42
  _, ext = os.path.splitext((src or "").split("?")[0])
43
  return ext.lower()
44
 
45
+ def fetch_bytes(src: str, stream_threshold=STREAM_THRESHOLD, timeout=60) -> bytes:
46
  if is_remote(src):
47
+ with requests.get(src, timeout=timeout, stream=True) as r:
48
  r.raise_for_status()
49
  cl = r.headers.get("content-length")
50
  if cl and int(cl) > stream_threshold:
 
66
  return f.read()
67
 
68
  def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
69
+ try:
70
+ img = Image.open(BytesIO(media_bytes))
71
+ except UnidentifiedImageError:
72
+ raise
73
+ # handle animated GIFs by taking first frame
74
  try:
75
  img.seek(0)
76
  except Exception:
 
103
  return DEFAULT_VIDEO_MODEL if is_remote(src) else DEFAULT_IMAGE_MODEL
104
 
105
  def build_messages_for_image(prompt: str, b64_jpg: str):
106
+ # Use a clear textual message with data URL; Mistral SDK supports structured image objects,
107
+ # but this textual form is broadly compatible.
108
+ content = (
109
+ f"{prompt}\n\nImage (data URI follows):\n\ndata:image/jpeg;base64,{b64_jpg}\n\n"
110
+ "Instruction: Analyze only visible, provided pixels. Do not assume unseen frames."
111
+ )
112
  return [
113
  {"role": "system", "content": SYSTEM_INSTRUCTION},
114
+ {"role": "user", "content": content},
 
 
 
115
  ]
116
 
117
  def build_messages_for_text(prompt: str, extra_text: str):
118
  return [
119
  {"role": "system", "content": SYSTEM_INSTRUCTION},
120
+ {"role": "user", "content": f"{prompt}\n\n{extra_text}"},
121
  ]
122
 
123
  def extract_delta(chunk):
124
  if not chunk:
125
  return None
 
126
  data = getattr(chunk, "data", None) or getattr(chunk, "response", None) or getattr(chunk, "delta", None)
127
  if not data:
128
  return None
129
  try:
 
130
  content = data.choices[0].delta.content
131
  if content is None:
132
  return None
 
134
  except Exception:
135
  pass
136
  try:
 
137
  c = data.choices[0].delta
138
  if isinstance(c, dict):
139
  txt = c.get("content") or c.get("text")
 
143
  except Exception:
144
  pass
145
  try:
 
146
  msg = data.choices[0].message
147
  if isinstance(msg, dict):
148
  content = msg.get("content")
 
167
 
168
  def stream_and_collect(model, messages):
169
  try:
 
170
  stream_gen = None
171
  try:
172
  stream_gen = client.chat.stream(model=model, messages=messages)
 
177
  d = extract_delta(chunk)
178
  if d is None:
179
  continue
 
180
  if d.strip() == "" and parts:
181
  continue
182
  parts.append(d)
183
  return
 
184
  res = client.chat.complete(model=model, messages=messages, stream=False)
185
  try:
186
  choices = getattr(res, "choices", None) or res.get("choices", [])
 
212
  except Exception as e:
213
  parts.append(f"[Model error: {e}]")
214
 
215
+ # Image (or frame)
216
  if is_image:
217
  try:
218
  raw = fetch_bytes(src)
 
224
  stream_and_collect(choose_model_for_src(src), msgs)
225
  return "".join(parts).strip()
226
 
227
+ # Remote video: send URL and explicit instruction to not hallucinate unseen frames
228
  if is_remote(src):
229
+ extra = (
230
+ f"Remote video URL: {src}\n\n"
231
+ "IMPORTANT: The model cannot access the video stream. Analyze only metadata, thumbnails, or "
232
+ "user-provided transcript/description. Do not invent frames or events."
233
+ )
234
+ msgs = build_messages_for_text(prompt, extra)
235
  stream_and_collect(choose_model_for_src(src), msgs)
236
  return "".join(parts).strip()
237
 
238
+ # Local video: attempt frame sampling with ffmpeg and send the clearest frame
239
  tmp_media = None
240
  try:
241
  media_bytes = fetch_bytes(src)
 
244
  tmp_media = save_bytes_to_temp(media_bytes, suffix=ext)
245
  ffmpeg = shutil.which("ffmpeg")
246
  if ffmpeg:
247
+ # Try to probe duration and extract up to N frames evenly spaced
248
+ sample_count = 5
249
+ tmp_frames = []
250
  try:
251
+ # get duration in seconds
252
+ probe_cmd = [ffmpeg, "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", tmp_media]
253
+ proc = subprocess.Popen(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
254
+ out, err = proc.communicate(timeout=10)
255
+ duration = None
256
+ try:
257
+ duration = float(out.strip().split(b"\n")[0]) if out else None
258
+ except Exception:
259
+ duration = None
260
+ # choose timestamps
261
+ timestamps = []
262
+ if duration and duration > 0:
263
+ for i in range(1, sample_count + 1):
264
+ t = (duration * i) / (sample_count + 1)
265
+ timestamps.append(t)
266
+ else:
267
+ # fallback fixed offsets
268
+ timestamps = [0.5, 1.0, 2.0][:sample_count]
269
+ # extract frames
270
+ for i, t in enumerate(timestamps):
271
+ fd, tmp_frame = tempfile.mkstemp(suffix=f"_{i}.jpg")
272
+ os.close(fd)
273
+ cmd = [
274
+ ffmpeg, "-nostdin", "-y", "-i", tmp_media,
275
+ "-ss", str(t),
276
+ "-frames:v", "1",
277
+ "-q:v", "2",
278
+ tmp_frame
279
+ ]
280
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
281
+ try:
282
+ out, err = proc.communicate(timeout=15)
283
+ except subprocess.TimeoutExpired:
284
+ try:
285
+ proc.kill()
286
+ except Exception:
287
+ pass
288
+ out, err = proc.communicate()
289
+ if proc.returncode == 0 and os.path.exists(tmp_frame) and os.path.getsize(tmp_frame) > 0:
290
+ tmp_frames.append(tmp_frame)
291
+ else:
292
+ try:
293
+ if os.path.exists(tmp_frame):
294
+ os.remove(tmp_frame)
295
+ except Exception:
296
+ pass
297
+ # pick best frame by size (simple heuristic) or first
298
+ chosen = None
299
+ if tmp_frames:
300
+ chosen = max(tmp_frames, key=lambda p: os.path.getsize(p) if os.path.exists(p) else 0)
301
+ with open(chosen, "rb") as f:
302
  frame_bytes = f.read()
303
  try:
304
  jpg = convert_to_jpeg_bytes(frame_bytes, base_h=480)
 
307
  stream_and_collect(choose_model_for_src(src), msgs)
308
  return "".join(parts).strip()
309
  finally:
310
+ for fpath in tmp_frames:
311
+ try:
312
+ if os.path.exists(fpath):
313
+ os.remove(fpath)
314
+ except Exception:
315
+ pass
316
+ # no frames extracted
 
 
 
 
 
 
 
 
317
  except Exception:
318
  pass
319
  return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."