Hug0endob commited on
Commit
28f418e
·
verified ·
1 Parent(s): 8361fdd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -38
app.py CHANGED
@@ -9,23 +9,28 @@ import requests
9
  import gradio as gr
10
  from mistralai import Mistral
11
 
 
12
  DEFAULT_KEY = os.getenv("MISTRAL_API_KEY")
 
 
 
13
 
14
  def get_client(alt_key: str = None):
15
  key = (alt_key or "").strip() or DEFAULT_KEY
16
  return Mistral(api_key=key)
17
 
18
  def is_remote(s: str):
19
- return s.startswith("http://") or s.startswith("https://")
20
 
21
  def fetch_bytes(src: str):
22
  if is_remote(src):
23
- r = requests.get(src, timeout=30)
24
  r.raise_for_status()
25
  return r.content
26
  with open(src, "rb") as f:
27
  return f.read()
28
 
 
29
  def try_ffmpeg_extract_frame(in_path: str, out_path: str):
30
  ffmpeg = shutil.which("ffmpeg")
31
  if not ffmpeg:
@@ -80,76 +85,198 @@ def convert_to_jpeg_bytes(media_bytes: bytes, filename_hint: str = "input"):
80
 
81
  def to_b64_jpeg(img_bytes: bytes):
82
  return base64.b64encode(img_bytes).decode("utf-8")
 
83
 
84
- def generate_stream(image_src: str, custom_prompt: str, alt_key: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  try:
86
- raw = fetch_bytes(image_src)
87
- jpg = convert_to_jpeg_bytes(raw, filename_hint=os.path.basename(image_src) or "input")
 
 
 
 
 
 
88
  except Exception as e:
89
- yield f"Error processing media: {e}"
90
- return
91
 
92
- b64 = to_b64_jpeg(jpg)
 
 
93
  prompt_text = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else
94
- "Provide a detailed, neutral, clinical-style description of the image focusing on observable non-sexual features, hygiene, skin condition, posture, and general anatomy. Keep language professional.")
95
- model = "pixtral-12b-2409"
96
- messages = [{
97
- "role": "user",
98
- "content": [
99
- {"type": "text", "text": prompt_text},
100
- {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64}"}
101
- ],
102
- "stream": False
103
- }]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- client = get_client(alt_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  partial = ""
108
- for chunk in client.chat.stream(model=model, messages=messages):
109
- if chunk.data.choices[0].delta.content is not None:
110
  partial += chunk.data.choices[0].delta.content
111
  yield partial
 
112
  except Exception as e:
113
- yield f"Model error: {e}"
 
 
 
 
 
 
 
114
 
115
- with gr.Blocks() as demo:
 
116
  gr.Markdown("Image/Video to Clinical Description (custom prompt optional)")
117
 
118
  with gr.Row():
119
  with gr.Column(scale=1):
120
- # Minimal API key field
121
- alt_key = gr.Textbox(label="API Key (optional)", type="password", max_lines=1)
122
- # Preview on top
123
- preview = gr.Image(label="Preview", type="pil")
124
  url_input = gr.Textbox(label="Image/Video URL", placeholder="https://...")
125
  custom = gr.Textbox(label="Custom prompt (optional)", lines=4, placeholder="Enter custom prompt to override default")
 
126
  submit = gr.Button("Submit")
127
  with gr.Column(scale=1):
128
- # Streamed text area rendered as HTML/text block filling right column
129
  output_display = gr.Markdown("", elem_id="generated_output")
130
 
131
  def load_preview(url):
132
  if not url:
133
- return None
 
134
  try:
135
  r = requests.get(url, timeout=30)
136
  r.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
137
  img = Image.open(BytesIO(r.content)).convert("RGB")
138
- return img
139
  except Exception:
140
- return None
 
141
 
142
- def start_gen(url, custom_p, alt_k):
143
  if not url:
144
  return "No URL provided."
145
- # produce full combined text for Markdown via streaming
146
  text = ""
147
- for chunk in generate_stream(url, custom_p, alt_k):
148
  text += chunk
149
  yield text
150
- # final yield already returned in loop
151
 
152
- url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview])
153
- submit.click(fn=start_gen, inputs=[url_input, custom, alt_key], outputs=[output_display])
154
 
155
- demo.launch()
 
 
9
  import gradio as gr
10
  from mistralai import Mistral
11
 
12
+ # --- Configuration ---
13
  DEFAULT_KEY = os.getenv("MISTRAL_API_KEY")
14
+ DEFAULT_MODEL_IMAGE = "pixtral-12b-2409" # image-only model (default for images)
15
+ DEFAULT_MODEL_VIDEO = "voxtral-mini-latest" # audio/video-capable model (Voxtral)
16
+ # ---------------------
17
 
18
def get_client(alt_key: str = None):
    """Build a Mistral client, preferring a user-supplied key over the env default."""
    chosen = (alt_key or "").strip() or DEFAULT_KEY
    return Mistral(api_key=chosen)
21
 
22
def is_remote(s: str):
    """Return True when *s* is an http(s) URL; False for empty/None or local paths."""
    if not s:
        return False
    return s.startswith(("http://", "https://"))
24
 
25
def fetch_bytes(src: str):
    """Return the raw bytes of *src*, whether a remote URL or a local file path."""
    if not is_remote(src):
        with open(src, "rb") as fh:
            return fh.read()
    resp = requests.get(src, timeout=60)
    resp.raise_for_status()
    return resp.content
32
 
33
+ # ---------------- image conversion utilities (kept from your original) ----------------
34
  def try_ffmpeg_extract_frame(in_path: str, out_path: str):
35
  ffmpeg = shutil.which("ffmpeg")
36
  if not ffmpeg:
 
85
 
86
def to_b64_jpeg(img_bytes: bytes):
    """Encode raw JPEG bytes as an ASCII base64 string for a data URL."""
    return base64.b64encode(img_bytes).decode("ascii")
88
+ # --------------------------------------------------------------------------------------
89
 
90
+ # ---------------- audio/video helpers ----------------
91
def model_supports_audio(model_name: str) -> bool:
    """Heuristic: does the model name suggest audio/video capability (e.g. Voxtral)?"""
    if not model_name:
        return False
    lowered = model_name.lower()
    return any(tag in lowered for tag in ("voxtral", "audio", "video"))
96
+
97
def save_remote_to_temp(url: str, suffix: str = "") -> str:
    """Download *url* (or read a local path) into a fresh temp file.

    Returns the temp-file path; the caller is responsible for deleting it.
    """
    payload = fetch_bytes(url)
    # Fall back to the URL's own extension when no explicit suffix is given.
    chosen_suffix = suffix or os.path.splitext(url)[1] or ""
    fd, path = tempfile.mkstemp(suffix=chosen_suffix)
    os.close(fd)
    with open(path, "wb") as out:
        out.write(payload)
    return path
104
+
105
def ffmpeg_extract_audio(in_path: str, out_path: str):
    """Extract the audio track of *in_path* into *out_path* as mono 16 kHz WAV.

    Raises RuntimeError when ffmpeg is missing; subprocess errors
    (CalledProcessError / TimeoutExpired) propagate on conversion failure.
    """
    ffmpeg_bin = shutil.which("ffmpeg")
    if ffmpeg_bin is None:
        raise RuntimeError("ffmpeg not available in runtime")
    # Mono 16 kHz WAV is the most robust input format for transcription.
    command = [
        ffmpeg_bin, "-y", "-i", in_path,
        "-vn", "-ar", "16000", "-ac", "1", "-f", "wav", out_path,
    ]
    subprocess.run(command, check=True, stdout=subprocess.DEVNULL,
                   stderr=subprocess.DEVNULL, timeout=120)
    return out_path
113
+
114
def transcribe_audio_with_client(client, model, audio_bytes: bytes, language: str = None):
    """Transcribe *audio_bytes* via the Mistral audio transcription endpoint.

    Parameters:
        client: a Mistral client (see get_client()).
        model: name of a transcription-capable model.
        audio_bytes: raw audio payload (WAV recommended).
        language: optional language hint forwarded to the API.

    Returns the transcript text, or "" when the response carries none.
    Exceptions from the API call propagate to the caller unchanged.
    (The original wrapped the body in `try: ... except Exception as e: raise`,
    a no-op catch-and-reraise with an unused binding — removed.)
    """
    # The mistralai client accepts a file-like object for "file".
    bio = BytesIO(audio_bytes)
    resp = client.audio.transcriptions.complete(
        model=model,
        file={"content": bio, "file_name": "audio.wav"},
        language=language,  # language is optional
    )
    # Responses may be plain dicts or objects exposing a .text attribute.
    if isinstance(resp, dict):
        return resp.get("text", "")
    return getattr(resp, "text", "")
 
127
 
128
# ---------------- streaming & processing ----------------
_IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")


def _stream_chat(client, model, messages):
    """Yield cumulative text as chunks arrive from a streaming chat call."""
    partial = ""
    for chunk in client.chat.stream(model=model, messages=messages):
        data = getattr(chunk, "data", None)
        if data and data.choices[0].delta.content is not None:
            partial += data.choices[0].delta.content
            yield partial


def generate_stream_multimedia(media_src: str, custom_prompt: str, alt_key: str, model: str = DEFAULT_MODEL_VIDEO):
    """Stream a generated description for an image or video source.

    Yields progressively longer text (each yield is the full text so far),
    suitable for a streaming UI. Routing:
      1. image extension       -> convert to JPEG, send to the image model;
      2. audio model + remote  -> try a direct video-URL chat message;
      3. fallback              -> download, extract audio via ffmpeg,
                                  transcribe, describe the transcript.
    Errors are yielded as text rather than raised, so the UI always shows
    something.
    """
    client = get_client(alt_key)
    prompt_text = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else
                   "Provide a detailed, neutral, clinical-style description focusing on observable non-sexual features, hygiene, skin condition, posture, and general anatomy. Keep language professional.")

    # Route purely by file extension (works for URLs and local paths alike).
    # The original expression duplicated this check with confusing
    # `or`/`and` precedence; a single endswith covers both cases.
    is_image_ext = (media_src or "").lower().endswith(_IMAGE_EXTS)

    if is_image_ext:
        try:
            raw = fetch_bytes(media_src)
            jpg = convert_to_jpeg_bytes(raw, filename_hint=os.path.basename(media_src) or "input")
        except Exception as e:
            yield f"Error processing image: {e}"
            return
        b64 = to_b64_jpeg(jpg)
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64}"}
            ],
            "stream": False
        }]
        try:
            yield from _stream_chat(client, DEFAULT_MODEL_IMAGE, messages)
        except Exception as e:
            yield f"Model error (image): {e}"
        return

    # If the model is audio/video-capable and the input is a remote URL,
    # try sending the video URL directly in the chat message.
    if model_supports_audio(model) and is_remote(media_src):
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "video", "url": media_src}
            ],
            "stream": False
        }]
        try:
            yield from _stream_chat(client, model, messages)
            return
        except Exception:
            # Direct video URL unsupported -- fall through to audio
            # extraction/transcription below.
            pass

    # Fallback: download media, extract audio, transcribe, then send the
    # transcript + prompt to a chat model for the streamed description.
    tmp_media = None
    tmp_audio = None
    try:
        tmp_media = save_remote_to_temp(media_src, suffix=".mp4")
        # mkstemp instead of the deprecated, race-prone tempfile.mktemp:
        # the file is created atomically; ffmpeg -y overwrites it.
        audio_fd, tmp_audio = tempfile.mkstemp(suffix=".wav")
        os.close(audio_fd)
        ffmpeg_extract_audio(tmp_media, tmp_audio)
        with open(tmp_audio, "rb") as f:
            audio_bytes = f.read()
        try:
            transcript = transcribe_audio_with_client(client, model, audio_bytes)
        except Exception as e:
            yield f"Transcription error: {e}"
            return
        # Text-only follow-up: keep the audio-capable model when selected,
        # otherwise fall back to the default image/chat model.
        chat_model = model if model_supports_audio(model) else DEFAULT_MODEL_IMAGE
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": f"{prompt_text}\n\nTranscript:\n{transcript}"}
            ],
            "stream": False
        }]
        yield from _stream_chat(client, chat_model, messages)
    except Exception as e:
        yield f"Error processing media/audio fallback: {e}"
    finally:
        # Best-effort cleanup of temporary artifacts.
        for p in (tmp_media, tmp_audio):
            try:
                if p and os.path.exists(p):
                    os.remove(p)
            except Exception:
                pass
228
 
229
# ---------------- Gradio UI ----------------
with gr.Blocks(title="Image/Video to Clinical Description") as demo:
    gr.Markdown("Image/Video to Clinical Description (custom prompt optional)")

    with gr.Row():
        with gr.Column(scale=1):
            alt_key = gr.Textbox(label="Mistral API Key (optional)", type="password", max_lines=1)
            preview_img = gr.Image(label="Preview image (first frame)", type="pil")
            preview_video = gr.HTML("<div style='color:gray'>Video preview will appear here when a video URL is provided.</div>")
            url_input = gr.Textbox(label="Image/Video URL", placeholder="https://...")
            custom = gr.Textbox(label="Custom prompt (optional)", lines=4, placeholder="Enter custom prompt to override default")
            model_select = gr.Dropdown(label="Model", choices=[DEFAULT_MODEL_IMAGE, DEFAULT_MODEL_VIDEO], value=DEFAULT_MODEL_VIDEO)
            submit = gr.Button("Submit")
        with gr.Column(scale=1):
            output_display = gr.Markdown("", elem_id="generated_output")

    def load_preview(url):
        """Return (PIL image or None, HTML string) previews for *url*."""
        if not url:
            return None, "<div style='color:gray'>No URL provided.</div>"
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            content_type = r.headers.get("content-type", "")
            # Video: build an HTML5 <video> preview instead of an image.
            if content_type.startswith("video/") or any(url.lower().endswith(ext) for ext in (".mp4", ".mov", ".webm", ".mkv")):
                # Escape the URL before embedding it in markup (it is
                # untrusted user input going into an HTML attribute).
                import html as _html
                safe_url = _html.escape(url, quote=True)
                video_html = f"""
                <video controls style="max-width:100%;height:auto;">
                    <source src="{safe_url}" type="{content_type or 'video/mp4'}">
                    Your browser does not support the video tag.
                </video>
                """
                return None, video_html
            # Otherwise treat the payload as an image.
            img = Image.open(BytesIO(r.content)).convert("RGB")
            return img, "<div style='color:gray'>Image preview shown. If this is a video, server didn't report video content-type.</div>"
        except Exception:
            return None, "<div style='color:red'>Preview failed to load.</div>"

    def start_gen(url, custom_p, alt_k, model_name):
        """Stream the generated description into the Markdown output."""
        if not url:
            # BUG FIX: this function is a generator, so a bare
            # `return "No URL provided."` would never reach the UI --
            # the message must be yielded before stopping.
            yield "No URL provided."
            return
        # BUG FIX: generate_stream_multimedia already yields cumulative
        # text, so the old `text += chunk` duplicated earlier output
        # (a, a+ab, ...). Forward each cumulative snapshot directly.
        for snapshot in generate_stream_multimedia(url, custom_p, alt_k, model=model_name):
            yield snapshot

    url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_img, preview_video])
    submit.click(fn=start_gen, inputs=[url_input, custom, alt_key, model_select], outputs=[output_display])

if __name__ == "__main__":
    demo.launch()