Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,23 +9,28 @@ import requests
|
|
| 9 |
import gradio as gr
|
| 10 |
from mistralai import Mistral
|
| 11 |
|
|
|
|
| 12 |
DEFAULT_KEY = os.getenv("MISTRAL_API_KEY")
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def get_client(alt_key: str = None):
    """Return a Mistral client for the caller's key or the default key.

    ``alt_key`` is used when it is non-blank after stripping whitespace;
    otherwise the module-level ``DEFAULT_KEY`` is passed to ``Mistral``.
    """
    override = alt_key.strip() if alt_key else ""
    api_key = override if override else DEFAULT_KEY
    return Mistral(api_key=api_key)
|
| 17 |
|
| 18 |
def is_remote(s: str):
    """Return True when ``s`` starts with ``http://`` or ``https://``.

    Empty or ``None`` input yields False instead of raising
    ``AttributeError``.
    """
    return bool(s) and s.startswith(("http://", "https://"))
|
| 20 |
|
| 21 |
def fetch_bytes(src: str):
|
| 22 |
if is_remote(src):
|
| 23 |
-
r = requests.get(src, timeout=
|
| 24 |
r.raise_for_status()
|
| 25 |
return r.content
|
| 26 |
with open(src, "rb") as f:
|
| 27 |
return f.read()
|
| 28 |
|
|
|
|
| 29 |
def try_ffmpeg_extract_frame(in_path: str, out_path: str):
|
| 30 |
ffmpeg = shutil.which("ffmpeg")
|
| 31 |
if not ffmpeg:
|
|
@@ -80,76 +85,198 @@ def convert_to_jpeg_bytes(media_bytes: bytes, filename_hint: str = "input"):
|
|
| 80 |
|
| 81 |
def to_b64_jpeg(img_bytes: bytes):
    """Base64-encode ``img_bytes`` and return the result as a UTF-8 string."""
    encoded = base64.b64encode(img_bytes)
    return str(encoded, "utf-8")
|
|
|
|
| 83 |
|
| 84 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
try:
|
| 86 |
-
|
| 87 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 88 |
except Exception as e:
|
| 89 |
-
|
| 90 |
-
return
|
| 91 |
|
| 92 |
-
|
|
|
|
|
|
|
| 93 |
prompt_text = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else
|
| 94 |
-
"Provide a detailed, neutral, clinical-style description
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
|
| 105 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 106 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
partial = ""
|
| 108 |
-
for chunk in client.chat.stream(model=
|
| 109 |
-
if chunk.data.choices[0].delta.content is not None:
|
| 110 |
partial += chunk.data.choices[0].delta.content
|
| 111 |
yield partial
|
|
|
|
| 112 |
except Exception as e:
|
| 113 |
-
yield f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
|
| 115 |
-
|
|
|
|
| 116 |
gr.Markdown("Image/Video to Clinical Description (custom prompt optional)")
|
| 117 |
|
| 118 |
with gr.Row():
|
| 119 |
with gr.Column(scale=1):
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
preview = gr.Image(label="Preview", type="pil")
|
| 124 |
url_input = gr.Textbox(label="Image/Video URL", placeholder="https://...")
|
| 125 |
custom = gr.Textbox(label="Custom prompt (optional)", lines=4, placeholder="Enter custom prompt to override default")
|
|
|
|
| 126 |
submit = gr.Button("Submit")
|
| 127 |
with gr.Column(scale=1):
|
| 128 |
-
# Streamed text area rendered as HTML/text block filling right column
|
| 129 |
output_display = gr.Markdown("", elem_id="generated_output")
|
| 130 |
|
| 131 |
def load_preview(url):
|
| 132 |
if not url:
|
| 133 |
-
return None
|
|
|
|
| 134 |
try:
|
| 135 |
r = requests.get(url, timeout=30)
|
| 136 |
r.raise_for_status()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
img = Image.open(BytesIO(r.content)).convert("RGB")
|
| 138 |
-
return img
|
| 139 |
except Exception:
|
| 140 |
-
|
|
|
|
| 141 |
|
| 142 |
-
def start_gen(url, custom_p, alt_k):
|
| 143 |
if not url:
|
| 144 |
return "No URL provided."
|
| 145 |
-
# produce full combined text for Markdown via streaming
|
| 146 |
text = ""
|
| 147 |
-
for chunk in
|
| 148 |
text += chunk
|
| 149 |
yield text
|
| 150 |
-
# final yield already returned in loop
|
| 151 |
|
| 152 |
-
url_input.change(fn=load_preview, inputs=[url_input], outputs=[
|
| 153 |
-
submit.click(fn=start_gen, inputs=[url_input, custom, alt_key], outputs=[output_display])
|
| 154 |
|
| 155 |
-
|
|
|
|
|
|
| 9 |
import gradio as gr
|
| 10 |
from mistralai import Mistral
|
| 11 |
|
| 12 |
+
# --- Configuration ---
|
| 13 |
DEFAULT_KEY = os.getenv("MISTRAL_API_KEY")
|
| 14 |
+
DEFAULT_MODEL_IMAGE = "pixtral-12b-2409" # image-only model (default for images)
|
| 15 |
+
DEFAULT_MODEL_VIDEO = "voxtral-mini-latest" # audio/video-capable model (Voxtral)
|
| 16 |
+
# ---------------------
|
| 17 |
|
| 18 |
def get_client(alt_key: str = None):
    """Build a Mistral client, preferring a caller-supplied API key.

    A non-blank ``alt_key`` (after stripping whitespace) wins; otherwise the
    module-level ``DEFAULT_KEY`` is used.
    """
    candidate = alt_key.strip() if alt_key else ""
    if not candidate:
        candidate = DEFAULT_KEY
    return Mistral(api_key=candidate)
|
| 21 |
|
| 22 |
def is_remote(s: str):
    """Tell whether ``s`` is a non-empty string with an http(s) scheme prefix."""
    if not s:
        return False
    return s.startswith(("http://", "https://"))
|
| 24 |
|
| 25 |
def fetch_bytes(src: str):
    """Return the raw bytes behind ``src``.

    Sources that ``is_remote`` accepts are downloaded with a 60-second
    timeout, raising on an HTTP error status. Anything else is opened as a
    local file in binary mode.
    """
    if not is_remote(src):
        with open(src, "rb") as handle:
            return handle.read()
    response = requests.get(src, timeout=60)
    response.raise_for_status()
    return response.content
|
| 32 |
|
| 33 |
+
# ---------------- image conversion utilities (kept from your original) ----------------
|
| 34 |
def try_ffmpeg_extract_frame(in_path: str, out_path: str):
|
| 35 |
ffmpeg = shutil.which("ffmpeg")
|
| 36 |
if not ffmpeg:
|
|
|
|
| 85 |
|
| 86 |
def to_b64_jpeg(img_bytes: bytes):
    """Encode ``img_bytes`` as base64 and hand it back as text."""
    b64_bytes = base64.b64encode(img_bytes)
    return str(b64_bytes, "utf-8")
|
| 88 |
+
# --------------------------------------------------------------------------------------
|
| 89 |
|
| 90 |
+
# ---------------- audio/video helpers ----------------
|
| 91 |
+
def model_supports_audio(model_name: str) -> bool:
    """Guess from the model name whether it takes audio/video input.

    Returns True when the lower-cased name contains "voxtral", "audio" or
    "video"; an empty or ``None`` name yields False.
    """
    if not model_name:
        return False
    lowered = model_name.lower()
    return any(marker in lowered for marker in ("voxtral", "audio", "video"))
|
| 96 |
+
|
| 97 |
+
def save_remote_to_temp(url: str, suffix: str = "") -> str:
    """Fetch ``url`` via ``fetch_bytes`` and store it in a new temp file.

    Args:
        url: Source to fetch; anything ``fetch_bytes`` accepts.
        suffix: File suffix for the temp file. When empty, the extension of
            the URL's path component is used (possibly empty).

    Returns:
        Path of the newly created temp file. The caller must delete it.

    Raises:
        Whatever ``fetch_bytes`` raises, or ``OSError`` if the file cannot
        be written (the partial temp file is removed first).
    """
    from urllib.parse import urlparse

    data = fetch_bytes(url)
    if not suffix:
        # Use only the path part: a raw splitext on the URL would drag any
        # "?query" text (which may contain "/") into the temp-file name.
        suffix = os.path.splitext(urlparse(url).path)[1]
    fd, path = tempfile.mkstemp(suffix=suffix)
    try:
        # fdopen takes ownership of fd and closes it when the block exits.
        with os.fdopen(fd, "wb") as f:
            f.write(data)
    except Exception:
        # Do not leave a half-written temp file behind.
        try:
            os.remove(path)
        except OSError:
            pass
        raise
    return path
|
| 104 |
+
|
| 105 |
+
def ffmpeg_extract_audio(in_path: str, out_path: str):
    """Write the audio of ``in_path`` to ``out_path`` as mono 16 kHz WAV.

    Runs the ``ffmpeg`` found on PATH with its output discarded and a
    120-second timeout, then returns ``out_path``.

    Raises:
        RuntimeError: if no ``ffmpeg`` executable is found.
        subprocess.CalledProcessError: if ffmpeg exits non-zero.
        subprocess.TimeoutExpired: if ffmpeg exceeds the timeout.
    """
    executable = shutil.which("ffmpeg")
    if not executable:
        raise RuntimeError("ffmpeg not available in runtime")
    # -vn drops video; -ar/-ac give 16 kHz mono for transcription robustness.
    args = [
        executable,
        "-y",
        "-i", in_path,
        "-vn",
        "-ar", "16000",
        "-ac", "1",
        "-f", "wav",
        out_path,
    ]
    subprocess.run(
        args,
        check=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.DEVNULL,
        timeout=120,
    )
    return out_path
|
| 113 |
+
|
| 114 |
+
def transcribe_audio_with_client(client, model, audio_bytes: bytes, language: str = None):
    """Transcribe WAV audio through ``client.audio.transcriptions.complete``.

    Args:
        client: Object exposing ``audio.transcriptions.complete(...)``.
        model: Model name forwarded to the transcription call.
        audio_bytes: Audio payload, sent as a file named ``audio.wav``.
        language: Optional language hint forwarded unchanged.

    Returns:
        The ``text`` entry of the response (dict key or attribute), or an
        empty string when it is missing or ``None``.

    Raises:
        Any exception raised by the client call propagates to the caller.
    """
    resp = client.audio.transcriptions.complete(
        model=model,
        file={"content": BytesIO(audio_bytes), "file_name": "audio.wav"},
        language=language,
    )
    if isinstance(resp, dict):
        text = resp.get("text")
    else:
        text = getattr(resp, "text", None)
    # Normalise a missing/None text so callers never interpolate "None".
    return text or ""
|
|
|
|
| 127 |
|
| 128 |
+
# ---------------- streaming & processing ----------------
|
| 129 |
+
def _looks_like_image(media_src: str) -> bool:
    """Return True when ``media_src`` ends in a known image extension."""
    from urllib.parse import urlparse

    if not media_src:
        return False
    # For URLs, look at the path only so "?query" / "#fragment" text does not
    # hide the extension.
    target = urlparse(media_src).path if is_remote(media_src) else media_src
    return target.lower().endswith((".jpg", ".jpeg", ".png", ".webp", ".gif"))


def _stream_chat_text(client, model, messages):
    """Yield the cumulative reply text after each content-bearing chunk."""
    partial = ""
    for chunk in client.chat.stream(model=model, messages=messages):
        data = getattr(chunk, "data", None)
        if not data:
            continue
        delta = data.choices[0].delta.content
        if delta is not None:
            partial += delta
            yield partial


def generate_stream_multimedia(media_src: str, custom_prompt: str, alt_key: str, model: str = DEFAULT_MODEL_VIDEO):
    """Stream a model-generated description of an image or video source.

    The source is handled by the first strategy that applies:

    1. Image (by file extension): converted to JPEG and sent to
       ``DEFAULT_MODEL_IMAGE`` as a base64 data URL.
    2. Remote URL with an audio/video-capable ``model``: the URL is sent
       directly as a video content block.
    3. Fallback: the media is saved to a temp file, its audio extracted with
       ffmpeg and transcribed, and the transcript sent to a chat model.

    Args:
        media_src: URL or local path of the media.
        custom_prompt: Optional prompt; blank/None selects the default prompt.
        alt_key: Optional API key forwarded to ``get_client``.
        model: Model used for the video and fallback paths.

    Yields:
        The cumulative response text so far (each item supersedes the
        previous one), or a single error-message string on failure.
    """
    if not media_src:
        yield "No media source provided."
        return

    client = get_client(alt_key)
    custom = (custom_prompt or "").strip()
    prompt_text = custom or (
        "Provide a detailed, neutral, clinical-style description focusing on observable non-sexual features, hygiene, skin condition, posture, and general anatomy. Keep language professional."
    )

    # --- 1. Image path -----------------------------------------------------
    if _looks_like_image(media_src):
        try:
            raw = fetch_bytes(media_src)
            jpg = convert_to_jpeg_bytes(raw, filename_hint=os.path.basename(media_src) or "input")
        except Exception as e:
            yield f"Error processing image: {e}"
            return
        b64 = to_b64_jpeg(jpg)
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64}"},
            ],
        }]
        try:
            yield from _stream_chat_text(client, DEFAULT_MODEL_IMAGE, messages)
        except Exception as e:
            yield f"Model error (image): {e}"
        return

    # --- 2. Direct video URL -------------------------------------------------
    if model_supports_audio(model) and is_remote(media_src):
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "video", "url": media_src},
            ],
        }]
        try:
            yield from _stream_chat_text(client, model, messages)
            return
        except Exception:
            # Deliberate best-effort: if the direct video request fails, fall
            # through to the audio extraction/transcription path below.
            pass

    # --- 3. Fallback: download, extract audio, transcribe, describe --------
    tmp_media = None
    tmp_audio = None
    try:
        tmp_media = save_remote_to_temp(media_src, suffix=".mp4")
        # mkstemp creates the file atomically (mktemp is deprecated and
        # race-prone); ffmpeg is run with -y so it overwrites the placeholder.
        fd, tmp_audio = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        ffmpeg_extract_audio(tmp_media, tmp_audio)
        with open(tmp_audio, "rb") as f:
            audio_bytes = f.read()

        # The transcription endpoint needs an audio-capable model even when
        # the caller selected the image model.
        transcribe_model = model if model_supports_audio(model) else DEFAULT_MODEL_VIDEO
        try:
            transcript = transcribe_audio_with_client(client, transcribe_model, audio_bytes)
        except Exception as e:
            yield f"Transcription error: {e}"
            return

        chat_model = model if model_supports_audio(model) else DEFAULT_MODEL_IMAGE
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": f"{prompt_text}\n\nTranscript:\n{transcript}"},
            ],
        }]
        yield from _stream_chat_text(client, chat_model, messages)
    except Exception as e:
        yield f"Error processing media/audio fallback: {e}"
    finally:
        # Best-effort cleanup of temp files; a failed delete must not mask
        # the real outcome.
        for p in (tmp_media, tmp_audio):
            try:
                if p and os.path.exists(p):
                    os.remove(p)
            except Exception:
                pass
|
| 228 |
|
| 229 |
+
# ---------------- Gradio UI ----------------
|
| 230 |
+
# Gradio UI: left column holds the inputs and previews, right column shows the
# streamed description.
with gr.Blocks(title="Image/Video to Clinical Description") as demo:
    gr.Markdown("Image/Video to Clinical Description (custom prompt optional)")

    with gr.Row():
        with gr.Column(scale=1):
            alt_key = gr.Textbox(label="Mistral API Key (optional)", type="password", max_lines=1)
            preview_img = gr.Image(label="Preview image (first frame)", type="pil")
            preview_video = gr.HTML("<div style='color:gray'>Video preview will appear here when a video URL is provided.</div>")
            url_input = gr.Textbox(label="Image/Video URL", placeholder="https://...")
            custom = gr.Textbox(label="Custom prompt (optional)", lines=4, placeholder="Enter custom prompt to override default")
            model_select = gr.Dropdown(label="Model", choices=[DEFAULT_MODEL_IMAGE, DEFAULT_MODEL_VIDEO], value=DEFAULT_MODEL_VIDEO)
            submit = gr.Button("Submit")
        with gr.Column(scale=1):
            output_display = gr.Markdown("", elem_id="generated_output")

    def load_preview(url):
        """Build the preview for ``url``.

        Returns a ``(image, html)`` pair for ``preview_img`` and
        ``preview_video``: a PIL image plus a note for images, or ``None``
        plus a ``<video>`` tag for videos. Any failure yields ``None`` and a
        red error note.
        """
        from html import escape

        if not url:
            return None, "<div style='color:gray'>No URL provided.</div>"
        try:
            # stream=True defers the body download, so a video is never
            # pulled in full just to emit a <video> tag.
            with requests.get(url, timeout=30, stream=True) as r:
                r.raise_for_status()
                content_type = r.headers.get("content-type", "")
                if content_type.startswith("video/") or url.lower().endswith((".mp4", ".mov", ".webm", ".mkv")):
                    # Escape both values: they come from the user / remote
                    # server and are placed inside HTML attributes.
                    safe_url = escape(url, quote=True)
                    safe_type = escape(content_type or "video/mp4", quote=True)
                    video_html = f"""
                    <video controls style="max-width:100%;height:auto;">
                    <source src="{safe_url}" type="{safe_type}">
                    Your browser does not support the video tag.
                    </video>
                    """
                    return None, video_html
                # Otherwise treat the body as an image; .content reads it now.
                img = Image.open(BytesIO(r.content)).convert("RGB")
            return img, "<div style='color:gray'>Image preview shown. If this is a video, server didn't report video content-type.</div>"
        except Exception:
            # Preview is best-effort: report the failure instead of raising.
            return None, "<div style='color:red'>Preview failed to load.</div>"

    def start_gen(url, custom_p, alt_k, model_name):
        """Stream the generated description for ``url`` into the output pane.

        Yields the cumulative text produced by ``generate_stream_multimedia``;
        each yielded value replaces the previous display content.
        """
        if not url:
            # This function is a generator, so the message must be yielded;
            # a plain ``return value`` would never reach the UI.
            yield "No URL provided."
            return
        # The upstream generator already yields the full text so far, so it is
        # forwarded as-is rather than appended to a local buffer.
        yield from generate_stream_multimedia(url, custom_p, alt_k, model=model_name)

    url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_img, preview_video])
    submit.click(fn=start_gen, inputs=[url_input, custom, alt_key, model_select], outputs=[output_display])

if __name__ == "__main__":
    demo.launch()
|