Hug0endob commited on
Commit
28f418e
·
verified ·
1 Parent(s): 8361fdd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +165 -38
app.py CHANGED
@@ -9,23 +9,28 @@ import requests
9
  import gradio as gr
10
  from mistralai import Mistral
11
 
 
12
  DEFAULT_KEY = os.getenv("MISTRAL_API_KEY")
 
 
 
13
 
14
  def get_client(alt_key: str = None):
15
  key = (alt_key or "").strip() or DEFAULT_KEY
16
  return Mistral(api_key=key)
17
 
18
  def is_remote(s: str):
19
- return s.startswith("http://") or s.startswith("https://")
20
 
21
  def fetch_bytes(src: str):
22
  if is_remote(src):
23
- r = requests.get(src, timeout=30)
24
  r.raise_for_status()
25
  return r.content
26
  with open(src, "rb") as f:
27
  return f.read()
28
 
 
29
  def try_ffmpeg_extract_frame(in_path: str, out_path: str):
30
  ffmpeg = shutil.which("ffmpeg")
31
  if not ffmpeg:
@@ -80,76 +85,198 @@ def convert_to_jpeg_bytes(media_bytes: bytes, filename_hint: str = "input"):
80
 
81
  def to_b64_jpeg(img_bytes: bytes):
82
  return base64.b64encode(img_bytes).decode("utf-8")
 
83
 
84
- def generate_stream(image_src: str, custom_prompt: str, alt_key: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  try:
86
- raw = fetch_bytes(image_src)
87
- jpg = convert_to_jpeg_bytes(raw, filename_hint=os.path.basename(image_src) or "input")
 
 
 
 
 
 
88
  except Exception as e:
89
- yield f"Error processing media: {e}"
90
- return
91
 
92
- b64 = to_b64_jpeg(jpg)
 
 
93
  prompt_text = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else
94
- "Provide a detailed, neutral, clinical-style description of the image focusing on observable non-sexual features, hygiene, skin condition, posture, and general anatomy. Keep language professional.")
95
- model = "pixtral-12b-2409"
96
- messages = [{
97
- "role": "user",
98
- "content": [
99
- {"type": "text", "text": prompt_text},
100
- {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64}"}
101
- ],
102
- "stream": False
103
- }]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- client = get_client(alt_key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  try:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  partial = ""
108
- for chunk in client.chat.stream(model=model, messages=messages):
109
- if chunk.data.choices[0].delta.content is not None:
110
  partial += chunk.data.choices[0].delta.content
111
  yield partial
 
112
  except Exception as e:
113
- yield f"Model error: {e}"
 
 
 
 
 
 
 
114
 
115
- with gr.Blocks() as demo:
 
116
  gr.Markdown("Image/Video to Clinical Description (custom prompt optional)")
117
 
118
  with gr.Row():
119
  with gr.Column(scale=1):
120
- # Minimal API key field
121
- alt_key = gr.Textbox(label="API Key (optional)", type="password", max_lines=1)
122
- # Preview on top
123
- preview = gr.Image(label="Preview", type="pil")
124
  url_input = gr.Textbox(label="Image/Video URL", placeholder="https://...")
125
  custom = gr.Textbox(label="Custom prompt (optional)", lines=4, placeholder="Enter custom prompt to override default")
 
126
  submit = gr.Button("Submit")
127
  with gr.Column(scale=1):
128
- # Streamed text area rendered as HTML/text block filling right column
129
  output_display = gr.Markdown("", elem_id="generated_output")
130
 
131
  def load_preview(url):
132
  if not url:
133
- return None
 
134
  try:
135
  r = requests.get(url, timeout=30)
136
  r.raise_for_status()
 
 
 
 
 
 
 
 
 
 
 
 
137
  img = Image.open(BytesIO(r.content)).convert("RGB")
138
- return img
139
  except Exception:
140
- return None
 
141
 
142
- def start_gen(url, custom_p, alt_k):
143
  if not url:
144
  return "No URL provided."
145
- # produce full combined text for Markdown via streaming
146
  text = ""
147
- for chunk in generate_stream(url, custom_p, alt_k):
148
  text += chunk
149
  yield text
150
- # final yield already returned in loop
151
 
152
- url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview])
153
- submit.click(fn=start_gen, inputs=[url_input, custom, alt_key], outputs=[output_display])
154
 
155
- demo.launch()
 
 
9
  import gradio as gr
10
  from mistralai import Mistral
11
 
12
+ # --- Configuration ---
13
  DEFAULT_KEY = os.getenv("MISTRAL_API_KEY")
14
+ DEFAULT_MODEL_IMAGE = "pixtral-12b-2409" # image-only model (default for images)
15
+ DEFAULT_MODEL_VIDEO = "voxtral-mini-latest" # audio/video-capable model (Voxtral)
16
+ # ---------------------
17
 
18
def get_client(alt_key: str = None):
    """Build a Mistral client, preferring a user-supplied key over the env default."""
    chosen = (alt_key or "").strip() or DEFAULT_KEY
    return Mistral(api_key=chosen)
21
 
22
def is_remote(s: str):
    """Return True when *s* is an http(s) URL; False for empty/None or local paths."""
    if not s:
        return False
    return s.startswith(("http://", "https://"))
24
 
25
def fetch_bytes(src: str):
    """Return the raw bytes of *src*, whether a remote URL or a local file path."""
    if not is_remote(src):
        with open(src, "rb") as fh:
            return fh.read()
    resp = requests.get(src, timeout=60)
    resp.raise_for_status()
    return resp.content
32
 
33
+ # ---------------- image conversion utilities (kept from your original) ----------------
34
  def try_ffmpeg_extract_frame(in_path: str, out_path: str):
35
  ffmpeg = shutil.which("ffmpeg")
36
  if not ffmpeg:
 
85
 
86
def to_b64_jpeg(img_bytes: bytes):
    """Encode raw JPEG bytes as an ASCII base64 string for a data URL."""
    return base64.b64encode(img_bytes).decode("ascii")
88
+ # --------------------------------------------------------------------------------------
89
 
90
+ # ---------------- audio/video helpers ----------------
91
def model_supports_audio(model_name: str) -> bool:
    """Heuristic: does the model name suggest audio/video capability (e.g. Voxtral)?"""
    if not model_name:
        return False
    lowered = model_name.lower()
    return any(tag in lowered for tag in ("voxtral", "audio", "video"))
96
+
97
def save_remote_to_temp(url: str, suffix: str = "") -> str:
    """Download *url* (or read a local path) into a fresh temp file.

    Returns the temp-file path; the caller is responsible for deleting it.
    """
    payload = fetch_bytes(url)
    # Fall back to the URL's own extension when no explicit suffix is given.
    chosen_suffix = suffix or os.path.splitext(url)[1] or ""
    fd, path = tempfile.mkstemp(suffix=chosen_suffix)
    os.close(fd)
    with open(path, "wb") as out:
        out.write(payload)
    return path
104
+
105
def ffmpeg_extract_audio(in_path: str, out_path: str):
    """Extract the audio track of *in_path* into *out_path* as mono 16 kHz WAV.

    Raises RuntimeError when ffmpeg is missing; subprocess errors
    (CalledProcessError / TimeoutExpired) propagate on conversion failure.
    """
    ffmpeg_bin = shutil.which("ffmpeg")
    if ffmpeg_bin is None:
        raise RuntimeError("ffmpeg not available in runtime")
    # Mono 16 kHz WAV is the most robust input format for transcription.
    command = [
        ffmpeg_bin, "-y", "-i", in_path,
        "-vn", "-ar", "16000", "-ac", "1", "-f", "wav", out_path,
    ]
    subprocess.run(command, check=True, stdout=subprocess.DEVNULL,
                   stderr=subprocess.DEVNULL, timeout=120)
    return out_path
113
+
114
def transcribe_audio_with_client(client, model, audio_bytes: bytes, language: str = None):
    """Transcribe *audio_bytes* via the Mistral audio transcription endpoint.

    Parameters:
        client: a Mistral client (see get_client()).
        model: name of a transcription-capable model.
        audio_bytes: raw audio payload (WAV recommended).
        language: optional language hint forwarded to the API.

    Returns the transcript text, or "" when the response carries none.
    Exceptions from the API call propagate to the caller unchanged.
    (The original wrapped the body in `try: ... except Exception as e: raise`,
    a no-op catch-and-reraise with an unused binding — removed.)
    """
    # The mistralai client accepts a file-like object for "file".
    bio = BytesIO(audio_bytes)
    resp = client.audio.transcriptions.complete(
        model=model,
        file={"content": bio, "file_name": "audio.wav"},
        language=language,  # language is optional
    )
    # Responses may be plain dicts or objects exposing a .text attribute.
    if isinstance(resp, dict):
        return resp.get("text", "")
    return getattr(resp, "text", "")
 
127
 
128
# ---------------- streaming & processing ----------------
_IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")


def _stream_chat(client, model, messages):
    """Yield cumulative text as chunks arrive from a streaming chat call."""
    partial = ""
    for chunk in client.chat.stream(model=model, messages=messages):
        data = getattr(chunk, "data", None)
        if data and data.choices[0].delta.content is not None:
            partial += data.choices[0].delta.content
            yield partial


def generate_stream_multimedia(media_src: str, custom_prompt: str, alt_key: str, model: str = DEFAULT_MODEL_VIDEO):
    """Stream a generated description for an image or video source.

    Yields progressively longer text (each yield is the full text so far),
    suitable for a streaming UI. Routing:
      1. image extension       -> convert to JPEG, send to the image model;
      2. audio model + remote  -> try a direct video-URL chat message;
      3. fallback              -> download, extract audio via ffmpeg,
                                  transcribe, describe the transcript.
    Errors are yielded as text rather than raised, so the UI always shows
    something.
    """
    client = get_client(alt_key)
    prompt_text = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else
                   "Provide a detailed, neutral, clinical-style description focusing on observable non-sexual features, hygiene, skin condition, posture, and general anatomy. Keep language professional.")

    # Route purely by file extension (works for URLs and local paths alike).
    # The original expression duplicated this check with confusing
    # `or`/`and` precedence; a single endswith covers both cases.
    is_image_ext = (media_src or "").lower().endswith(_IMAGE_EXTS)

    if is_image_ext:
        try:
            raw = fetch_bytes(media_src)
            jpg = convert_to_jpeg_bytes(raw, filename_hint=os.path.basename(media_src) or "input")
        except Exception as e:
            yield f"Error processing image: {e}"
            return
        b64 = to_b64_jpeg(jpg)
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64}"}
            ],
            "stream": False
        }]
        try:
            yield from _stream_chat(client, DEFAULT_MODEL_IMAGE, messages)
        except Exception as e:
            yield f"Model error (image): {e}"
        return

    # If the model is audio/video-capable and the input is a remote URL,
    # try sending the video URL directly in the chat message.
    if model_supports_audio(model) and is_remote(media_src):
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_text},
                {"type": "video", "url": media_src}
            ],
            "stream": False
        }]
        try:
            yield from _stream_chat(client, model, messages)
            return
        except Exception:
            # Direct video URL unsupported -- fall through to audio
            # extraction/transcription below.
            pass

    # Fallback: download media, extract audio, transcribe, then send the
    # transcript + prompt to a chat model for the streamed description.
    tmp_media = None
    tmp_audio = None
    try:
        tmp_media = save_remote_to_temp(media_src, suffix=".mp4")
        # mkstemp instead of the deprecated, race-prone tempfile.mktemp:
        # the file is created atomically; ffmpeg -y overwrites it.
        audio_fd, tmp_audio = tempfile.mkstemp(suffix=".wav")
        os.close(audio_fd)
        ffmpeg_extract_audio(tmp_media, tmp_audio)
        with open(tmp_audio, "rb") as f:
            audio_bytes = f.read()
        try:
            transcript = transcribe_audio_with_client(client, model, audio_bytes)
        except Exception as e:
            yield f"Transcription error: {e}"
            return
        # Text-only follow-up: keep the audio-capable model when selected,
        # otherwise fall back to the default image/chat model.
        chat_model = model if model_supports_audio(model) else DEFAULT_MODEL_IMAGE
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": f"{prompt_text}\n\nTranscript:\n{transcript}"}
            ],
            "stream": False
        }]
        yield from _stream_chat(client, chat_model, messages)
    except Exception as e:
        yield f"Error processing media/audio fallback: {e}"
    finally:
        # Best-effort cleanup of temporary artifacts.
        for p in (tmp_media, tmp_audio):
            try:
                if p and os.path.exists(p):
                    os.remove(p)
            except Exception:
                pass
228
 
229
# ---------------- Gradio UI ----------------
with gr.Blocks(title="Image/Video to Clinical Description") as demo:
    gr.Markdown("Image/Video to Clinical Description (custom prompt optional)")

    with gr.Row():
        with gr.Column(scale=1):
            alt_key = gr.Textbox(label="Mistral API Key (optional)", type="password", max_lines=1)
            preview_img = gr.Image(label="Preview image (first frame)", type="pil")
            preview_video = gr.HTML("<div style='color:gray'>Video preview will appear here when a video URL is provided.</div>")
            url_input = gr.Textbox(label="Image/Video URL", placeholder="https://...")
            custom = gr.Textbox(label="Custom prompt (optional)", lines=4, placeholder="Enter custom prompt to override default")
            model_select = gr.Dropdown(label="Model", choices=[DEFAULT_MODEL_IMAGE, DEFAULT_MODEL_VIDEO], value=DEFAULT_MODEL_VIDEO)
            submit = gr.Button("Submit")
        with gr.Column(scale=1):
            output_display = gr.Markdown("", elem_id="generated_output")

    def load_preview(url):
        """Return (PIL image or None, HTML string) previews for *url*."""
        if not url:
            return None, "<div style='color:gray'>No URL provided.</div>"
        try:
            r = requests.get(url, timeout=30)
            r.raise_for_status()
            content_type = r.headers.get("content-type", "")
            # Video: build an HTML5 <video> preview instead of an image.
            if content_type.startswith("video/") or any(url.lower().endswith(ext) for ext in (".mp4", ".mov", ".webm", ".mkv")):
                # Escape the URL before embedding it in markup (it is
                # untrusted user input going into an HTML attribute).
                import html as _html
                safe_url = _html.escape(url, quote=True)
                video_html = f"""
                <video controls style="max-width:100%;height:auto;">
                    <source src="{safe_url}" type="{content_type or 'video/mp4'}">
                    Your browser does not support the video tag.
                </video>
                """
                return None, video_html
            # Otherwise treat the payload as an image.
            img = Image.open(BytesIO(r.content)).convert("RGB")
            return img, "<div style='color:gray'>Image preview shown. If this is a video, server didn't report video content-type.</div>"
        except Exception:
            return None, "<div style='color:red'>Preview failed to load.</div>"

    def start_gen(url, custom_p, alt_k, model_name):
        """Stream the generated description into the Markdown output."""
        if not url:
            # BUG FIX: this function is a generator, so a bare
            # `return "No URL provided."` would never reach the UI --
            # the message must be yielded before stopping.
            yield "No URL provided."
            return
        # BUG FIX: generate_stream_multimedia already yields cumulative
        # text, so the old `text += chunk` duplicated earlier output
        # (a, a+ab, ...). Forward each cumulative snapshot directly.
        for snapshot in generate_stream_multimedia(url, custom_p, alt_k, model=model_name):
            yield snapshot

    url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_img, preview_video])
    submit.click(fn=start_gen, inputs=[url_input, custom, alt_key, model_select], outputs=[output_display])

if __name__ == "__main__":
    demo.launch()