Update app.py
app.py
CHANGED
@@ -1,28 +1,27 @@
 import os
 import shutil
-import base64
 import tempfile
 import subprocess
 from io import BytesIO
 from PIL import Image
 import requests
 import gradio as gr
 from mistralai import Mistral

-#
-DEFAULT_KEY = os.getenv("MISTRAL_API_KEY")
-
-
-# ---------------------

-def get_client(
-
-    return Mistral(api_key=

-def is_remote(
-    return bool(

-def fetch_bytes(src: str):
     if is_remote(src):
         r = requests.get(src, timeout=60)
         r.raise_for_status()
@@ -30,70 +29,29 @@ def fetch_bytes(src: str):
     with open(src, "rb") as f:
         return f.read()

-#
-def try_ffmpeg_extract_frame(in_path: str, out_path: str):
-    ffmpeg = shutil.which("ffmpeg")
-    if not ffmpeg:
-        return False
-    cmd = [ffmpeg, "-y", "-i", in_path, "-vf", "scale=-2:512", "-frames:v", "1", out_path]
-    try:
-        subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=30)
-        return os.path.exists(out_path)
-    except Exception:
-        return False
-
-def ezgif_convert(media_bytes: bytes, filename: str = "input"):
-    files = {"new-image": (filename, media_bytes)}
-    r = requests.post("https://s.ezgif.com/upload", files=files, timeout=60)
-    r.raise_for_status()
-    import re
-    m = re.search(r'name="file" value="([^"]+)"', r.text)
-    if not m:
-        raise RuntimeError("ezgif upload failed")
-    key = m.group(1)
-    conv = requests.post("https://s.ezgif.com/gif-to-jpg", data={"file": key}, timeout=60)
-    conv.raise_for_status()
-    m2 = re.search(r'<img src="(https?://s.ezgif.com/tmp/[^"]+)"', conv.text) or re.search(r'<a href="(https?://s.ezgif.com/tmp/[^"]+)"', conv.text)
-    if not m2:
-        raise RuntimeError("ezgif conversion failed")
-    jpg_url = m2.group(1)
-    r2 = requests.get(jpg_url, timeout=60)
-    r2.raise_for_status()
-    return r2.content
-
 def convert_to_jpeg_bytes(media_bytes: bytes, filename_hint: str = "input"):
-
-
-
-
-
-
-
-
-
-
-
-    with tempfile.TemporaryDirectory() as td:
-        in_path = os.path.join(td, filename_hint)
-        with open(in_path, "wb") as f:
-            f.write(media_bytes)
-        out_path = os.path.join(td, "frame.jpg")
-        if try_ffmpeg_extract_frame(in_path, out_path) and os.path.exists(out_path):
-            with open(out_path, "rb") as f:
-                return f.read()
-        return ezgif_convert(media_bytes, filename_hint)
-
-def to_b64_jpeg(img_bytes: bytes):
     return base64.b64encode(img_bytes).decode("utf-8")
-# --------------------------------------------------------------------------------------

-#
 def model_supports_audio(model_name: str) -> bool:
     if not model_name:
         return False
     mn = model_name.lower()
     return "voxtral" in mn or "audio" in mn or "video" in mn

 def save_remote_to_temp(url: str, suffix: str = "") -> str:
     b = fetch_bytes(url)
     fd, path = tempfile.mkstemp(suffix=suffix or os.path.splitext(url)[1] or "")
@@ -102,122 +60,138 @@ def save_remote_to_temp(url: str, suffix: str = "") -> str:
         f.write(b)
     return path

 def ffmpeg_extract_audio(in_path: str, out_path: str):
     ffmpeg = shutil.which("ffmpeg")
     if not ffmpeg:
-        raise RuntimeError("ffmpeg not
-    # mono 16k WAV for transcription robustness
     cmd = [ffmpeg, "-y", "-i", in_path, "-vn", "-ar", "16000", "-ac", "1", "-f", "wav", out_path]
     subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=120)
     return out_path

-
-
-
-
-
-
-
-
-
-
-
-
-        raise
-
-# ---------------- streaming & processing ----------------
-def generate_stream_multimedia(media_src: str, custom_prompt: str, alt_key: str, model: str = DEFAULT_MODEL_VIDEO):
-    client = get_client(alt_key)
-    prompt_text = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else
         "Provide a detailed, neutral, clinical-style description focusing on observable non-sexual features, hygiene, skin condition, posture, and general anatomy. Keep language professional.")
-    #
-
-
-
-
         try:
-            raw = fetch_bytes(
-            jpg = convert_to_jpeg_bytes(raw, filename_hint=os.path.basename(
         except Exception as e:
             yield f"Error processing image: {e}"
             return
-        b64 =
-        # choose image-capable model (keep previous model)
-        image_model = DEFAULT_MODEL_IMAGE
         messages = [{
             "role": "user",
             "content": [
-                {"type": "text", "text":
                 {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64}"}
             ],
             "stream": False
         }]
         try:
             partial = ""
-            for chunk in client.chat.stream(model=
-
-
                     yield partial
             return
         except Exception as e:
             yield f"Model error (image): {e}"
             return

-    # If model supports audio/video and
-    if model_supports_audio(
-        # Try
-
-        "
-        "
-
-
-
-
-
-
-
-
-
-
-
-
-
-

-    # Fallback: download
     tmp_media = None
     tmp_audio = None
     try:
-        tmp_media = save_remote_to_temp(
         tmp_audio = tempfile.mktemp(suffix=".wav")
         ffmpeg_extract_audio(tmp_media, tmp_audio)
         with open(tmp_audio, "rb") as f:
             audio_bytes = f.read()
-        # Use transcription endpoint
         try:
-            transcript =
         except Exception as e:
             yield f"Transcription error: {e}"
             return
-        # Send transcript + prompt to
-
-        chat_model = model if model_supports_audio(model) else DEFAULT_MODEL_IMAGE
         messages = [{
             "role": "user",
             "content": [
-                {"type": "text", "text": f"{
             ],
             "stream": False
         }]
         partial = ""
         for chunk in client.chat.stream(model=chat_model, messages=messages):
-
-
                 yield partial
         return
     except Exception as e:
-        yield f"Error processing
     finally:
         for p in (tmp_media, tmp_audio):
             try:
@@ -226,57 +200,51 @@ def generate_stream_multimedia(media_src: str, custom_prompt: str, alt_key: str,
             except Exception:
                 pass

-# ---
 with gr.Blocks(title="Image/Video to Clinical Description") as demo:
-    gr.Markdown("Image/Video to Clinical Description

     with gr.Row():
         with gr.Column(scale=1):
-
-            preview_img = gr.Image(label="
-            preview_video = gr.HTML("<div style='color:gray'>Video preview will appear here
-            url_input = gr.Textbox(label="Image
-
-            model_select = gr.Dropdown(label="Model", choices=[
             submit = gr.Button("Submit")
         with gr.Column(scale=1):
             output_display = gr.Markdown("", elem_id="generated_output")

     def load_preview(url):
         if not url:
             return None, "<div style='color:gray'>No URL provided.</div>"
-        # Try to preview as image first (works for image URLs)
         try:
-            r = requests.get(url, timeout=30)
             r.raise_for_status()
-
-
-            if
-
-                video_html = f"""
-                <video controls style="max-width:100%;height:auto;">
-                    <source src="{url}" type="{content_type or 'video/mp4'}">
-                    Your browser does not support the video tag.
-                </video>
-                """
                 return None, video_html
             # otherwise treat as image
-
-
         except Exception:
-            # If remote fetch fails for preview, show nothing
             return None, "<div style='color:red'>Preview failed to load.</div>"

-    def
         if not url:
             return "No URL provided."
         text = ""
-        for chunk in generate_stream_multimedia(url,
-            text
             yield text

     url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_img, preview_video])
-    submit.click(fn=

 if __name__ == "__main__":
     demo.launch()
import os
import shutil
import tempfile
import subprocess
from io import BytesIO
from PIL import Image
import base64
import requests
import gradio as gr
from mistralai import Mistral

# Configuration
DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
DEFAULT_IMAGE_MODEL = "pixtral-12b-2409"
DEFAULT_VIDEO_MODEL = "voxtral-mini-latest"

def get_client(key: str = None):
    api_key = (key or "").strip() or DEFAULT_KEY
    return Mistral(api_key=api_key)

def is_remote(src: str) -> bool:
    return bool(src) and (src.startswith("http://") or src.startswith("https://"))

def fetch_bytes(src: str) -> bytes:
    if is_remote(src):
        r = requests.get(src, timeout=60)
        r.raise_for_status()
        return r.content
    with open(src, "rb") as f:
        return f.read()

# Image utilities (kept minimal)
def convert_to_jpeg_bytes(media_bytes: bytes, filename_hint: str = "input"):
    img = Image.open(BytesIO(media_bytes))
    if img.mode != "RGB":
        img = img.convert("RGB")
    base_h = 512
    w = int(img.width * (base_h / img.height))
    img = img.resize((w, base_h), Image.LANCZOS)
    buf = BytesIO()
    img.save(buf, format="JPEG", quality=90)
    return buf.getvalue()

def b64_jpeg(img_bytes: bytes) -> str:
    return base64.b64encode(img_bytes).decode("utf-8")

# Model capability detection
def model_supports_audio(model_name: str) -> bool:
    if not model_name:
        return False
    mn = model_name.lower()
    return "voxtral" in mn or "audio" in mn or "video" in mn

# Temp file helpers
def save_remote_to_temp(url: str, suffix: str = "") -> str:
    b = fetch_bytes(url)
    fd, path = tempfile.mkstemp(suffix=suffix or os.path.splitext(url)[1] or "")
    with os.fdopen(fd, "wb") as f:
        f.write(b)
    return path

# ffmpeg audio extraction
def ffmpeg_extract_audio(in_path: str, out_path: str):
    ffmpeg = shutil.which("ffmpeg")
    if not ffmpeg:
        raise RuntimeError("ffmpeg not found in runtime")
    cmd = [ffmpeg, "-y", "-i", in_path, "-vn", "-ar", "16000", "-ac", "1", "-f", "wav", out_path]
    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL, timeout=120)
    return out_path

# Transcription via Mistral audio.transcriptions.complete
def transcribe_audio(client: Mistral, model: str, audio_bytes: bytes, language: str = None) -> str:
    bio = BytesIO(audio_bytes)
    resp = client.audio.transcriptions.complete(model=model, file={"content": bio, "file_name": "audio.wav"}, language=language)
    if isinstance(resp, dict):
        return resp.get("text", "")
    return getattr(resp, "text", "")

# Core processing + streaming
def generate_stream_multimedia(src: str, custom_prompt: str, api_key: str, model_name: str):
    client = get_client(api_key)
    prompt_base = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else
        "Provide a detailed, neutral, clinical-style description focusing on observable non-sexual features, hygiene, skin condition, posture, and general anatomy. Keep language professional.")
    # Label / heading used once at start
    heading = f"### {custom_prompt.strip()}" if custom_prompt and custom_prompt.strip() else "### Clinical-style Description"
    # If input looks like an image (ext or local file), use image flow
    lower = (src or "").lower()
    image_exts = (".jpg", ".jpeg", ".png", ".webp", ".gif")
    is_image = lower.endswith(image_exts) or (not is_remote(src) and os.path.isfile(src) and src.lower().endswith(image_exts))
    if is_image:
        try:
            raw = fetch_bytes(src)
            jpg = convert_to_jpeg_bytes(raw, filename_hint=os.path.basename(src) or "input")
        except Exception as e:
            yield f"Error processing image: {e}"
            return
        b64 = b64_jpeg(jpg)
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": prompt_base},
                {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64}"}
            ],
            "stream": False
        }]
        # stream from an image-capable model
        try:
            yielded_heading = False
            partial = ""
            for chunk in client.chat.stream(model=DEFAULT_IMAGE_MODEL, messages=messages):
                delta = getattr(chunk, "data", None) and chunk.data.choices[0].delta.content
                if delta is not None:
                    if not yielded_heading:
                        partial += heading + "\n\n"
                        yielded_heading = True
                    partial += delta
                    yield partial
            return
        except Exception as e:
            yield f"Model error (image): {e}"
            return

    # If model supports audio/video and src is remote, try direct video URL variants
    if model_supports_audio(model_name) and is_remote(src):
        # Try a few common video/audio URL block shapes supported by Mistral clients
        variants = [
            {"type": "video", "url": src},
            {"type": "video_url", "video_url": src},
            {"type": "input_audio", "input_audio_url": src},  # less common; try anyway
            {"type": "audio", "url": src}
        ]
        for v in variants:
            messages = [{
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt_base},
                    v
                ],
                "stream": False
            }]
            try:
                yielded_heading = False
                partial = ""
                for chunk in client.chat.stream(model=model_name, messages=messages):
                    delta = getattr(chunk, "data", None) and chunk.data.choices[0].delta.content
                    if delta is not None:
                        if not yielded_heading:
                            partial += heading + "\n\n"
                            yielded_heading = True
                        partial += delta
                        yield partial
                return
            except Exception:
                # try next variant
                pass

    # Fallback: download, extract audio, transcribe, then send transcript + prompt to chat model
    tmp_media = None
    tmp_audio = None
    try:
        tmp_media = save_remote_to_temp(src, suffix=".mp4")
        tmp_audio = tempfile.mktemp(suffix=".wav")
        ffmpeg_extract_audio(tmp_media, tmp_audio)
        with open(tmp_audio, "rb") as f:
            audio_bytes = f.read()
        # Use transcription endpoint (voxtral-mini-latest recommended)
        try:
            transcript = transcribe_audio(client, model_name, audio_bytes)
        except Exception as e:
            yield f"Transcription error: {e}"
            return
        # Send transcript + prompt to chat model and stream response
        chat_model = model_name if model_supports_audio(model_name) else DEFAULT_IMAGE_MODEL
        messages = [{
            "role": "user",
            "content": [
                {"type": "text", "text": f"{prompt_base}\n\nTranscript:\n{transcript}"}
            ],
            "stream": False
        }]
        yielded_heading = False
        partial = ""
        for chunk in client.chat.stream(model=chat_model, messages=messages):
            delta = getattr(chunk, "data", None) and chunk.data.choices[0].delta.content
            if delta is not None:
                if not yielded_heading:
                    partial += heading + "\n\n"
                    yielded_heading = True
                partial += delta
                yield partial
        return
    except Exception as e:
        yield f"Error processing fallback: {e}"
    finally:
        for p in (tmp_media, tmp_audio):
            try:
                # best-effort cleanup of temporary files
                if p and os.path.exists(p):
                    os.remove(p)
            except Exception:
                pass

# --- Gradio UI ---
with gr.Blocks(title="Image/Video to Clinical Description") as demo:
    gr.Markdown("Image/Video to Clinical Description — provides a clinical, non-sexual, neutral description of images or video (audio optional).")

    with gr.Row():
        with gr.Column(scale=1):
            api_key = gr.Textbox(label="Mistral API Key (optional)", type="password", max_lines=1)
            preview_img = gr.Image(label="Image preview (if image)", type="pil")
            preview_video = gr.HTML("<div style='color:gray'>Video preview will appear here for video URLs.</div>")
            url_input = gr.Textbox(label="Image or Video URL", placeholder="https://...")
            custom_prompt = gr.Textbox(label="Custom heading (optional)", lines=2, placeholder="Custom heading to appear above the description")
            model_select = gr.Dropdown(label="Model", choices=[DEFAULT_IMAGE_MODEL, DEFAULT_VIDEO_MODEL], value=DEFAULT_VIDEO_MODEL)
            submit = gr.Button("Submit")
        with gr.Column(scale=1):
            output_display = gr.Markdown("", elem_id="generated_output")

    # Preview loader: choose image preview if image, otherwise HTML5 video tag for video
    def load_preview(url):
        if not url:
            return None, "<div style='color:gray'>No URL provided.</div>"
        try:
            r = requests.get(url, timeout=30, stream=True)
            r.raise_for_status()
            ctype = r.headers.get("content-type", "")
            # treat explicit video content-type or known extensions as video
            if ctype.startswith("video/") or any(url.lower().endswith(ext) for ext in (".mp4", ".mov", ".webm", ".mkv")):
                video_html = f'<video controls style="max-width:100%;height:auto;"><source src="{url}" type="{ctype or "video/mp4"}">Your browser does not support the video tag.</video>'
                return None, video_html
            # otherwise treat as image
            data = r.content
            img = Image.open(BytesIO(data)).convert("RGB")
            return img, "<div style='color:gray'>Image preview shown.</div>"
        except Exception:
            return None, "<div style='color:red'>Preview failed to load.</div>"

    def run_generation(url, custom_h, key, model_name):
        if not url:
            yield "No URL provided."
            return
        text = ""
        for chunk in generate_stream_multimedia(url, custom_h, key, model_name):
            text = chunk  # chunk already accumulates heading + partial text
            yield text

    url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_img, preview_video])
    submit.click(fn=run_generation, inputs=[url_input, custom_prompt, api_key, model_select], outputs=[output_display])

if __name__ == "__main__":
    demo.launch()
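
A quick local sanity check of this revision can exercise the pure helpers without a Mistral API key or network access. The sketch below is not part of the Space: it assumes app.py is importable from the working directory and uses a placeholder image path, sample.jpg, standing in for any local image. Importing app only builds the Gradio Blocks; nothing is served, since demo.launch() is guarded by the __main__ check.

# smoke_test.py -- minimal sketch, not part of the Space; exercises the non-API helpers.
from app import convert_to_jpeg_bytes, b64_jpeg, model_supports_audio, is_remote

# "sample.jpg" is a placeholder path for any local image.
with open("sample.jpg", "rb") as f:
    raw = f.read()

jpg = convert_to_jpeg_bytes(raw)   # re-encoded as a 512 px high JPEG
b64 = b64_jpeg(jpg)                # base64 payload embedded in the data: URL sent to the chat API
print(f"jpeg bytes: {len(jpg)}, base64 chars: {len(b64)}")

# Routing logic used by generate_stream_multimedia
assert model_supports_audio("voxtral-mini-latest")
assert not model_supports_audio("pixtral-12b-2409")
assert is_remote("https://example.com/clip.mp4")
assert not is_remote("clip.mp4")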