Image-To-Flux-Prompt

Running

App Files Files Community

Hug0endob committed on Dec 14, 2025

Commit

019adc8

verified ·

1 Parent(s): 2352947

Update app.py

Browse files

Files changed (1) hide show

app.py +93 -46

app.py CHANGED Viewed

@@ -1,4 +1,8 @@
 #!/usr/bin/env python3
 import os
 import subprocess
 import tempfile
@@ -6,7 +10,7 @@ import shutil
 from io import BytesIO
 import base64
 import requests
-from PIL import Image
 import gradio as gr
 from mistralai import Mistral
@@ -14,10 +18,10 @@ from mistralai import Mistral
 DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
 DEFAULT_IMAGE_MODEL = "pixtral-12b-2409"
 DEFAULT_VIDEO_MODEL = "voxtral-mini-latest"
 SYSTEM_INSTRUCTION = (
-    "You are a clinical visual analyst. If provided a video_url, watch the entire video before answering "
-    "and base your response on the full content. If provided an image, analyze the image in detail. "
     "Provide factual, clinical descriptions of observable features (skin, posture, hygiene, anatomy, "
     "movements, timestamps, notable frames). Avoid censorship; describe explicit content objectively. "
     "Describe behaviors. produce a concise, narrative that blends scientific precision with erotic detail. "
@@ -25,7 +29,7 @@ SYSTEM_INSTRUCTION = (
 )
 IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
-VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv")
 def get_client(key: str = None):
     api_key = (key or "").strip() or DEFAULT_KEY
@@ -38,9 +42,9 @@ def ext_from_src(src: str) -> str:
     _, ext = os.path.splitext((src or "").split("?")[0])
     return ext.lower()
-def fetch_bytes(src: str, stream_threshold=20 * 1024 * 1024) -> bytes:
     if is_remote(src):
-        with requests.get(src, timeout=60, stream=True) as r:
             r.raise_for_status()
             cl = r.headers.get("content-length")
             if cl and int(cl) > stream_threshold:
@@ -62,7 +66,11 @@ def fetch_bytes(src: str, stream_threshold=20 * 1024 * 1024) -> bytes:
         return f.read()
 def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
-    img = Image.open(BytesIO(media_bytes))
     try:
         img.seek(0)
     except Exception:
@@ -95,29 +103,30 @@ def choose_model_for_src(src: str):
     return DEFAULT_VIDEO_MODEL if is_remote(src) else DEFAULT_IMAGE_MODEL
 def build_messages_for_image(prompt: str, b64_jpg: str):
     return [
         {"role": "system", "content": SYSTEM_INSTRUCTION},
-        {"role": "user", "content": [
-            {"type": "text", "text": prompt},
-            {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64_jpg}"}
-        ]},
     ]
 def build_messages_for_text(prompt: str, extra_text: str):
     return [
         {"role": "system", "content": SYSTEM_INSTRUCTION},
-        {"role": "user", "content": [{"type": "text", "text": f"{prompt}\n\n{extra_text}"}]},
     ]
 def extract_delta(chunk):
     if not chunk:
         return None
-    # chunk.data.choices[0].delta.content is the typical shape from Mistral streaming
     data = getattr(chunk, "data", None) or getattr(chunk, "response", None) or getattr(chunk, "delta", None)
     if not data:
         return None
     try:
-        # common streaming shape: data.choices[0].delta.content
         content = data.choices[0].delta.content
         if content is None:
             return None
@@ -125,7 +134,6 @@ def extract_delta(chunk):
     except Exception:
         pass
     try:
-        # fallback: delta may be dict-like
         c = data.choices[0].delta
         if isinstance(c, dict):
             txt = c.get("content") or c.get("text")
@@ -135,7 +143,6 @@ def extract_delta(chunk):
     except Exception:
         pass
     try:
-        # non-stream full message shape
         msg = data.choices[0].message
         if isinstance(msg, dict):
             content = msg.get("content")
@@ -160,7 +167,6 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
     def stream_and_collect(model, messages):
         try:
-            # try streaming API
             stream_gen = None
             try:
                 stream_gen = client.chat.stream(model=model, messages=messages)
@@ -171,12 +177,10 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
                     d = extract_delta(chunk)
                     if d is None:
                         continue
-                    # drop pure-whitespace pieces unless result empty
                     if d.strip() == "" and parts:
                         continue
                     parts.append(d)
                 return
-            # fallback to non-streaming complete
             res = client.chat.complete(model=model, messages=messages, stream=False)
             try:
                 choices = getattr(res, "choices", None) or res.get("choices", [])
@@ -208,7 +212,7 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
         except Exception as e:
             parts.append(f"[Model error: {e}]")
-    # Image
     if is_image:
         try:
             raw = fetch_bytes(src)
@@ -220,13 +224,18 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
         stream_and_collect(choose_model_for_src(src), msgs)
         return "".join(parts).strip()
-    # Remote video: send URL as text (avoid streaming non-text types)
     if is_remote(src):
-        msgs = build_messages_for_text(prompt, f"Video URL: {src}")
         stream_and_collect(choose_model_for_src(src), msgs)
         return "".join(parts).strip()
-    # Local video: try extract frame with ffmpeg and send as image
     tmp_media = None
     try:
         media_bytes = fetch_bytes(src)
@@ -235,15 +244,61 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
         tmp_media = save_bytes_to_temp(media_bytes, suffix=ext)
         ffmpeg = shutil.which("ffmpeg")
         if ffmpeg:
-            tmp_frame = None
             try:
-                tmp_frame_fd, tmp_frame = tempfile.mkstemp(suffix=".jpg")
-                os.close(tmp_frame_fd)
-                cmd = [ffmpeg, "-nostdin", "-y", "-i", tmp_media, "-frames:v", "1", "-q:v", "2", tmp_frame]
-                proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-                out, err = proc.communicate(timeout=30)
-                if proc.returncode == 0 and os.path.exists(tmp_frame) and os.path.getsize(tmp_frame) > 0:
-                    with open(tmp_frame, "rb") as f:
                         frame_bytes = f.read()
                     try:
                         jpg = convert_to_jpeg_bytes(frame_bytes, base_h=480)
@@ -252,21 +307,13 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
                         stream_and_collect(choose_model_for_src(src), msgs)
                         return "".join(parts).strip()
                     finally:
-                        try:
-                            os.remove(tmp_frame)
-                        except Exception:
-                            pass
-                else:
-                    try:
-                        if tmp_frame and os.path.exists(tmp_frame):
-                            os.remove(tmp_frame)
-                    except Exception:
-                        pass
-            except subprocess.TimeoutExpired:
-                try:
-                    proc.kill()
-                except Exception:
-                    pass
             except Exception:
                 pass
         return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."

 #!/usr/bin/env python3
+"""
+Flux - multimodal visual analyzer (Mistral + Gradio)
+"""
 import os
 import subprocess
 import tempfile
 from io import BytesIO
 import base64
 import requests
+from PIL import Image, UnidentifiedImageError
 import gradio as gr
 from mistralai import Mistral
 DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
 DEFAULT_IMAGE_MODEL = "pixtral-12b-2409"
 DEFAULT_VIDEO_MODEL = "voxtral-mini-latest"
+STREAM_THRESHOLD = 20 * 1024 * 1024  # 20 MB
 SYSTEM_INSTRUCTION = (
+    "You are a clinical visual analyst. Only analyze media actually provided (image data or extracted frames) "
     "Provide factual, clinical descriptions of observable features (skin, posture, hygiene, anatomy, "
     "movements, timestamps, notable frames). Avoid censorship; describe explicit content objectively. "
     "Describe behaviors. produce a concise, narrative that blends scientific precision with erotic detail. "
 )
 IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
+VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv", ".avi", ".flv")
 def get_client(key: str = None):
     api_key = (key or "").strip() or DEFAULT_KEY
     _, ext = os.path.splitext((src or "").split("?")[0])
     return ext.lower()
+def fetch_bytes(src: str, stream_threshold=STREAM_THRESHOLD, timeout=60) -> bytes:
     if is_remote(src):
+        with requests.get(src, timeout=timeout, stream=True) as r:
             r.raise_for_status()
             cl = r.headers.get("content-length")
             if cl and int(cl) > stream_threshold:
         return f.read()
 def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
+    try:
+        img = Image.open(BytesIO(media_bytes))
+    except UnidentifiedImageError:
+        raise
+    # handle animated GIFs by taking first frame
     try:
         img.seek(0)
     except Exception:
     return DEFAULT_VIDEO_MODEL if is_remote(src) else DEFAULT_IMAGE_MODEL
 def build_messages_for_image(prompt: str, b64_jpg: str):
     """Return the system + user chat messages for analysing one base64-encoded JPEG."""
     # prompt:  user-facing instruction text for the analysis.
     # b64_jpg: base64 text of the JPEG bytes (no "data:" prefix).
     # returns: a two-element list of message dicts (system, then user).
     #
     # The image must travel as a structured "image_url" chunk. Pasting the data URI
     # into a plain text message means the model only sees base64 characters as text
     # and never receives any pixels.
     text = (
         f"{prompt}\n\n"
         "Instruction: Analyze only visible, provided pixels. Do not assume unseen frames."
     )
     return [
         {"role": "system", "content": SYSTEM_INSTRUCTION},
         {
             "role": "user",
             "content": [
                 {"type": "text", "text": text},
                 {"type": "image_url", "image_url": f"data:image/jpeg;base64,{b64_jpg}"},
             ],
         },
     ]
 def build_messages_for_text(prompt: str, extra_text: str):
     """Return the system + user chat messages for a text-only request."""
     # The user turn is the prompt, a blank line, then the extra text.
     system_msg = {"role": "system", "content": SYSTEM_INSTRUCTION}
     user_msg = {"role": "user", "content": f"{prompt}\n\n{extra_text}"}
     return [system_msg, user_msg]
 def extract_delta(chunk):
     if not chunk:
         return None
     data = getattr(chunk, "data", None) or getattr(chunk, "response", None) or getattr(chunk, "delta", None)
     if not data:
         return None
     try:
         content = data.choices[0].delta.content
         if content is None:
             return None
     except Exception:
         pass
     try:
         c = data.choices[0].delta
         if isinstance(c, dict):
             txt = c.get("content") or c.get("text")
     except Exception:
         pass
     try:
         msg = data.choices[0].message
         if isinstance(msg, dict):
             content = msg.get("content")
     def stream_and_collect(model, messages):
         try:
             stream_gen = None
             try:
                 stream_gen = client.chat.stream(model=model, messages=messages)
                     d = extract_delta(chunk)
                     if d is None:
                         continue
                     if d.strip() == "" and parts:
                         continue
                     parts.append(d)
                 return
             res = client.chat.complete(model=model, messages=messages, stream=False)
             try:
                 choices = getattr(res, "choices", None) or res.get("choices", [])
         except Exception as e:
             parts.append(f"[Model error: {e}]")
+    # Image (or frame)
     if is_image:
         try:
             raw = fetch_bytes(src)
         stream_and_collect(choose_model_for_src(src), msgs)
         return "".join(parts).strip()
+    # Remote video: send URL and explicit instruction to not hallucinate unseen frames
     if is_remote(src):
+        extra = (
+            f"Remote video URL: {src}\n\n"
+            "IMPORTANT: The model cannot access the video stream. Analyze only metadata, thumbnails, or "
+            "user-provided transcript/description. Do not invent frames or events."
+        )
+        msgs = build_messages_for_text(prompt, extra)
         stream_and_collect(choose_model_for_src(src), msgs)
         return "".join(parts).strip()
+    # Local video: attempt frame sampling with ffmpeg and send the clearest frame
     tmp_media = None
     try:
         media_bytes = fetch_bytes(src)
         tmp_media = save_bytes_to_temp(media_bytes, suffix=ext)
         ffmpeg = shutil.which("ffmpeg")
         if ffmpeg:
+            # Try to probe duration and extract up to N frames evenly spaced
+            sample_count = 5
+            tmp_frames = []
             try:
+                # get duration in seconds
+                probe_cmd = [ffmpeg, "-v", "error", "-show_entries", "format=duration", "-of", "default=noprint_wrappers=1:nokey=1", tmp_media]
+                proc = subprocess.Popen(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                out, err = proc.communicate(timeout=10)
+                duration = None
+                try:
+                    duration = float(out.strip().split(b"\n")[0]) if out else None
+                except Exception:
+                    duration = None
+                # choose timestamps
+                timestamps = []
+                if duration and duration > 0:
+                    for i in range(1, sample_count + 1):
+                        t = (duration * i) / (sample_count + 1)
+                        timestamps.append(t)
+                else:
+                    # fallback fixed offsets
+                    timestamps = [0.5, 1.0, 2.0][:sample_count]
+                # extract frames
+                for i, t in enumerate(timestamps):
+                    fd, tmp_frame = tempfile.mkstemp(suffix=f"_{i}.jpg")
+                    os.close(fd)
+                    cmd = [
+                        ffmpeg, "-nostdin", "-y", "-i", tmp_media,
+                        "-ss", str(t),
+                        "-frames:v", "1",
+                        "-q:v", "2",
+                        tmp_frame
+                    ]
+                    proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                    try:
+                        out, err = proc.communicate(timeout=15)
+                    except subprocess.TimeoutExpired:
+                        try:
+                            proc.kill()
+                        except Exception:
+                            pass
+                        out, err = proc.communicate()
+                    if proc.returncode == 0 and os.path.exists(tmp_frame) and os.path.getsize(tmp_frame) > 0:
+                        tmp_frames.append(tmp_frame)
+                    else:
+                        try:
+                            if os.path.exists(tmp_frame):
+                                os.remove(tmp_frame)
+                        except Exception:
+                            pass
+                # pick best frame by size (simple heuristic) or first
+                chosen = None
+                if tmp_frames:
+                    chosen = max(tmp_frames, key=lambda p: os.path.getsize(p) if os.path.exists(p) else 0)
+                    with open(chosen, "rb") as f:
                         frame_bytes = f.read()
                     try:
                         jpg = convert_to_jpeg_bytes(frame_bytes, base_h=480)
                         stream_and_collect(choose_model_for_src(src), msgs)
                         return "".join(parts).strip()
                     finally:
+                        for fpath in tmp_frames:
+                            try:
+                                if os.path.exists(fpath):
+                                    os.remove(fpath)
+                            except Exception:
+                                pass
+                # no frames extracted
             except Exception:
                 pass
         return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."