Image-To-Flux-Prompt

Running

App Files Community

Hug0endob committed on Dec 14, 2025

Commit

f9926bd

verified ·

1 Parent(s): 0fef3e3

Update app.py

Browse files

Files changed (1) hide show

app.py +137 -103

app.py CHANGED Viewed

@@ -18,11 +18,10 @@ DEFAULT_VIDEO_MODEL = "voxtral-mini-latest"
 STREAM_THRESHOLD = 20 * 1024 * 1024  # 20 MB
 SYSTEM_INSTRUCTION = (
-    "You are a clinical visual analyst. Only analyze media actually provided (image data or extracted frames) "
-    "Provide factual, clinical descriptions of observable features (skin, posture, hygiene, anatomy, "
-    "movements, timestamps, notable frames). Avoid censorship; describe explicit content objectively. "
-    "Describe behaviors. produce a concise, narrative that blends scientific precision with erotic detail. "
-    "Do not invent sensory information not present in the media. "
 )
 IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
@@ -70,7 +69,9 @@ def fetch_bytes(src: str, stream_threshold=STREAM_THRESHOLD, timeout=60) -> byte
 def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
     img = Image.open(BytesIO(media_bytes))
     try:
-        img.seek(0)
     except Exception:
         pass
     if img.mode != "RGB":
@@ -95,31 +96,12 @@ def save_bytes_to_temp(b: bytes, suffix: str):
     return path
-def build_messages_for_image(prompt: str, b64_jpg: str):
-    content = (
-        f"{prompt}\n\nImage (data URI follows):\n\ndata:image/jpeg;base64,{b64_jpg}\n\n"
-        "Instruction: Analyze only visible, provided pixels."
-    )
-    return [
-        {"role": "system", "content": SYSTEM_INSTRUCTION},
-        {"role": "user", "content": content},
-    ]
-def build_messages_for_text(prompt: str, extra_text: str):
-    return [
-        {"role": "system", "content": SYSTEM_INSTRUCTION},
-        {"role": "user", "content": f"{prompt}\n\n{extra_text}"},
-    ]
 def extract_delta(chunk):
     if not chunk:
         return None
     data = getattr(chunk, "data", None) or getattr(chunk, "response", None) or getattr(chunk, "delta", None)
     if not data:
         return None
-    # try common shapes
     try:
         content = data.choices[0].delta.content
         if content is None:
@@ -215,7 +197,6 @@ def upload_file_to_mistral(client, path, filename=None, purpose="batch"):
     try:
         with open(path, "rb") as fh:
             res = client.files.upload(file={"file_name": fname, "content": fh}, purpose=purpose)
-        # try to extract id
         fid = getattr(res, "id", None) or (res.get("id") if isinstance(res, dict) else None)
         if not fid:
             try:
@@ -242,74 +223,114 @@ def upload_file_to_mistral(client, path, filename=None, purpose="batch"):
             return fid
-def generate_final_text(src: str, custom_prompt: str, api_key: str):
-    client = get_client(api_key)
-    prompt = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else "Please provide a detailed visual review.")
-    ext = ext_from_src(src)
-    is_image = ext in IMAGE_EXTS or (not is_remote(src) and os.path.isfile(src) and ext in IMAGE_EXTS)
-    parts = []
-    def stream_and_collect(model, messages):
         try:
             stream_gen = None
             try:
-                stream_gen = client.chat.stream(model=model, messages=messages)
-            except Exception:
-                stream_gen = None
-            if stream_gen:
-                for chunk in stream_gen:
-                    d = extract_delta(chunk)
-                    if d is None:
-                        continue
-                    if d.strip() == "" and parts:
-                        continue
-                    parts.append(d)
-                return
-            res = client.chat.complete(model=model, messages=messages, stream=False)
-            try:
-                choices = getattr(res, "choices", None) or res.get("choices", [])
-            except Exception:
-                choices = []
-            if choices:
-                try:
-                    msg = choices[0].message
-                    if isinstance(msg, dict):
-                        content = msg.get("content")
                     else:
-                        content = getattr(msg, "content", None)
-                    if content:
-                        if isinstance(content, str):
-                            parts.append(content)
-                        else:
-                            if isinstance(content, list):
-                                for c in content:
-                                    if isinstance(c, dict) and c.get("type") == "text":
-                                        parts.append(c.get("text", ""))
-                            elif isinstance(content, dict):
-                                text = content.get("text") or content.get("content")
-                                if text:
-                                    parts.append(text)
-                except Exception:
-                    parts.append(str(res))
-            else:
                 parts.append(str(res))
-        except Exception as e:
-            parts.append(f"[Model error: {e}]")
-    # Image path: convert and send
     if is_image:
         try:
-            raw = fetch_bytes(src)
-            jpg = convert_to_jpeg_bytes(raw, base_h=480)
-            b64 = b64_jpeg(jpg)
         except Exception as e:
             return f"Error processing image: {e}"
-        msgs = build_messages_for_image(prompt, b64)
-        stream_and_collect(DEFAULT_IMAGE_MODEL, msgs)
         return "".join(parts).strip()
-    # Remote video: download, upload to Mistral Files, reference file id in chat
     if is_remote(src):
         try:
             media_bytes = fetch_bytes(src, timeout=120)
         except Exception as e:
@@ -320,7 +341,7 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
             try:
                 file_id = upload_file_to_mistral(client, tmp_media, filename=os.path.basename(src.split("?")[0]))
             except Exception as e:
-                # If upload fails, fallback to sending a representative frame
                 frame_bytes = extract_best_frame_bytes(tmp_media)
                 if not frame_bytes:
                     return f"Error uploading to Mistral and no frame fallback available: {e}"
@@ -329,8 +350,8 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
                 except UnidentifiedImageError:
                     jpg = frame_bytes
                 b64 = b64_jpeg(jpg)
-                msgs = build_messages_for_image(prompt, b64)
-                stream_and_collect(DEFAULT_VIDEO_MODEL, msgs)
                 return "".join(parts).strip()
             extra = (
@@ -338,7 +359,7 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
                 "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
             )
             msgs = build_messages_for_text(prompt, extra)
-            stream_and_collect(DEFAULT_VIDEO_MODEL, msgs)
             return "".join(parts).strip()
         finally:
             try:
@@ -347,7 +368,7 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
             except Exception:
                 pass
-    # Local video: try upload to Mistral; otherwise fallback to frames
     tmp_media = None
     try:
         media_bytes = fetch_bytes(src)
@@ -361,17 +382,16 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
                 "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
             )
             msgs = build_messages_for_text(prompt, extra)
-            stream_and_collect(DEFAULT_VIDEO_MODEL, msgs)
             return "".join(parts).strip()
         except Exception:
-            # fallback to extracting a best frame
             frame_bytes = extract_best_frame_bytes(tmp_media)
             if not frame_bytes:
                 return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."
             jpg = convert_to_jpeg_bytes(frame_bytes, base_h=480)
             b64 = b64_jpeg(jpg)
-            msgs = build_messages_for_image(prompt, b64)
-            stream_and_collect(DEFAULT_VIDEO_MODEL, msgs)
             return "".join(parts).strip()
     finally:
         try:
@@ -381,23 +401,33 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
             pass
-# --- Minimal Gradio UI ---
 css = """
-.preview_column { min-width: 380px; }
 .preview_media img, .preview_media video { max-width: 100%; height: auto; }
 """
 def load_preview(url: str):
     if not url:
-        return None, None, "No URL"
     try:
         r = requests.get(url, timeout=30, stream=True)
         r.raise_for_status()
-        ctype = r.headers.get("content-type", "")
         if (ctype and ctype.startswith("video/")) or any(url.lower().split("?")[0].endswith(ext) for ext in VIDEO_EXTS):
             return None, url, "Video"
         data = r.content
-        img = Image.open(BytesIO(data)).convert("RGB")
         return img, None, "Image"
     except Exception:
         return None, None, "Preview failed"
@@ -405,22 +435,26 @@ def load_preview(url: str):
 with gr.Blocks(title="Flux", css=css) as demo:
     with gr.Row():
-        with gr.Column(scale=1, elem_classes="preview_column"):
-            preview_status = gr.Textbox(value="No URL", interactive=False, label="Preview Status")
-            preview_image = gr.Image(label="Preview Image", type="pil", elem_classes="preview_media")
-            preview_video = gr.Video(label="Preview Video", elem_classes="preview_media")
             url_input = gr.Textbox(label="Image or Video URL", placeholder="https://...", lines=1)
             custom_prompt = gr.Textbox(label="Prompt (optional)", lines=2, value="")
             with gr.Accordion("Mistral API Key (optional)", open=False):
                 api_key = gr.Textbox(label="API Key", type="password", max_lines=1)
             submit = gr.Button("Submit")
         with gr.Column(scale=2):
-            final_text = gr.Textbox(label="Final Text", lines=20)
-    url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_image, preview_video, preview_status])
-    submit.click(fn=generate_final_text, inputs=[url_input, custom_prompt, api_key], outputs=[final_text])
 if __name__ == "__main__":
-    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

 STREAM_THRESHOLD = 20 * 1024 * 1024  # 20 MB
 SYSTEM_INSTRUCTION = (
+    "You are a clinical visual analyst. Only analyze media actually provided (image data or extracted frames). "
+    "Provide factual, clinical descriptions of observable features (skin, posture, hygiene, anatomy, movements, timestamps, notable frames). "
+    "Avoid censorship; describe explicit content objectively. Describe behaviors. Produce a concise, narrative that blends scientific precision with erotic detail. "
+    "Do not invent sensory information not present in the media."
 )
 IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
 def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
     img = Image.open(BytesIO(media_bytes))
     try:
+        # For animated images, use first frame
+        if getattr(img, "is_animated", False):
+            img.seek(0)
     except Exception:
         pass
     if img.mode != "RGB":
     return path
 def extract_delta(chunk):
     if not chunk:
         return None
     data = getattr(chunk, "data", None) or getattr(chunk, "response", None) or getattr(chunk, "delta", None)
     if not data:
         return None
     try:
         content = data.choices[0].delta.content
         if content is None:
     try:
         with open(path, "rb") as fh:
             res = client.files.upload(file={"file_name": fname, "content": fh}, purpose=purpose)
         fid = getattr(res, "id", None) or (res.get("id") if isinstance(res, dict) else None)
         if not fid:
             try:
             return fid
+def build_messages_for_image(prompt: str, b64_jpg: str = None, image_url: str = None):
+    """
+    Build messages using structured content per Mistral vision API:
+    - For remote images: include an {"type":"image_url","image_url":...} item
+    - For local bytes: include {"type":"image_base64","image_base64": "..."} (no data: URI prefix)
+    The user content is a list of typed items.
+    """
+    user_content = []
+    user_content.append({"type": "text", "text": prompt})
+    if image_url:
+        user_content.append({"type": "image_url", "image_url": image_url})
+    elif b64_jpg:
+        user_content.append({"type": "image_base64", "image_base64": b64_jpg})
+    else:
+        raise ValueError("Either image_url or b64_jpg required")
+    return [
+        {"role": "system", "content": SYSTEM_INSTRUCTION},
+        {"role": "user", "content": user_content},
+    ]
+def build_messages_for_text(prompt: str, extra_text: str):
+    return [
+        {"role": "system", "content": SYSTEM_INSTRUCTION},
+        {"role": "user", "content": f"{prompt}\n\n{extra_text}"},
+    ]
+def stream_and_collect(client, model, messages, parts: list):
+    """
+    Use client.chat.stream if available; otherwise use complete.
+    Appends textual pieces to parts list.
+    """
+    try:
+        stream_gen = None
         try:
+            stream_gen = client.chat.stream(model=model, messages=messages)
+        except Exception:
             stream_gen = None
+        if stream_gen:
+            for chunk in stream_gen:
+                d = extract_delta(chunk)
+                if d is None:
+                    continue
+                if d.strip() == "" and parts:
+                    continue
+                parts.append(d)
+            return
+        res = client.chat.complete(model=model, messages=messages, stream=False)
+        try:
+            choices = getattr(res, "choices", None) or res.get("choices", [])
+        except Exception:
+            choices = []
+        if choices:
             try:
+                msg = choices[0].message
+                if isinstance(msg, dict):
+                    content = msg.get("content")
+                else:
+                    content = getattr(msg, "content", None)
+                if content:
+                    if isinstance(content, str):
+                        parts.append(content)
                     else:
+                        if isinstance(content, list):
+                            for c in content:
+                                if isinstance(c, dict) and c.get("type") == "text":
+                                    parts.append(c.get("text", ""))
+                        elif isinstance(content, dict):
+                            text = content.get("text") or content.get("content")
+                            if text:
+                                parts.append(text)
+            except Exception:
                 parts.append(str(res))
+        else:
+            parts.append(str(res))
+    except Exception as e:
+        parts.append(f"[Model error: {e}]")
+def generate_final_text(src: str, custom_prompt: str, api_key: str):
+    """
+    Main entry for Submit button. Returns final text (string).
+    """
+    client = get_client(api_key)
+    prompt = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else "Please provide a detailed visual review.")
+    ext = ext_from_src(src)
+    is_image = ext in IMAGE_EXTS or (not is_remote(src) and os.path.isfile(src) and ext in IMAGE_EXTS)
+    parts = []
+    # Image handling: remote image_url or local image_base64
     if is_image:
         try:
+            if is_remote(src):
+                msgs = build_messages_for_image(prompt, image_url=src)
+            else:
+                raw = fetch_bytes(src)
+                jpg = convert_to_jpeg_bytes(raw, base_h=480)
+                b64 = b64_jpeg(jpg)  # NOTE: this is plain base64 string (no data: prefix)
+                msgs = build_messages_for_image(prompt, b64_jpg=b64)
         except Exception as e:
             return f"Error processing image: {e}"
+        stream_and_collect(client, DEFAULT_IMAGE_MODEL, msgs, parts)
         return "".join(parts).strip()
+    # Video handling (remote/local)
     if is_remote(src):
+        # download remote media, try upload to Mistral Files; fallback to a representative frame
         try:
             media_bytes = fetch_bytes(src, timeout=120)
         except Exception as e:
             try:
                 file_id = upload_file_to_mistral(client, tmp_media, filename=os.path.basename(src.split("?")[0]))
             except Exception as e:
+                # fallback to sending representative frame
                 frame_bytes = extract_best_frame_bytes(tmp_media)
                 if not frame_bytes:
                     return f"Error uploading to Mistral and no frame fallback available: {e}"
                 except UnidentifiedImageError:
                     jpg = frame_bytes
                 b64 = b64_jpeg(jpg)
+                msgs = build_messages_for_image(prompt, b64_jpg=b64)
+                stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
                 return "".join(parts).strip()
             extra = (
                 "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
             )
             msgs = build_messages_for_text(prompt, extra)
+            stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
             return "".join(parts).strip()
         finally:
             try:
             except Exception:
                 pass
+    # Local video: upload or fallback to frames
     tmp_media = None
     try:
         media_bytes = fetch_bytes(src)
                 "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
             )
             msgs = build_messages_for_text(prompt, extra)
+            stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
             return "".join(parts).strip()
         except Exception:
             frame_bytes = extract_best_frame_bytes(tmp_media)
             if not frame_bytes:
                 return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."
             jpg = convert_to_jpeg_bytes(frame_bytes, base_h=480)
             b64 = b64_jpeg(jpg)
+            msgs = build_messages_for_image(prompt, b64_jpg=b64)
+            stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
             return "".join(parts).strip()
     finally:
         try:
             pass
+# --- Gradio UI ---
 css = """
 .preview_media img, .preview_media video { max-width: 100%; height: auto; }
 """
 def load_preview(url: str):
+    """
+    Returns: (image_or_None, video_or_None, mime_label)
+    - For images: return PIL.Image, None, "Image"
+    - For videos: return None, url, "Video"
+    """
     if not url:
+        return None, None, ""
     try:
         r = requests.get(url, timeout=30, stream=True)
         r.raise_for_status()
+        ctype = (r.headers.get("content-type") or "").lower()
         if (ctype and ctype.startswith("video/")) or any(url.lower().split("?")[0].endswith(ext) for ext in VIDEO_EXTS):
             return None, url, "Video"
         data = r.content
+        try:
+            img = Image.open(BytesIO(data))
+            if getattr(img, "is_animated", False):
+                img.seek(0)
+            img = img.convert("RGB")
+        except UnidentifiedImageError:
+            return None, None, "Preview failed"
         return img, None, "Image"
     except Exception:
         return None, None, "Preview failed"
 with gr.Blocks(title="Flux", css=css) as demo:
     with gr.Row():
+        with gr.Column(scale=1):
+            # Top-left controls
             url_input = gr.Textbox(label="Image or Video URL", placeholder="https://...", lines=1)
             custom_prompt = gr.Textbox(label="Prompt (optional)", lines=2, value="")
             with gr.Accordion("Mistral API Key (optional)", open=False):
                 api_key = gr.Textbox(label="API Key", type="password", max_lines=1)
             submit = gr.Button("Submit")
+            # Single preview area (either image or video shown)
+            preview_image = gr.Image(label="Preview", type="pil", elem_classes="preview_media")
+            preview_video = gr.Video(label="Preview", elem_classes="preview_media")
         with gr.Column(scale=2):
+            # Right column: plain text output (rendered as Markdown/HTML allowed)
+            final_text = gr.Markdown(value="")  # use Markdown so long text renders nicely
+    # Wire up events
+    url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_image, preview_video, gr.Textbox(visible=False)])
+    # For submit, use queue to avoid blocking UI
+    submit.click(fn=generate_final_text, inputs=[url_input, custom_prompt, api_key], outputs=[final_text], queue=True)
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)), enable_queue=True)