Image-To-Flux-Prompt

Running

App Files Files Community

Hug0endob committed on Dec 14, 2025

Commit

b140dcf

verified ·

1 Parent(s): cf25146

Update app.py

Browse files

Files changed (1) hide show

app.py +97 -135

app.py CHANGED Viewed

@@ -3,16 +3,17 @@ import os
 import subprocess
 import tempfile
 import shutil
-from io import BytesIO
 import base64
 import requests
-from PIL import Image, UnidentifiedImageError, ImageFile
 import gradio as gr
 from mistralai import Mistral
 DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
-DEFAULT_IMAGE_MODEL = "pixtral-12b-2409"
-DEFAULT_VIDEO_MODEL = "voxtral-mini-latest"
 STREAM_THRESHOLD = 20 * 1024 * 1024
 FFMPEG_BIN = shutil.which("ffmpeg")
@@ -23,27 +24,23 @@ SYSTEM_INSTRUCTION = (
     "Do not invent sensory information not present in the media."
 )
-IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
-VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv", ".avi", ".flv")
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 Image.MAX_IMAGE_PIXELS = 10000 * 10000
 def get_client(key: str = None):
     """Build a Mistral client from *key*, using DEFAULT_KEY when *key* is blank."""
     # Whitespace-only keys count as blank and trigger the fallback.
     candidate = key.strip() if key else ""
     if not candidate:
         candidate = DEFAULT_KEY
     return Mistral(api_key=candidate)
 def is_remote(src: str) -> bool:
     """Return True when *src* is an http:// or https:// URL; empty/None is not remote."""
     if not src:
         return False
     # URL schemes are case-insensitive, so "HTTPS://host/x" must count as
     # remote too. Only the first 8 characters can hold the scheme prefix.
     return src[:8].lower().startswith(("http://", "https://"))
 def ext_from_src(src: str) -> str:
     """Return the lower-cased file extension of *src*, ignoring anything after the first "?"."""
     # None/empty input is treated as an empty string and yields "".
     without_query = (src or "").partition("?")[0]
     return os.path.splitext(without_query)[1].lower()
 def fetch_bytes(src: str, stream_threshold=STREAM_THRESHOLD, timeout=60) -> bytes:
     if is_remote(src):
         with requests.get(src, timeout=timeout, stream=True) as r:
@@ -58,20 +55,23 @@ def fetch_bytes(src: str, stream_threshold=STREAM_THRESHOLD, timeout=60) -> byte
                             if chunk:
                                 f.write(chunk)
                     with open(path, "rb") as f:
-                        data = f.read()
                 finally:
-                    try:
-                        os.remove(path)
-                    except Exception:
-                        pass
-                return data
             return r.content
     with open(src, "rb") as f:
         return f.read()
-def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
-    img = Image.open(BytesIO(media_bytes))
     try:
         if getattr(img, "is_animated", False):
             img.seek(0)
@@ -86,75 +86,54 @@ def convert_to_jpeg_bytes(media_bytes: bytes, base_h=480) -> bytes:
     img.save(buf, format="JPEG", quality=85)
     return buf.getvalue()
 def b64_jpeg(img_bytes: bytes) -> str:
     """Base64-encode *img_bytes* and return the result as text."""
     # No JPEG validation happens here; any bytes are encoded as-is.
     encoded = base64.b64encode(img_bytes)
     return str(encoded, "utf-8")
-def save_bytes_to_temp(b: bytes, suffix: str):
-    fd, path = tempfile.mkstemp(suffix=suffix)
-    os.close(fd)
-    with open(path, "wb") as f:
-        f.write(b)
-    return path
 def extract_best_frames_bytes(media_path: str, sample_count: int = 5, timeout_probe: int = 10, timeout_extract: int = 15):
     if not FFMPEG_BIN or not os.path.exists(media_path):
         return []
-    tmp_frames = []
     try:
-        probe_cmd = [FFMPEG_BIN, "-v", "error", "-show_entries", "format=duration",
-                     "-of", "default=noprint_wrappers=1:nokey=1", media_path]
-        proc = subprocess.Popen(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         try:
-            out, _ = proc.communicate(timeout=timeout_probe)
         except subprocess.TimeoutExpired:
-            proc.kill()
-            out, _ = proc.communicate()
-        duration = None
         try:
-            duration = float(out.strip().split(b"\n")[0]) if out else None
         except Exception:
-            duration = None
-        if duration and duration > 0:
-            timestamps = [(duration * i) / (sample_count + 1) for i in range(1, sample_count + 1)]
-        else:
-            timestamps = [0.5, 1.0, 2.0][:sample_count]
-        for i, t in enumerate(timestamps):
-            fd, tmp_frame = tempfile.mkstemp(suffix=f"_{i}.jpg")
-            os.close(fd)
-            cmd = [
-                FFMPEG_BIN, "-nostdin", "-y", "-i", media_path,
-                "-ss", str(t),
-                "-frames:v", "1",
-                "-q:v", "2",
-                tmp_frame
-            ]
-            proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-            try:
-                proc.communicate(timeout=timeout_extract)
-            except subprocess.TimeoutExpired:
-                try:
-                    proc.kill()
-                except Exception:
-                    pass
-                proc.communicate()
-            if proc.returncode == 0 and os.path.exists(tmp_frame) and os.path.getsize(tmp_frame) > 0:
-                with open(tmp_frame, "rb") as f:
-                    tmp_frames.append(f.read())
-            try:
-                if os.path.exists(tmp_frame):
-                    os.remove(tmp_frame)
-            except Exception:
-                pass
-        return tmp_frames
-    finally:
-        pass
 def upload_file_to_mistral(client, path, filename=None, purpose="batch"):
     fname = filename or os.path.basename(path)
@@ -185,7 +164,6 @@ def upload_file_to_mistral(client, path, filename=None, purpose="batch"):
                 raise RuntimeError(f"Upload failed to return id: {jr}")
             return fid
 def build_messages_for_image(prompt: str, b64_jpg: str = None, image_url: str = None):
     if image_url:
         content = f"{prompt}\n\nImage: {image_url}"
@@ -195,11 +173,9 @@ def build_messages_for_image(prompt: str, b64_jpg: str = None, image_url: str =
         raise ValueError("Either image_url or b64_jpg required")
     return [{"role": "system", "content": SYSTEM_INSTRUCTION}, {"role": "user", "content": content}]
 def build_messages_for_text(prompt: str, extra_text: str):
     """Return a [system, user] message list for a text-only chat request."""
     system_msg = {"role": "system", "content": SYSTEM_INSTRUCTION}
     # User content is the prompt, a blank line, then the extra text.
     user_msg = {"role": "user", "content": f"{prompt}\n\n{extra_text}"}
     return [system_msg, user_msg]
 def extract_delta(chunk):
     if not chunk:
         return None
@@ -229,7 +205,6 @@ def extract_delta(chunk):
     except Exception:
         return None
 def extract_text_from_response(res, parts: list):
     try:
         choices = getattr(res, "choices", None) or res.get("choices", [])
@@ -259,7 +234,6 @@ def extract_text_from_response(res, parts: list):
     else:
         parts.append(str(res))
 def stream_and_collect(client, model, messages, parts: list):
     norm_msgs = []
     for m in messages:
@@ -309,8 +283,7 @@ def stream_and_collect(client, model, messages, parts: list):
     res = client.chat.complete(model=model, messages=norm_msgs, stream=False)
     extract_text_from_response(res, parts)
-def analyze_image_bytes(client, img_bytes: bytes, prompt: str, model=DEFAULT_IMAGE_MODEL):
     jpg = convert_to_jpeg_bytes(img_bytes, base_h=480)
     b64 = b64_jpeg(jpg)
     msgs = build_messages_for_image(prompt, b64_jpg=b64)
@@ -318,17 +291,14 @@ def analyze_image_bytes(client, img_bytes: bytes, prompt: str, model=DEFAULT_IMA
     stream_and_collect(client, model, msgs, parts)
     return "".join(parts).strip()
-def analyze_multiple_frames(client, frames_bytes_list, prompt: str, model=DEFAULT_IMAGE_MODEL):
     results = []
     for i, fb in enumerate(frames_bytes_list):
         res = analyze_image_bytes(client, fb, f"{prompt}\n\nFrame index: {i+1}", model=model)
         results.append((i, res))
     merged = []
     for i, text in results:
         merged.append(f"Frame {i+1} analysis:\n{text}")
     consolidation_prompt = (
         prompt
         + "\n\nConsolidate the key consistent observations across the provided frame analyses below. "
@@ -337,53 +307,67 @@ def analyze_multiple_frames(client, frames_bytes_list, prompt: str, model=DEFAUL
     )
     parts = []
     msgs = build_messages_for_text(consolidation_prompt, "")
-    stream_and_collect(client, DEFAULT_IMAGE_MODEL, msgs, parts)
     consolidated = "".join(parts).strip()
     if consolidated:
         merged.append("Consolidated summary:\n" + consolidated)
     return "\n\n".join(merged)
 def generate_final_text(src: str, custom_prompt: str, api_key: str):
     client = get_client(api_key)
     prompt = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else "Please provide a detailed visual review.")
     ext = ext_from_src(src)
     is_image = ext in IMAGE_EXTS or (not is_remote(src) and os.path.isfile(src) and ext in IMAGE_EXTS)
-    parts = []
     if is_image:
         try:
-            if is_remote(src):
-                raw = fetch_bytes(src)
-                return analyze_image_bytes(client, raw, prompt, model=DEFAULT_IMAGE_MODEL)
-            else:
-                raw = fetch_bytes(src)
-                return analyze_image_bytes(client, raw, prompt, model=DEFAULT_IMAGE_MODEL)
         except UnidentifiedImageError:
             return "Error: provided file is not a valid image."
         except Exception as e:
             return f"Error processing image: {e}"
-    if is_remote(src):
         tmp_media = None
         try:
-            media_bytes = fetch_bytes(src, timeout=120)
             ext = ext_from_src(src) or ".mp4"
             tmp_media = save_bytes_to_temp(media_bytes, suffix=ext)
             try:
                 file_id = upload_file_to_mistral(client, tmp_media, filename=os.path.basename(src.split("?")[0]))
                 extra = (
-                    f"Remote video uploaded to Mistral Files with id: {file_id}\n\n"
                     "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
                 )
                 msgs = build_messages_for_text(prompt, extra)
-                stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
                 return "".join(parts).strip()
             except Exception:
                 frames = extract_best_frames_bytes(tmp_media, sample_count=5)
                 if not frames:
                     return "Error: could not upload remote video and no frames extracted."
-                return analyze_multiple_frames(client, frames, prompt, model=DEFAULT_IMAGE_MODEL)
         finally:
             try:
                 if tmp_media and os.path.exists(tmp_media):
@@ -391,36 +375,10 @@ def generate_final_text(src: str, custom_prompt: str, api_key: str):
             except Exception:
                 pass
-    tmp_media = None
-    try:
-        media_bytes = fetch_bytes(src)
-        _, ext = os.path.splitext(src) if src else ("", ".mp4")
-        ext = ext or ".mp4"
-        tmp_media = save_bytes_to_temp(media_bytes, suffix=ext)
-        try:
-            file_id = upload_file_to_mistral(client, tmp_media, filename=os.path.basename(src))
-            extra = (
-                f"Local video uploaded to Mistral Files with id: {file_id}\n\n"
-                "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
-            )
-            msgs = build_messages_for_text(prompt, extra)
-            stream_and_collect(client, DEFAULT_VIDEO_MODEL, msgs, parts)
-            return "".join(parts).strip()
-        except Exception:
-            frames = extract_best_frames_bytes(tmp_media, sample_count=5)
-            if not frames:
-                return "Unable to process the provided file. Provide a direct image/frame URL or a remote video URL."
-            return analyze_multiple_frames(client, frames, prompt, model=DEFAULT_IMAGE_MODEL)
-    finally:
-        try:
-            if tmp_media and os.path.exists(tmp_media):
-                os.remove(tmp_media)
-        except Exception:
-            pass
 css = ".preview_media img, .preview_media video { max-width: 100%; height: auto; }"
 def load_preview(url: str):
     if not url:
         return None, None, ""
@@ -442,24 +400,28 @@ def load_preview(url: str):
     except Exception:
         return None, None, "Preview failed"
-with gr.Blocks(title="Flux", css=css) as demo:
     with gr.Row():
         with gr.Column(scale=1):
-            url_input = gr.Textbox(label="Image or Video URL", placeholder="https://...", lines=1)
             custom_prompt = gr.Textbox(label="Prompt (optional)", lines=2, value="")
             with gr.Accordion("Mistral API Key (optional)", open=False):
                 api_key = gr.Textbox(label="API Key", type="password", max_lines=1)
             submit = gr.Button("Submit")
-            preview_image = gr.Image(label="Preview", type="pil", elem_classes="preview_media", visible=False)
-            preview_video = gr.Video(label="Preview", elem_classes="preview_media", visible=False)
         with gr.Column(scale=2):
             final_text = gr.Markdown(value="")
-    url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_image, preview_video, gr.Textbox(visible=False)])
     submit.click(fn=generate_final_text, inputs=[url_input, custom_prompt, api_key], outputs=[final_text])
     demo.queue()
 # Script entry point: launch `demo` bound to 0.0.0.0; the port is read from
 # the PORT environment variable and falls back to 7860 when it is unset.
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))

 import subprocess
 import tempfile
 import shutil
 import base64
 import requests
+from io import BytesIO
+from PIL import Image, ImageFile, UnidentifiedImageError
 import gradio as gr
 from mistralai import Mistral
+# Config
 DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
+PIXTRAL_MODEL = "pixtral-12b-2409"     # image-capable multimodal model
+VIDEO_MODEL = "voxtral-mini-latest"    # replace with your preferred video model
 STREAM_THRESHOLD = 20 * 1024 * 1024
 FFMPEG_BIN = shutil.which("ffmpeg")
     "Do not invent sensory information not present in the media."
 )
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 Image.MAX_IMAGE_PIXELS = 10000 * 10000
+IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
+VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv", ".avi", ".flv")
 def get_client(key: str = None):
     """Build a Mistral client from *key*, using DEFAULT_KEY when *key* is blank."""
     # Whitespace-only keys count as blank and trigger the fallback.
     candidate = key.strip() if key else ""
     if not candidate:
         candidate = DEFAULT_KEY
     return Mistral(api_key=candidate)
 def is_remote(src: str) -> bool:
     """Return True when *src* is an http:// or https:// URL; empty/None is not remote."""
     if not src:
         return False
     # URL schemes are case-insensitive, so "HTTPS://host/x" must count as
     # remote too. Only the first 8 characters can hold the scheme prefix.
     return src[:8].lower().startswith(("http://", "https://"))
 def ext_from_src(src: str) -> str:
     """Return the lower-cased file extension of *src*, ignoring anything after the first "?"."""
     # None/empty input is treated as an empty string and yields "".
     without_query = (src or "").partition("?")[0]
     return os.path.splitext(without_query)[1].lower()
 def fetch_bytes(src: str, stream_threshold=STREAM_THRESHOLD, timeout=60) -> bytes:
     if is_remote(src):
         with requests.get(src, timeout=timeout, stream=True) as r:
                             if chunk:
                                 f.write(chunk)
                     with open(path, "rb") as f:
+                        return f.read()
                 finally:
+                    try: os.remove(path)
+                    except Exception: pass
             return r.content
     with open(src, "rb") as f:
         return f.read()
+def save_bytes_to_temp(b: bytes, suffix: str):
+    fd, path = tempfile.mkstemp(suffix=suffix)
+    os.close(fd)
+    with open(path, "wb") as f:
+        f.write(b)
+    return path
+def convert_to_jpeg_bytes(img_bytes: bytes, base_h=480) -> bytes:
+    img = Image.open(BytesIO(img_bytes))
     try:
         if getattr(img, "is_animated", False):
             img.seek(0)
     img.save(buf, format="JPEG", quality=85)
     return buf.getvalue()
 def b64_jpeg(img_bytes: bytes) -> str:
     """Base64-encode *img_bytes* and return the result as text."""
     # No JPEG validation happens here; any bytes are encoded as-is.
     encoded = base64.b64encode(img_bytes)
     return str(encoded, "utf-8")
 def extract_best_frames_bytes(media_path: str, sample_count: int = 5, timeout_probe: int = 10, timeout_extract: int = 15):
     if not FFMPEG_BIN or not os.path.exists(media_path):
         return []
+    frames = []
+    probe_cmd = [FFMPEG_BIN, "-v", "error", "-show_entries", "format=duration",
+                 "-of", "default=noprint_wrappers=1:nokey=1", media_path]
+    proc = subprocess.Popen(probe_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+    try:
+        out, _ = proc.communicate(timeout=timeout_probe)
+    except subprocess.TimeoutExpired:
+        proc.kill()
+        out, _ = proc.communicate()
+    duration = None
     try:
+        duration = float(out.strip().split(b"\n")[0]) if out else None
+    except Exception:
+        duration = None
+    if duration and duration > 0:
+        timestamps = [(duration * i) / (sample_count + 1) for i in range(1, sample_count + 1)]
+    else:
+        timestamps = [0.5, 1.0, 2.0][:sample_count]
+    for i, t in enumerate(timestamps):
+        fd, tmp_frame = tempfile.mkstemp(suffix=f"_{i}.jpg")
+        os.close(fd)
+        cmd = [
+            FFMPEG_BIN, "-nostdin", "-y", "-i", media_path,
+            "-ss", str(t), "-frames:v", "1", "-q:v", "2", tmp_frame
+        ]
+        proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         try:
+            proc.communicate(timeout=timeout_extract)
         except subprocess.TimeoutExpired:
+            try: proc.kill()
+            except Exception: pass
+            proc.communicate()
+        if proc.returncode == 0 and os.path.exists(tmp_frame) and os.path.getsize(tmp_frame) > 0:
+            with open(tmp_frame, "rb") as f:
+                frames.append(f.read())
         try:
+            if os.path.exists(tmp_frame): os.remove(tmp_frame)
         except Exception:
+            pass
+    return frames
 def upload_file_to_mistral(client, path, filename=None, purpose="batch"):
     fname = filename or os.path.basename(path)
                 raise RuntimeError(f"Upload failed to return id: {jr}")
             return fid
 def build_messages_for_image(prompt: str, b64_jpg: str = None, image_url: str = None):
     if image_url:
         content = f"{prompt}\n\nImage: {image_url}"
         raise ValueError("Either image_url or b64_jpg required")
     return [{"role": "system", "content": SYSTEM_INSTRUCTION}, {"role": "user", "content": content}]
 def build_messages_for_text(prompt: str, extra_text: str):
     """Return a [system, user] message list for a text-only chat request."""
     system_msg = {"role": "system", "content": SYSTEM_INSTRUCTION}
     # User content is the prompt, a blank line, then the extra text.
     user_msg = {"role": "user", "content": f"{prompt}\n\n{extra_text}"}
     return [system_msg, user_msg]
 def extract_delta(chunk):
     if not chunk:
         return None
     except Exception:
         return None
 def extract_text_from_response(res, parts: list):
     try:
         choices = getattr(res, "choices", None) or res.get("choices", [])
     else:
         parts.append(str(res))
 def stream_and_collect(client, model, messages, parts: list):
     norm_msgs = []
     for m in messages:
     res = client.chat.complete(model=model, messages=norm_msgs, stream=False)
     extract_text_from_response(res, parts)
+def analyze_image_bytes(client, img_bytes: bytes, prompt: str, model=PIXTRAL_MODEL):
     jpg = convert_to_jpeg_bytes(img_bytes, base_h=480)
     b64 = b64_jpeg(jpg)
     msgs = build_messages_for_image(prompt, b64_jpg=b64)
     stream_and_collect(client, model, msgs, parts)
     return "".join(parts).strip()
+def analyze_multiple_frames(client, frames_bytes_list, prompt: str, model=PIXTRAL_MODEL):
     results = []
     for i, fb in enumerate(frames_bytes_list):
         res = analyze_image_bytes(client, fb, f"{prompt}\n\nFrame index: {i+1}", model=model)
         results.append((i, res))
     merged = []
     for i, text in results:
         merged.append(f"Frame {i+1} analysis:\n{text}")
     consolidation_prompt = (
         prompt
         + "\n\nConsolidate the key consistent observations across the provided frame analyses below. "
     )
     parts = []
     msgs = build_messages_for_text(consolidation_prompt, "")
+    stream_and_collect(client, PIXTRAL_MODEL, msgs, parts)
     consolidated = "".join(parts).strip()
     if consolidated:
         merged.append("Consolidated summary:\n" + consolidated)
     return "\n\n".join(merged)
 def generate_final_text(src: str, custom_prompt: str, api_key: str):
     client = get_client(api_key)
     prompt = (custom_prompt.strip() if custom_prompt and custom_prompt.strip() else "Please provide a detailed visual review.")
+    parts = []
     ext = ext_from_src(src)
     is_image = ext in IMAGE_EXTS or (not is_remote(src) and os.path.isfile(src) and ext in IMAGE_EXTS)
+    is_video = ext in VIDEO_EXTS or (not is_remote(src) and os.path.isfile(src) and ext in VIDEO_EXTS)
+    # If remote and content-type suggests video, treat as video
+    if is_remote(src):
+        try:
+            r = requests.head(src, timeout=10, allow_redirects=True)
+            ctype = (r.headers.get("content-type") or "").lower()
+            if ctype.startswith("video/"):
+                is_video = True
+            elif ctype.startswith("image/"):
+                is_image = True
+        except Exception:
+            pass
     if is_image:
         try:
+            raw = fetch_bytes(src)
+        except Exception as e:
+            return f"Error fetching image: {e}"
+        try:
+            return analyze_image_bytes(client, raw, prompt, model=PIXTRAL_MODEL)
         except UnidentifiedImageError:
             return "Error: provided file is not a valid image."
         except Exception as e:
             return f"Error processing image: {e}"
+    if is_video:
         tmp_media = None
         try:
+            try:
+                media_bytes = fetch_bytes(src, timeout=120)
+            except Exception as e:
+                return f"Error fetching video: {e}"
             ext = ext_from_src(src) or ".mp4"
             tmp_media = save_bytes_to_temp(media_bytes, suffix=ext)
             try:
                 file_id = upload_file_to_mistral(client, tmp_media, filename=os.path.basename(src.split("?")[0]))
                 extra = (
+                    f"Uploaded video to Mistral Files with id: {file_id}\n\n"
                     "Instruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
                 )
                 msgs = build_messages_for_text(prompt, extra)
+                stream_and_collect(client, VIDEO_MODEL, msgs, parts)
                 return "".join(parts).strip()
             except Exception:
                 frames = extract_best_frames_bytes(tmp_media, sample_count=5)
                 if not frames:
                     return "Error: could not upload remote video and no frames extracted."
+                return analyze_multiple_frames(client, frames, prompt, model=PIXTRAL_MODEL)
         finally:
             try:
                 if tmp_media and os.path.exists(tmp_media):
             except Exception:
                 pass
+    return "Unable to determine media type from the provided URL or file extension."
+# UI helpers
 css = ".preview_media img, .preview_media video { max-width: 100%; height: auto; }"
 def load_preview(url: str):
     if not url:
         return None, None, ""
     except Exception:
         return None, None, "Preview failed"
+# Gradio app
+with gr.Blocks(title="Flux Multimodal", css=css) as demo:
     with gr.Row():
         with gr.Column(scale=1):
+            url_input = gr.Textbox(label="Image or Video URL or local path", placeholder="https://... or /path/to/file", lines=1)
             custom_prompt = gr.Textbox(label="Prompt (optional)", lines=2, value="")
             with gr.Accordion("Mistral API Key (optional)", open=False):
                 api_key = gr.Textbox(label="API Key", type="password", max_lines=1)
             submit = gr.Button("Submit")
+            preview_image = gr.Image(label="Preview Image", type="pil", elem_classes="preview_media", visible=False)
+            preview_video = gr.Video(label="Preview Video", elem_classes="preview_media", visible=False)
         with gr.Column(scale=2):
             final_text = gr.Markdown(value="")
+    def _preview_wrapper(url):
+        img, vid, label = load_preview(url)
+        return img, vid, label
+    url_input.change(fn=_preview_wrapper, inputs=[url_input], outputs=[preview_image, preview_video, gr.Textbox(visible=False)])
     submit.click(fn=generate_final_text, inputs=[url_input, custom_prompt, api_key], outputs=[final_text])
     demo.queue()
 # Script entry point: launch `demo` bound to 0.0.0.0; the port is read from
 # the PORT environment variable and falls back to 7860 when it is unset.
 if __name__ == "__main__":
     demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))