Image-To-Flux-Prompt

Running

App Files Files Community

Hug0endob committed on Dec 14, 2025

Commit

b9f10dd

verified ·

1 Parent(s): 62ffc11

Update app.py

Browse files

Files changed (1) hide show

app.py +128 -144

app.py CHANGED Viewed

@@ -11,15 +11,16 @@ from io import BytesIO
 from typing import List, Tuple
 from PIL import Image, ImageFile, UnidentifiedImageError
 import gradio as gr
-# Import Mistral client in the same way original code did.
-# If you have a different client interface, adjust get_client/upload_file_to_mistral accordingly.
 try:
     from mistralai import Mistral
 except Exception:
-    Mistral = None  # Fallback; upload will use raw HTTP if needed
-# --- Configuration / constants ---
 DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
 PIXTRAL_MODEL = "pixtral-12b-2409"
 VIDEO_MODEL = "voxtral-mini-latest"
@@ -35,16 +36,15 @@ SYSTEM_INSTRUCTION = (
     "Do not invent sensory information not present in the media."
 )
-# Pillow config
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 Image.MAX_IMAGE_PIXELS = 10000 * 10000
-# --- Utilities ---
 def get_client(key: str | None = None):
     api_key = (key or "").strip() or DEFAULT_KEY
     if Mistral is None:
-        # If mistralai package is not available, return a thin object with api_key attr for upload fallback.
-        class Dummy:
             def __init__(self, k): self.api_key = k
         return Dummy(api_key)
     return Mistral(api_key=api_key)
@@ -58,7 +58,7 @@ def ext_from_src(src: str) -> str:
     _, ext = os.path.splitext((src or "").split("?")[0])
     return ext.lower()
-def safe_head(url: str, timeout: int = 6) -> requests.Response | None:
     try:
         r = requests.head(url, timeout=timeout, allow_redirects=True)
         if r.status_code >= 400:
@@ -67,20 +67,18 @@ def safe_head(url: str, timeout: int = 6) -> requests.Response | None:
     except Exception:
         return None
-def safe_get(url: str, timeout: int = 15) -> requests.Response:
     r = requests.get(url, timeout=timeout)
     r.raise_for_status()
     return r
 def fetch_bytes(src: str, stream_threshold: int = STREAM_THRESHOLD, timeout: int = 60) -> bytes:
     if is_remote(src):
-        # try HEAD to learn content-length
         head = safe_head(src)
         if head is not None:
             cl = head.headers.get("content-length")
             try:
                 if cl and int(cl) > stream_threshold:
-                    # stream download to temp file to avoid memory spike
                     with requests.get(src, timeout=timeout, stream=True) as r:
                         r.raise_for_status()
                         fd, p = tempfile.mkstemp()
@@ -96,9 +94,7 @@ def fetch_bytes(src: str, stream_threshold: int = STREAM_THRESHOLD, timeout: int
                             try: os.remove(p)
                             except Exception: pass
             except Exception:
-                # fallthrough to simple get
                 pass
-        # regular GET
         r = safe_get(src, timeout=timeout)
         return r.content
     else:
@@ -122,15 +118,14 @@ def convert_to_jpeg_bytes(img_bytes: bytes, base_h: int = 480) -> bytes:
     if img.mode != "RGB":
         img = img.convert("RGB")
     h = base_h
-    # maintain aspect
     w = max(1, int(img.width * (h / img.height)))
     img = img.resize((w, h), Image.LANCZOS)
     buf = BytesIO()
     img.save(buf, format="JPEG", quality=85)
     return buf.getvalue()
-def b64_jpeg(img_bytes: bytes) -> str:
-    return base64.b64encode(img_bytes).decode("utf-8")
 def extract_best_frames_bytes(media_path: str, sample_count: int = 5, timeout_extract: int = 15) -> List[bytes]:
     frames = []
@@ -153,59 +148,37 @@ def extract_best_frames_bytes(media_path: str, sample_count: int = 5, timeout_ex
             except Exception: pass
     return frames
-# --- Mistral interaction helpers ---
-def build_messages_for_image(prompt: str, b64_jpg: str):
-    content = f"{prompt}\n\nImage (base64): data:image/jpeg;base64,{b64_jpg}"
-    return [{"role": "system", "content": SYSTEM_INSTRUCTION}, {"role": "user", "content": content}]
-def build_messages_for_text(prompt: str, extra: str):
-    return [{"role": "system", "content": SYSTEM_INSTRUCTION}, {"role": "user", "content": f"{prompt}\n\n{extra}"}]
-def extract_text_from_response(res, parts: list):
-    try:
-        choices = getattr(res, "choices", None) or (res.get("choices") if isinstance(res, dict) else [])
-    except Exception:
-        choices = []
-    if not choices:
-        parts.append(str(res))
-        return
-    try:
-        first = choices[0]
-        msg = first.message if hasattr(first, "message") else (first.get("message") if isinstance(first, dict) else first)
-        if isinstance(msg, dict):
-            content = msg.get("content")
-        else:
-            content = getattr(msg, "content", None)
-        if isinstance(content, str):
-            parts.append(content)
-        else:
-            parts.append(str(content))
-    except Exception:
-        parts.append(str(res))
-def chat_complete(client, model: str, messages: list) -> str:
-    # Prefer client.chat.complete if available; otherwise attempt REST call
     parts = []
     try:
         if hasattr(client, "chat") and hasattr(client.chat, "complete"):
             res = client.chat.complete(model=model, messages=messages, stream=False)
         else:
-            # Try basic HTTP request (Mistral REST)
             api_key = getattr(client, "api_key", "") or DEFAULT_KEY
-            url = f"https://api.mistral.ai/v1/chat/completions"
             headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
-            payload = {"model": model, "messages": messages}
-            r = requests.post(url, json=payload, headers=headers, timeout=120)
             r.raise_for_status()
             res = r.json()
-        extract_text_from_response(res, parts)
     except Exception as e:
-        parts.append(f"Error during model call: {e}")
-    return "".join(parts).strip()
 def upload_file_to_mistral(client, path: str, filename: str | None = None, purpose: str = "batch") -> str:
     fname = filename or os.path.basename(path)
-    # Prefer SDK upload if available
     try:
         if hasattr(client, "files") and hasattr(client.files, "upload"):
             with open(path, "rb") as fh:
@@ -216,7 +189,6 @@ def upload_file_to_mistral(client, path: str, filename: str | None = None, purpo
             return fid
     except Exception:
         pass
-    # Fallback to HTTP upload
     api_key = getattr(client, "api_key", "") or DEFAULT_KEY
     url = "https://api.mistral.ai/v1/files"
     headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
@@ -228,106 +200,112 @@ def upload_file_to_mistral(client, path: str, filename: str | None = None, purpo
         jr = r.json()
         return jr.get("id") or jr.get("data", [{}])[0].get("id")
-def analyze_image(client, img_bytes: bytes, prompt: str) -> str:
-    jpeg = convert_to_jpeg_bytes(img_bytes, base_h=640)
-    b64 = b64_jpeg(jpeg)
-    msgs = build_messages_for_image(prompt, b64)
-    return chat_complete(client, PIXTRAL_MODEL, msgs)
-def analyze_frames_and_consolidate(client, frames: List[bytes], prompt: str) -> str:
-    per_frame = []
-    for i, fb in enumerate(frames):
-        txt = analyze_image(client, fb, f"{prompt}\n\nFrame index: {i + 1}")
-        per_frame.append(f"Frame {i + 1} analysis:\n{txt}")
-    consolidation = (
-        f"{prompt}\n\n"
-        "Consolidate the key consistent observations across the provided frame analyses below. "
-        "List consistent findings first, then note any differences between frames.\n\n"
-        + "\n\n".join(per_frame)
-    )
-    msgs = build_messages_for_text(consolidation, "")
-    summary = chat_complete(client, PIXTRAL_MODEL, msgs)
-    return "\n\n".join(per_frame + [f"Consolidated summary:\n{summary}"])
-# --- Core processing ---
-def determine_media_type_from_remote(url: str) -> Tuple[bool, bool]:
-    """
-    Returns (is_image, is_video) based on HEAD content-type or URL extension
-    """
-    is_image = False
-    is_video = False
-    if not url:
-        return is_image, is_video
-    ext = ext_from_src(url)
-    if ext in IMAGE_EXTS:
-        is_image = True
-    if ext in VIDEO_EXTS:
-        is_video = True
-    head = safe_head(url)
-    if head is not None:
-        ctype = (head.headers.get("content-type") or "").lower()
-        if ctype.startswith("video/"):
-            is_video = True; is_image = False
-        elif ctype.startswith("image/"):
-            is_image = True; is_video = False
     return is_image, is_video
 def process_media(src: str, custom_prompt: str, api_key: str) -> str:
     client = get_client(api_key)
-    prompt = custom_prompt.strip() or "Please provide a detailed visual review."
-    ext = ext_from_src(src)
-    is_image = ext in IMAGE_EXTS
-    is_video = ext in VIDEO_EXTS
-    if is_remote(src):
-        ri, rv = determine_media_type_from_remote(src)
-        if ri or rv:
-            is_image, is_video = ri, rv
     if is_image:
         try:
             raw = fetch_bytes(src)
         except Exception as e:
             return f"Error fetching image: {e}"
         try:
-            return analyze_image(client, raw, prompt)
         except UnidentifiedImageError:
             return "Error: provided file is not a valid image."
         except Exception as e:
-            return f"Error processing image: {e}"
     if is_video:
         try:
             raw = fetch_bytes(src, timeout=120)
         except Exception as e:
             return f"Error fetching video: {e}"
-        tmp_suffix = ext or ".mp4"
-        tmp_path = save_bytes_to_temp(raw, suffix=tmp_suffix)
         try:
-            # Try uploading file to Mistral first
-            try:
-                file_id = upload_file_to_mistral(client, tmp_path, filename=os.path.basename(src.split("?")[0]))
-                extra = f"Uploaded video to Mistral Files with id: {file_id}\n\nInstruction: Analyze the video contents using the uploaded file id. Do not invent frames not present."
-                msgs = build_messages_for_text(prompt, extra)
-                return chat_complete(client, VIDEO_MODEL, msgs)
-            except Exception:
-                # fallback to extracting frames
-                frames = extract_best_frames_bytes(tmp_path, sample_count=5)
-                if not frames:
-                    return "Error: could not upload remote video and no frames extracted (ffmpeg missing or extraction failed)."
-                return analyze_frames_and_consolidate(client, frames, prompt)
         finally:
-            try: os.remove(tmp_path)
             except Exception: pass
-    return "Unable to determine media type from the provided URL or file extension."
-# --- Gradio app UI helpers ---
 css = ".preview_media img, .preview_media video { max-width: 100%; height: auto; }"
 def load_preview(url: str):
-    # Returns (preview_image, preview_video) where only one is visible at a time
     empty_img = gr.update(value=None, visible=False)
     empty_vid = gr.update(value=None, visible=False)
     if not url:
         return empty_img, empty_vid
-    # Local file
     if not is_remote(url) and os.path.exists(url):
         ext = ext_from_src(url)
         if ext in VIDEO_EXTS:
@@ -335,30 +313,25 @@ def load_preview(url: str):
         if ext in IMAGE_EXTS:
             try:
                 img = Image.open(url)
-                if getattr(img, "is_animated", False):
-                    img.seek(0)
                 return gr.update(value=img.convert("RGB"), visible=True), empty_vid
             except Exception:
                 return empty_img, empty_vid
-    # Remote: first try HEAD
     head = safe_head(url)
     if head:
         ctype = (head.headers.get("content-type") or "").lower()
         if ctype.startswith("video/") or any(url.lower().split("?")[0].endswith(ext) for ext in VIDEO_EXTS):
             return empty_img, gr.update(value=url, visible=True)
-    # Finally try GET and attempt to open as image
     try:
         r = safe_get(url, timeout=15)
         img = Image.open(BytesIO(r.content))
-        if getattr(img, "is_animated", False):
-            img.seek(0)
         return gr.update(value=img.convert("RGB"), visible=True), empty_vid
     except Exception:
         return empty_img, empty_vid
-# --- Gradio app layout ---
-def create_app():
-    with gr.Blocks(title="Flux Multimodal (fixed)", css=css) as demo:
         with gr.Row():
             with gr.Column(scale=1):
                 url_input = gr.Textbox(label="Image / Video URL or local path", placeholder="https://... or /path/to/file", lines=1)
@@ -371,19 +344,30 @@ def create_app():
             with gr.Column(scale=2):
                 final_md = gr.Markdown(value="")
-        # Update preview on change
         url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_image, preview_video])
-        def submit_wrapper(url, prompt, key):
             try:
-                return process_media(url or "", prompt or "", key or "")
             except Exception as e:
-                return f"Unhandled error: {e}"
-        submit_btn.click(fn=submit_wrapper, inputs=[url_input, custom_prompt, api_key], outputs=[final_md])
     return demo
 if __name__ == "__main__":
-    demo = create_app()
     demo.queue().launch()

 from typing import List, Tuple
 from PIL import Image, ImageFile, UnidentifiedImageError
 import gradio as gr
+import threading
+import time
+# Optional SDK client (works if installed)
 try:
     from mistralai import Mistral
 except Exception:
+    Mistral = None
+# Config
 DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
 PIXTRAL_MODEL = "pixtral-12b-2409"
 VIDEO_MODEL = "voxtral-mini-latest"
     "Do not invent sensory information not present in the media."
 )
+# Pillow
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 Image.MAX_IMAGE_PIXELS = 10000 * 10000
+# --- Helpers ---
 def get_client(key: str | None = None):
+    """Return a Mistral client for *key*, falling back to ``DEFAULT_KEY``.
+
+    Surrounding whitespace in *key* is ignored; an empty or missing key selects
+    ``DEFAULT_KEY``. When the ``mistralai`` import failed (``Mistral`` is ``None``),
+    a minimal stand-in object exposing only ``api_key`` is returned instead.
+    """
+    stripped = key.strip() if key else ""
+    resolved_key = stripped if stripped else DEFAULT_KEY
+    if Mistral is not None:
+        return Mistral(api_key=resolved_key)
+    # No SDK available: hand back an object that just carries the key.
+    class Dummy:
+        def __init__(self, k):
+            self.api_key = k
+    return Dummy(resolved_key)
     _, ext = os.path.splitext((src or "").split("?")[0])
     return ext.lower()
+def safe_head(url: str, timeout: int = 6):
     try:
         r = requests.head(url, timeout=timeout, allow_redirects=True)
         if r.status_code >= 400:
     except Exception:
         return None
+def safe_get(url: str, timeout: int = 15):
+    """Issue a GET request for *url* and return the response object.
+
+    Any exception from ``requests.get`` propagates, and ``raise_for_status()``
+    is called on the response before it is returned.
+    """
+    response = requests.get(url, timeout=timeout)
+    response.raise_for_status()
+    return response
 def fetch_bytes(src: str, stream_threshold: int = STREAM_THRESHOLD, timeout: int = 60) -> bytes:
     if is_remote(src):
         head = safe_head(src)
         if head is not None:
             cl = head.headers.get("content-length")
             try:
                 if cl and int(cl) > stream_threshold:
                     with requests.get(src, timeout=timeout, stream=True) as r:
                         r.raise_for_status()
                         fd, p = tempfile.mkstemp()
                             try: os.remove(p)
                             except Exception: pass
             except Exception:
                 pass
         r = safe_get(src, timeout=timeout)
         return r.content
     else:
     if img.mode != "RGB":
         img = img.convert("RGB")
     h = base_h
     w = max(1, int(img.width * (h / img.height)))
     img = img.resize((w, h), Image.LANCZOS)
     buf = BytesIO()
     img.save(buf, format="JPEG", quality=85)
     return buf.getvalue()
+def b64_bytes(b: bytes, mime: str = "image/jpeg") -> str:
+    """Return *b* encoded as a ``data:<mime>;base64,...`` URL string."""
+    encoded = base64.b64encode(b).decode("utf-8")
+    return f"data:{mime};base64,{encoded}"
 def extract_best_frames_bytes(media_path: str, sample_count: int = 5, timeout_extract: int = 15) -> List[bytes]:
     frames = []
             except Exception: pass
     return frames
+# --- Mistral interaction (structured multimodal messages) ---
+def chat_complete(client, model: str, messages, timeout: int = 120) -> str:
+    """Send *messages* to *model* and return the reply text.
+
+    Uses ``client.chat.complete`` when the client exposes it; otherwise POSTs to the
+    Mistral REST chat-completions endpoint using the client's ``api_key`` attribute
+    (or ``DEFAULT_KEY``). *timeout* is applied to the REST request only.
+
+    Never raises: a failed call is reported as an ``"Error during model call: ..."``
+    string, and a response whose shape cannot be parsed is returned as ``str(res)``.
+    """
     try:
         if hasattr(client, "chat") and hasattr(client.chat, "complete"):
             res = client.chat.complete(model=model, messages=messages, stream=False)
         else:
             api_key = getattr(client, "api_key", "") or DEFAULT_KEY
+            url = "https://api.mistral.ai/v1/chat/completions"
             headers = {"Authorization": f"Bearer {api_key}", "Content-Type": "application/json"}
+            r = requests.post(url, json={"model": model, "messages": messages}, headers=headers, timeout=timeout)
             r.raise_for_status()
             res = r.json()
+        # The response is either an SDK object (attribute access) or decoded JSON (dict access).
+        try:
+            choices = getattr(res, "choices", None) or (res.get("choices") if isinstance(res, dict) else [])
+            first = choices[0]
+            msg = first.message if hasattr(first, "message") else (first.get("message") if isinstance(first, dict) else first)
+            content = msg.get("content") if isinstance(msg, dict) else getattr(msg, "content", None)
+        except Exception:
+            return str(res)
+        if isinstance(content, str):
+            return content.strip()
+        if isinstance(content, (list, tuple)):
+            # Chunked content: join the text pieces instead of returning a Python repr.
+            pieces = []
+            for chunk in content:
+                text = chunk.get("text") if isinstance(chunk, dict) else getattr(chunk, "text", None)
+                if isinstance(text, str):
+                    pieces.append(text)
+            if pieces:
+                return "".join(pieces).strip()
+        # Unknown content shape: fall back to its string form, as before.
+        return str(content)
     except Exception as e:
+        return f"Error during model call: {e}"
 def upload_file_to_mistral(client, path: str, filename: str | None = None, purpose: str = "batch") -> str:
     fname = filename or os.path.basename(path)
     try:
         if hasattr(client, "files") and hasattr(client.files, "upload"):
             with open(path, "rb") as fh:
             return fid
     except Exception:
         pass
     api_key = getattr(client, "api_key", "") or DEFAULT_KEY
     url = "https://api.mistral.ai/v1/files"
     headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
         jr = r.json()
         return jr.get("id") or jr.get("data", [{}])[0].get("id")
+# --- Higher-level analysis functions ---
+def analyze_image_structured(client, img_bytes: bytes, prompt: str) -> str:
+    """Ask ``PIXTRAL_MODEL`` about a single image and return the reply text.
+
+    The image is re-encoded through ``convert_to_jpeg_bytes(base_h=1024)``, embedded as a
+    base64 ``data:`` URL, and sent together with *prompt* in one user message that
+    follows the ``SYSTEM_INSTRUCTION`` system message.
+    """
+    data_url = b64_bytes(convert_to_jpeg_bytes(img_bytes, base_h=1024), mime="image/jpeg")
+    # One user turn carrying both the text prompt and the image part.
+    user_content = [
+        {"type": "text", "text": prompt},
+        {"type": "image_url", "image_url": data_url},
+    ]
+    system_msg = {"role": "system", "content": SYSTEM_INSTRUCTION}
+    user_msg = {"role": "user", "content": user_content}
+    return chat_complete(client, PIXTRAL_MODEL, [system_msg, user_msg])
+def analyze_video_cohesive(client, video_path: str, prompt: str) -> str:
+    """Produce one integrated analysis of the video at *video_path*.
+
+    First tries to upload the file and ask ``VIDEO_MODEL`` about the uploaded file id.
+    If that attempt raises, falls back to extracting up to six frames and sending them
+    to ``PIXTRAL_MODEL`` in a single request. Returns the model text, or an
+    ``"Error: ..."`` string when no usable frames are available for the fallback.
+    """
+    try:
+        file_id = upload_file_to_mistral(client, video_path, filename=os.path.basename(video_path))
+        extra_msg = (
+            f"Uploaded video file id: {file_id}\n\n"
+            "Instruction: Analyze the entire video and produce a single cohesive narrative describing consistent observations, "
+            "noting timestamps or notable segments only where necessary. Do NOT produce separate isolated per-frame reports; produce one integrated analysis."
+        )
+        messages = [
+            {"role": "system", "content": SYSTEM_INSTRUCTION},
+            {"role": "user", "content": extra_msg + "\n\n" + prompt}
+        ]
+        return chat_complete(client, VIDEO_MODEL, messages)
+    except Exception:
+        # Deliberate best-effort: any failure on the upload path switches to frame sampling.
+        frames = extract_best_frames_bytes(video_path, sample_count=6)
+        if not frames:
+            return "Error: could not upload video and no frames extracted (ffmpeg missing or failed)."
+        images_entries = []
+        for i, fb in enumerate(frames, start=1):
+            try:
+                j = convert_to_jpeg_bytes(fb, base_h=720)
+                # NOTE(review): "meta" is not a documented content-chunk field; confirm the API accepts it.
+                images_entries.append({"type": "image_url", "image_url": b64_bytes(j, mime="image/jpeg"), "meta": {"frame_index": i}})
+            except Exception:
+                # Skip frames that cannot be decoded or re-encoded.
+                continue
+        if not images_entries:
+            # Without any image parts the request would ask the model to describe frames it never saw.
+            return "Error: extracted frames could not be converted to JPEG."
+        content_list = [{"type": "text", "text": prompt + "\n\nPlease consolidate observations across these frames into a single cohesive narrative."}] + images_entries
+        messages = [{"role": "system", "content": SYSTEM_INSTRUCTION}, {"role": "user", "content": content_list}]
+        return chat_complete(client, PIXTRAL_MODEL, messages)
+# Determine type heuristics
+def determine_media_type(src: str) -> Tuple[bool, bool]:
+    """Classify *src* and return ``(is_image, is_video)``.
+
+    The file extension gives the initial guess. For remote sources, an ``image/*`` or
+    ``video/*`` Content-Type from a successful HEAD request overrides that guess.
+    """
+    ext = ext_from_src(src)
+    by_ext = (ext in IMAGE_EXTS, ext in VIDEO_EXTS)
+    if not is_remote(src):
+        return by_ext
+    head = safe_head(src)
+    if not head:
+        return by_ext
+    ctype = (head.headers.get("content-type") or "").lower()
+    if ctype.startswith("image/"):
+        return True, False
+    if ctype.startswith("video/"):
+        return False, True
+    # Header was missing or some other type: keep the extension-based guess.
+    return by_ext
 def process_media(src: str, custom_prompt: str, api_key: str) -> str:
+    """Analyze the media at *src* and return the resulting text.
+
+    *src* is classified with ``determine_media_type``. Images are fetched and passed to
+    ``analyze_image_structured``; videos are fetched, written to a temp file for
+    ``analyze_video_cohesive``, and the temp file is removed afterwards. A source that is
+    neither is tried as an image. Fetch errors, image-analysis errors and the final
+    fallback's errors are returned as message strings instead of being raised.
+    """
     client = get_client(api_key)
+    prompt = (custom_prompt or "").strip()
+    if not prompt:
+        prompt = "Please provide a detailed visual review."
+    if not src:
+        return "No URL or path provided."
+    is_image, is_video = determine_media_type(src)
+    if is_image:
+        try:
+            image_bytes = fetch_bytes(src)
+        except Exception as fetch_err:
+            return f"Error fetching image: {fetch_err}"
+        try:
+            result = analyze_image_structured(client, image_bytes, prompt)
+        except UnidentifiedImageError:
+            result = "Error: provided file is not a valid image."
+        except Exception as analysis_err:
+            result = f"Error analyzing image: {analysis_err}"
+        return result
+    if is_video:
+        try:
+            video_bytes = fetch_bytes(src, timeout=120)
+        except Exception as fetch_err:
+            return f"Error fetching video: {fetch_err}"
+        suffix = ext_from_src(src) or ".mp4"
+        tmp_path = save_bytes_to_temp(video_bytes, suffix=suffix)
+        try:
+            return analyze_video_cohesive(client, tmp_path, prompt)
+        finally:
+            # Best-effort cleanup of the temp copy; a failed delete must not mask the result.
+            try:
+                os.remove(tmp_path)
+            except Exception:
+                pass
+    # Type could not be determined: last resort is to fetch it and treat it as an image.
+    try:
+        fallback_bytes = fetch_bytes(src)
+        return analyze_image_structured(client, fallback_bytes, prompt)
+    except Exception as e:
+        return f"Unable to determine media type or fetch file: {e}"
+# --- Gradio UI ---
 css = ".preview_media img, .preview_media video { max-width: 100%; height: auto; }"
 def load_preview(url: str):
     empty_img = gr.update(value=None, visible=False)
     empty_vid = gr.update(value=None, visible=False)
     if not url:
         return empty_img, empty_vid
     if not is_remote(url) and os.path.exists(url):
         ext = ext_from_src(url)
         if ext in VIDEO_EXTS:
         if ext in IMAGE_EXTS:
             try:
                 img = Image.open(url)
+                if getattr(img, "is_animated", False): img.seek(0)
                 return gr.update(value=img.convert("RGB"), visible=True), empty_vid
             except Exception:
                 return empty_img, empty_vid
     head = safe_head(url)
     if head:
         ctype = (head.headers.get("content-type") or "").lower()
         if ctype.startswith("video/") or any(url.lower().split("?")[0].endswith(ext) for ext in VIDEO_EXTS):
             return empty_img, gr.update(value=url, visible=True)
     try:
         r = safe_get(url, timeout=15)
         img = Image.open(BytesIO(r.content))
+        if getattr(img, "is_animated", False): img.seek(0)
         return gr.update(value=img.convert("RGB"), visible=True), empty_vid
     except Exception:
         return empty_img, empty_vid
+def create_demo():
+    with gr.Blocks(title="Flux Multimodal (Pixtral fixed)", css=css) as demo:
         with gr.Row():
             with gr.Column(scale=1):
                 url_input = gr.Textbox(label="Image / Video URL or local path", placeholder="https://... or /path/to/file", lines=1)
             with gr.Column(scale=2):
                 final_md = gr.Markdown(value="")
         url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_image, preview_video])
+        # Gradio hands event handlers the *values* of their input components, so a Button
+        # listed in `inputs` arrives as its label string and cannot be toggled from inside
+        # the callback. The button is instead disabled and re-enabled by chained events
+        # that target it as an output.
+        def submit_wrapper(url, prompt, key):
+            """Run process_media and turn any unexpected exception into a message string."""
+            try:
+                return process_media(url or "", prompt or "", key or "")
+            except Exception as e:
+                return f"Unhandled error: {e}"
+        def set_submit_enabled(enabled):
+            """Build the update that makes the submit button (non-)interactive."""
+            return gr.update(interactive=enabled)
+        # 1) disable the button, 2) run the analysis, 3) re-enable the button.
+        submit_btn.click(
+            fn=lambda: set_submit_enabled(False), inputs=None, outputs=[submit_btn]
+        ).then(
+            fn=submit_wrapper, inputs=[url_input, custom_prompt, api_key], outputs=[final_md]
+        ).then(
+            fn=lambda: set_submit_enabled(True), inputs=None, outputs=[submit_btn]
+        )
     return demo
 if __name__ == "__main__":
+    demo = create_demo()
     demo.queue().launch()