Image-To-Flux-Prompt

Running

App Files Files Community

Hug0endob committed on Dec 14, 2025

Commit

b5d1da5

verified ·

1 Parent(s): aab85a4

Update app.py

Browse files

Files changed (1) hide show

app.py +250 -1

app.py CHANGED Viewed

@@ -30,7 +30,7 @@ except Exception:  # pragma: no cover
 DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
 PIXTRAL_MODEL = "pixtral-12b-2409"
 VIDEO_MODEL = "voxtral-mini-latest"
-STREAM_THRESHOLD = 20 * 1024 * 1024          # 20 MiB
 FFMPEG_BIN = shutil.which("ffmpeg")
 IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
 VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv", ".avi", ".flv")
@@ -56,6 +56,7 @@ def get_client(key: str | None = None):
     if Mistral is None:
         class Dummy:
             def __init__(self, k): self.api_key = k
         return Dummy(api_key)
     return Mistral(api_key=api_key)
@@ -262,3 +263,251 @@ def upload_file_to_mistral(
                 fid = res["data"][0]["id"]
             return fid
     except Exception:

 DEFAULT_KEY = os.getenv("MISTRAL_API_KEY", "")
 PIXTRAL_MODEL = "pixtral-12b-2409"
 VIDEO_MODEL = "voxtral-mini-latest"
+STREAM_THRESHOLD = 20 * 1024 * 1024  # 20 MiB
 FFMPEG_BIN = shutil.which("ffmpeg")
 IMAGE_EXTS = (".jpg", ".jpeg", ".png", ".webp", ".gif")
 VIDEO_EXTS = (".mp4", ".mov", ".webm", ".mkv", ".avi", ".flv")
     if Mistral is None:
         class Dummy:
             def __init__(self, k): self.api_key = k
         return Dummy(api_key)
     return Mistral(api_key=api_key)
                 fid = res["data"][0]["id"]
             return fid
     except Exception:
+        pass
+    # Raw‑HTTP fallback ---------------------------------------
+    api_key = getattr(client, "api_key", "") or DEFAULT_KEY
+    url = "https://api.mistral.ai/v1/files"
+    headers = {"Authorization": f"Bearer {api_key}"} if api_key else {}
+    with open(path, "rb") as fh:
+        files = {"file": (fname, fh)}
+        data = {"purpose": purpose}
+        r = requests.post(url, headers=headers, files=files, data=data, timeout=timeout)
+        r.raise_for_status()
+        jr = r.json()
+        return jr.get("id") or jr.get("data", [{}])[0].get("id")
+def analyze_image_structured(client, img_bytes: bytes, prompt: str) -> str:
+    """Describe a single image with the Pixtral model.
+
+    The raw bytes go through ``convert_to_jpeg_bytes`` (``base_h=1024``), are
+    wrapped by ``b64_bytes`` with an ``image/jpeg`` MIME type, and are sent
+    together with *prompt* as one user message after the system instruction.
+
+    Returns whatever ``chat_complete`` yields for ``PIXTRAL_MODEL``.
+    """
+    encoded = b64_bytes(convert_to_jpeg_bytes(img_bytes, base_h=1024), mime="image/jpeg")
+    user_parts = [
+        {"type": "text", "text": prompt},
+        {"type": "image_url", "image_url": encoded},
+    ]
+    system_msg = {"role": "system", "content": SYSTEM_INSTRUCTION}
+    user_msg = {"role": "user", "content": user_parts}
+    return chat_complete(client, PIXTRAL_MODEL, [system_msg, user_msg])
+def analyze_video_cohesive(client, video_path: str, prompt: str) -> str:
+    """Analyze a video, falling back to sampled frames if the upload path fails.
+
+    The file is first uploaded with ``upload_file_to_mistral`` and ``VIDEO_MODEL``
+    is asked for one cohesive narrative. If anything in that path raises, frames
+    are pulled with ``extract_best_frames_bytes`` and sent to ``PIXTRAL_MODEL``.
+
+    Args:
+        client: Client object handed to the upload and chat helpers.
+        video_path: Local path of the video file.
+        prompt: User prompt appended to the request.
+
+    Returns:
+        The reply from ``chat_complete``, or a string starting with ``"Error:"``
+        when the fallback has no usable frame to send.
+    """
+    try:
+        file_id = upload_file_to_mistral(client, video_path, filename=os.path.basename(video_path))
+        extra_msg = (
+            f"Uploaded video file id: {file_id}\n\n"
+            "Instruction: Analyze the entire video and produce a single cohesive narrative describing consistent observations."
+        )
+        messages = [
+            {"role": "system", "content": SYSTEM_INSTRUCTION},
+            {"role": "user", "content": extra_msg + "\n\n" + prompt},
+        ]
+        return chat_complete(client, VIDEO_MODEL, messages)
+    except Exception:
+        # Deliberate best-effort: any upload/chat failure drops to the frame path.
+        frames = extract_best_frames_bytes(video_path, sample_count=6)
+        if not frames:
+            return "Error: could not upload video and no frames could be extracted."
+        image_entries = []
+        for i, fb in enumerate(frames, start=1):
+            try:
+                data_url = b64_bytes(convert_to_jpeg_bytes(fb, base_h=720), mime="image/jpeg")
+            except Exception:
+                # A single bad frame should not abort the whole fallback.
+                continue
+            image_entries.append(
+                {
+                    "type": "image_url",
+                    "image_url": data_url,
+                    "meta": {"frame_index": i},
+                }
+            )
+        if not image_entries:
+            # Without this guard a text-only request would ask the vision model
+            # to describe frames that were never attached.
+            return "Error: could not upload video and none of the extracted frames could be converted."
+        content = [
+            {"type": "text", "text": prompt + "\n\nPlease consolidate observations across these frames into a single cohesive narrative."}
+        ] + image_entries
+        messages = [
+            {"role": "system", "content": SYSTEM_INSTRUCTION},
+            {"role": "user", "content": content},
+        ]
+        return chat_complete(client, PIXTRAL_MODEL, messages)
+def determine_media_type(src: str) -> Tuple[bool, bool]:
+    """Classify *src*; the result is the pair ``(is_image, is_video)``.
+
+    The extension from ``ext_from_src`` gives the initial guess. For remote
+    sources a HEAD response (via ``safe_head``) with an ``image/`` or ``video/``
+    content type overrides that guess; any other outcome keeps it.
+    """
+    ext = ext_from_src(src)
+    by_ext = (ext in IMAGE_EXTS, ext in VIDEO_EXTS)
+    if not is_remote(src):
+        return by_ext
+    head = safe_head(src)
+    if not head:
+        return by_ext
+    ctype = (head.headers.get("content-type") or "").lower()
+    if ctype.startswith("image/"):
+        return True, False
+    if ctype.startswith("video/"):
+        return False, True
+    return by_ext
+def process_media(src: str, custom_prompt: str, api_key: str, progress=gr.Progress()) -> str:
+    """Fetch *src*, decide whether it is an image or a video, and analyze it.
+
+    Args:
+        src: URL or local path of the media.
+        custom_prompt: Optional user prompt; a default review prompt is used
+            when it is blank.
+        api_key: Key forwarded to ``get_client``.
+        progress: Gradio progress callback used to report stages.
+
+    Returns:
+        The analysis text, or a message describing why fetching or analysis
+        failed (those failures are returned as strings, not raised).
+    """
+    client = get_client(api_key)
+    prompt = (custom_prompt or "").strip() or "Please provide a detailed visual review."
+    if not src:
+        return "No URL or path provided."
+    progress(0.05, desc="Determining media type")
+    is_image, is_video = determine_media_type(src)
+    if is_image:
+        try:
+            raw = fetch_bytes(src)
+        except Exception as e:
+            return f"Error fetching image: {e}"
+        progress(0.2, desc="Analyzing image")
+        try:
+            return analyze_image_structured(client, raw, prompt)
+        except UnidentifiedImageError:
+            return "Error: provided file is not a valid image."
+        except Exception as e:
+            return f"Error analyzing image: {e}"
+    if is_video:
+        try:
+            raw = fetch_bytes(src, timeout=120)
+        except Exception as e:
+            return f"Error fetching video: {e}"
+        tmp_path = save_bytes_to_temp(raw, suffix=ext_from_src(src) or ".mp4")
+        try:
+            progress(0.2, desc="Analyzing video")
+            return analyze_video_cohesive(client, tmp_path, prompt)
+        except Exception as e:
+            # Mirror the image branch: report the failure as text instead of
+            # letting it escape to the UI as an unhandled exception.
+            return f"Error analyzing video: {e}"
+        finally:
+            # Best-effort cleanup of the temporary copy; never mask the result.
+            try:
+                os.remove(tmp_path)
+            except Exception:
+                pass
+    # Fallback: type could not be determined, so try it as an image
+    try:
+        raw = fetch_bytes(src)
+        progress(0.2, desc="Treating as image")
+        return analyze_image_structured(client, raw, prompt)
+    except Exception as e:
+        return f"Unable to determine media type or fetch file: {e}"
+# ----------------------------------------------------------------------
+# Gradio UI helpers
+# ----------------------------------------------------------------------
+css = ".preview_media img, .preview_media video { max-width: 100%; height: auto; }"
+def load_preview(url: str):
+    """Build preview updates for the image and video components.
+
+    Args:
+        url: Remote URL or local path typed by the user; may be empty.
+
+    Returns:
+        A pair ``(image_update, video_update)`` of ``gr.update`` objects. At
+        most one of them is visible; both are hidden when *url* is empty or
+        nothing previewable could be loaded.
+    """
+    empty_img = gr.update(value=None, visible=False)
+    empty_vid = gr.update(value=None, visible=False)
+    if not url:
+        return empty_img, empty_vid
+    # Local file handling
+    if not is_remote(url) and os.path.exists(url):
+        ext = ext_from_src(url)
+        if ext in VIDEO_EXTS:
+            return empty_img, gr.update(value=os.path.abspath(url), visible=True)
+        if ext in IMAGE_EXTS:
+            try:
+                # Context manager closes the underlying file once the RGB copy
+                # has been made, instead of leaving the handle to the GC.
+                with Image.open(url) as img:
+                    if getattr(img, "is_animated", False):
+                        img.seek(0)  # preview only the first frame
+                    rgb = img.convert("RGB")
+                return gr.update(value=rgb, visible=True), empty_vid
+            except Exception:
+                return empty_img, empty_vid
+    # Remote handling – try to infer from headers
+    head = safe_head(url)
+    if head:
+        ctype = (head.headers.get("content-type") or "").lower()
+        # str.endswith accepts a tuple, so VIDEO_EXTS can be passed directly.
+        if ctype.startswith("video/") or url.lower().endswith(VIDEO_EXTS):
+            return empty_img, gr.update(value=url, visible=True)
+    # Try to load as image
+    try:
+        r = safe_get(url, timeout=15)
+        with Image.open(BytesIO(r.content)) as img:
+            if getattr(img, "is_animated", False):
+                img.seek(0)  # preview only the first frame
+            rgb = img.convert("RGB")
+        return gr.update(value=rgb, visible=True), empty_vid
+    except Exception:
+        return empty_img, empty_vid
+def _btn_label_for_status(status: str) -> str:
+    """Map a status name to the submit button's caption.
+
+    A falsy status is treated as ``"idle"``; unknown names get ``"Submit"``.
+    """
+    key = status if status else "idle"
+    captions = dict(
+        idle="Submit",
+        busy="Processing…",
+        done="Submit",
+        error="Retry",
+    )
+    return captions.get(key, "Submit")
+# ----------------------------------------------------------------------
+# Build Gradio demo
+# ----------------------------------------------------------------------
+def create_demo():
+    with gr.Blocks(title="Flux Multimodal (Pixtral / Voxtral)", css=css) as demo:
+        with gr.Row():
+            with gr.Column(scale=1):
+                url_input = gr.Textbox(
+                    label="Image / Video URL or local path",
+                    placeholder="https://... or /path/to/file",
+                    lines=1,
+                )
+                custom_prompt = gr.Textbox(label="Prompt (optional)", lines=2, value="")
+                with gr.Accordion("Mistral API Key (optional)", open=False):
+                    api_key = gr.Textbox(label="API Key", type="password", max_lines=1)
+                submit_btn = gr.Button("Submit")
+                clear_btn = gr.Button("Clear")
+                preview_image = gr.Image(
+                    label="Preview Image",
+                    type="pil",
+                    elem_classes="preview_media",
+                    visible=False,
+                )
+                preview_video = gr.Video(
+                    label="Preview Video",
+                    elem_classes="preview_media",
+                    visible=False,
+                )
+            with gr.Column(scale=2):
+                final_md = gr.Markdown(value="")
+        # Live preview
+        url_input.change(fn=load_preview, inputs=[url_input], outputs=[preview_image, preview_video])
+        # Clear button
+        clear_btn.click(
+            fn=lambda: (
+                "",  # clear textbox
+                gr.update(value=None, visible=False),  # hide image
+                gr.update(value=None, visible=False),  # hide video
+            ),
+            inputs=[],
+            outputs=[url_input, preview_image, preview_video],
+        )
+        # State to track button status
+        status = gr.State("idle")
+        def start_busy() -> str:
+            return "busy"
+        def worker(url: str, prompt: str, key: str, progress=gr.Progress()):
+            return process_media(url or "", prompt or "", key or "", progress=progress)
+        def finish(result: str) -> tuple[str, str]:
+            if not result or result.lower().startswith(("error", "unhandled"))