Spaces:

prithivMLmods
/

Multimodal-Edge-Node

Running on Zero

App Files Files Community

prithivMLmods committed on Apr 30

Commit

153a20d

verified ·

1 Parent(s): 383ed70

Update app.py

Browse files

Files changed (1) hide show

app.py +435 -87

app.py CHANGED Viewed

@@ -1,17 +1,26 @@
-import gradio as gr
-import torch
-import spaces
 import json
 import ast
 import re
 from threading import Thread
 from PIL import Image
 from transformers import (
     Qwen3_5ForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = (
     torch.bfloat16
@@ -23,58 +32,31 @@ MODEL_NAME = "Qwen/Qwen3.5-2B"
 CATEGORIES = ["Query", "Caption", "Point", "Detect"]
 print(f"Loading model: {MODEL_NAME} ...")
-qwen_model = Qwen3_5ForConditionalGeneration.from_pretrained(
-    MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
-).eval()
-qwen_processor = AutoProcessor.from_pretrained(MODEL_NAME)
-print("Model loaded.")
-def safe_parse_json(text: str):
-    text = text.strip()
-    text = re.sub(r"^```(json)?", "", text)
-    text = re.sub(r"```$", "", text)
-    text = text.strip()
-    try:
-        return json.loads(text)
-    except json.JSONDecodeError:
-        pass
-    try:
-        return ast.literal_eval(text)
-    except Exception:
-        return {}
-def on_category_change(category: str):
-    placeholders = {
-        "Query": "e.g., Count the total number of boats and describe the environment.",
-        "Caption": "e.g., short, normal, detailed",
-        "Point": "e.g., The gun held by the person.",
-        "Detect": "e.g., The headlight of the car.",
-    }
-    return gr.Textbox(placeholder=placeholders.get(category, "Enter your prompt here."))
-@spaces.GPU
-def process_inputs(image, category, prompt):
-    if image is None:
-        raise gr.Error("Please upload an image.")
-    if not prompt or not prompt.strip():
-        raise gr.Error("Please provide a prompt.")
-    image = image.convert("RGB")
-    image.thumbnail((512, 512))
     if category == "Query":
-        full_prompt = prompt
     elif category == "Caption":
-        full_prompt = f"Provide a {prompt} length caption for the image."
     elif category == "Point":
-        full_prompt = f"Provide 2d point coordinates for {prompt}. Report in JSON format."
     elif category == "Detect":
-        full_prompt = f"Provide bounding box coordinates for {prompt}. Report in JSON format."
-    else:
-        full_prompt = prompt
     messages = [
         {
@@ -85,6 +67,7 @@ def process_inputs(image, category, prompt):
             ],
         }
     ]
     text = qwen_processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
@@ -98,6 +81,7 @@ def process_inputs(image, category, prompt):
         skip_special_tokens=True,
         timeout=120,
     )
     thread = Thread(
         target=qwen_model.generate,
         kwargs=dict(
@@ -111,48 +95,412 @@ def process_inputs(image, category, prompt):
     )
     thread.start()
-    full_text = ""
     for tok in streamer:
-        full_text += tok
-        yield full_text
     thread.join()
-with gr.Blocks() as demo:
-    gr.Markdown("## Qwen 3.5 - Image Understanding")
-    with gr.Row():
-        with gr.Column():
-            image_input = gr.Image(type="pil", label="Upload Image", height=350)
-            category_select = gr.Dropdown(
-                choices=CATEGORIES,
-                value="Query",
-                label="Task Category",
-                interactive=True,
-            )
-            prompt_input = gr.Textbox(
-                placeholder="e.g., Count the total number of boats and describe the environment.",
-                label="Prompt",
-                lines=3,
-            )
-            run_btn = gr.Button("Run", variant="primary")
-        with gr.Column():
-            output_text = gr.Textbox(label="Output", lines=20, interactive=False)
-    category_select.change(
-        fn=on_category_change,
-        inputs=[category_select],
-        outputs=[prompt_input],
-    )
-    run_btn.click(
-        fn=process_inputs,
-        inputs=[image_input, category_select, prompt_input],
-        outputs=[output_text],
     )
-if __name__ == "__main__":
-    demo.launch(show_error=True, ssr_mode=False)

+import os
+import io
 import json
 import ast
 import re
 from threading import Thread
 from PIL import Image
+import torch
+import spaces
 from transformers import (
     Qwen3_5ForConditionalGeneration,
     AutoProcessor,
     TextIteratorStreamer,
 )
+from gradio import Server
+from fastapi import Request, UploadFile, File, Form, HTTPException
+from fastapi.responses import HTMLResponse, StreamingResponse
+# --- App Configuration & Initializations ---
+app = Server()
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 DTYPE = (
     torch.bfloat16
 CATEGORIES = ["Query", "Caption", "Point", "Detect"]
 print(f"Loading model: {MODEL_NAME} ...")
+try:
+    qwen_model = Qwen3_5ForConditionalGeneration.from_pretrained(
+        MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
+    ).eval()
+    qwen_processor = AutoProcessor.from_pretrained(MODEL_NAME)
+    print("Model loaded successfully.")
+except Exception as e:
+    print(f"Warning: Model failed to load (ignoring if building environment). Error: {e}")
+# --- Helper Functions ---
+def process_prompt_by_category(category: str, prompt: str) -> str:
     if category == "Query":
+        return prompt
     elif category == "Caption":
+        return f"Provide a {prompt} length caption for the image."
     elif category == "Point":
+        return f"Provide 2d point coordinates for {prompt}. Report in JSON format."
     elif category == "Detect":
+        return f"Provide bounding box coordinates for {prompt}. Report in JSON format."
+    return prompt
+# --- Generator with ZeroGPU Space wrapper ---
+@spaces.GPU(duration=120)
+def generate_stream(image: Image.Image, category: str, prompt: str):
+    full_prompt = process_prompt_by_category(category, prompt)
     messages = [
         {
             ],
         }
     ]
     text = qwen_processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
         skip_special_tokens=True,
         timeout=120,
     )
     thread = Thread(
         target=qwen_model.generate,
         kwargs=dict(
     )
     thread.start()
     for tok in streamer:
+        yield tok
     thread.join()
+# --- FastAPI Endpoints ---
+@app.post("/api/run")
+async def run_node_graph(
+    image: UploadFile = File(...),
+    category: str = Form(...),
+    prompt: str = Form(...)
+):
+    if not image:
+        raise HTTPException(status_code=400, detail="Image is required")
+    if not prompt.strip():
+        raise HTTPException(status_code=400, detail="Prompt is required")
+    if category not in CATEGORIES:
+        raise HTTPException(status_code=400, detail="Invalid Category")
+    try:
+        image_bytes = await image.read()
+        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+        pil_image.thumbnail((512, 512)) # Downscale to fit limits
+    except Exception as e:
+        raise HTTPException(status_code=400, detail=f"Invalid image file: {e}")
+    # Return a StreamingResponse to stream tokens to the frontend
+    return StreamingResponse(
+        generate_stream(pil_image, category, prompt),
+        media_type="text/plain"
     )
+@app.get("/", response_class=HTMLResponse)
+async def homepage(request: Request):
+    return """
+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Multimodal-Edge-Comparator</title>
+    <style>
+        :root {
+            --bg-color: #1e1e1e;
+            --grid-color: #2a2a2a;
+            --node-bg: #333333;
+            --node-border: #444444;
+            --text-color: #eeeeee;
+            --port-color: #64b5f6;
+            --wire-color: #81c784;
+            --title-input: #e53935;
+            --title-task: #1e88e5;
+            --title-output: #43a047;
+        }
+        body {
+            margin: 0;
+            padding: 0;
+            background-color: var(--bg-color);
+            background-image:
+                linear-gradient(var(--grid-color) 1px, transparent 1px),
+                linear-gradient(90deg, var(--grid-color) 1px, transparent 1px);
+            background-size: 20px 20px;
+            font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+            color: var(--text-color);
+            overflow: hidden;
+            width: 100vw;
+            height: 100vh;
+        }
+        #topbar {
+            position: absolute;
+            top: 0; left: 0; right: 0;
+            background: rgba(0,0,0,0.6);
+            padding: 10px 20px;
+            font-size: 18px;
+            font-weight: bold;
+            color: #ccc;
+            z-index: 1000;
+            pointer-events: none;
+            display: flex;
+            justify-content: space-between;
+        }
+        #topbar a { color: #fff; text-decoration: none; }
+        /* SVG Wire Canvas */
+        #wire-canvas {
+            position: absolute;
+            top: 0; left: 0;
+            width: 100%; height: 100%;
+            pointer-events: none;
+            z-index: 1;
+        }
+        .wire-path {
+            fill: none;
+            stroke: var(--wire-color);
+            stroke-width: 4;
+            stroke-linecap: round;
+        }
+        /* Nodes */
+        .node {
+            position: absolute;
+            background: var(--node-bg);
+            border: 1px solid var(--node-border);
+            border-radius: 8px;
+            box-shadow: 0 4px 10px rgba(0,0,0,0.5);
+            min-width: 250px;
+            z-index: 10;
+            display: flex;
+            flex-direction: column;
+        }
+        .node-header {
+            padding: 8px 12px;
+            font-weight: bold;
+            font-size: 14px;
+            border-top-left-radius: 7px;
+            border-top-right-radius: 7px;
+            cursor: grab;
+            user-select: none;
+            color: white;
+            text-shadow: 1px 1px 2px rgba(0,0,0,0.8);
+        }
+        .node-header:active { cursor: grabbing; }
+        .node-content {
+            padding: 15px;
+            display: flex;
+            flex-direction: column;
+            gap: 10px;
+        }
+        /* Ports */
+        .port {
+            width: 14px; height: 14px;
+            background: var(--port-color);
+            border-radius: 50%;
+            position: absolute;
+            top: 50%;
+            transform: translateY(-50%);
+            border: 2px solid var(--node-bg);
+            z-index: 15;
+        }
+        .port-out { right: -8px; }
+        .port-in { left: -8px; }
+        /* Controls */
+        input[type="file"], select, textarea, button {
+            width: 100%;
+            box-sizing: border-box;
+            background: #222;
+            color: white;
+            border: 1px solid #555;
+            padding: 8px;
+            border-radius: 4px;
+            font-family: inherit;
+        }
+        textarea { resize: vertical; min-height: 60px; }
+        button {
+            background: #1e88e5;
+            font-weight: bold;
+            cursor: pointer;
+            transition: background 0.2s;
+            border: none;
+        }
+        button:hover { background: #1565c0; }
+        button:disabled { background: #555; cursor: not-allowed; }
+        .image-preview {
+            width: 100%;
+            height: 180px;
+            background: #111;
+            border-radius: 4px;
+            object-fit: contain;
+            display: none;
+        }
+        #output-text {
+            min-height: 150px;
+            max-height: 300px;
+            overflow-y: auto;
+            white-space: pre-wrap;
+            font-family: monospace;
+            font-size: 13px;
+            color: #aed581;
+            background: #111;
+            padding: 10px;
+            border-radius: 4px;
+        }
+        .label { font-size: 12px; color: #aaa; margin-bottom: 2px; }
+        .control-group { display: flex; flex-direction: column; }
+    </style>
+</head>
+<body>
+    <div id="topbar">
+        <a href="https://huggingface.co/spaces/prithivMLmods/Multimodal-Edge-Comparator" target="_blank">Multimodal-Edge-Comparator UI</a>
+        <span>Qwen 3.5 - 2B Backend</span>
+    </div>
+    <svg id="wire-canvas">
+        <path id="wire1" class="wire-path" d="" />
+        <path id="wire2" class="wire-path" d="" />
+    </svg>
+    <!-- NODE 1: INPUT -->
+    <div class="node" id="node-input" style="top: 150px; left: 100px; width: 280px;">
+        <div class="node-header" style="background-color: var(--title-input);">1. Image Input Node</div>
+        <div class="node-content">
+            <div class="control-group">
+                <span class="label">Upload Image</span>
+                <input type="file" id="file-input" accept="image/*">
+            </div>
+            <img id="img-preview" class="image-preview">
+        </div>
+        <div class="port port-out" id="port-input-out"></div>
+    </div>
+    <!-- NODE 2: TASK & PROMPT -->
+    <div class="node" id="node-task" style="top: 150px; left: 450px; width: 300px;">
+        <div class="port port-in" id="port-task-in"></div>
+        <div class="node-header" style="background-color: var(--title-task);">2. Processing Node</div>
+        <div class="node-content">
+            <div class="control-group">
+                <span class="label">Task Category</span>
+                <select id="category-select">
+                    <option value="Query">Query</option>
+                    <option value="Caption">Caption</option>
+                    <option value="Point">Point</option>
+                    <option value="Detect">Detect</option>
+                </select>
+            </div>
+            <div class="control-group">
+                <span class="label">Prompt Details</span>
+                <textarea id="prompt-input" placeholder="e.g., Count the total number of boats and describe the environment."></textarea>
+            </div>
+            <button id="run-btn">Queue Run</button>
+        </div>
+        <div class="port port-out" id="port-task-out"></div>
+    </div>
+    <!-- NODE 3: OUTPUT -->
+    <div class="node" id="node-output" style="top: 150px; left: 820px; width: 350px;">
+        <div class="port port-in" id="port-output-in"></div>
+        <div class="node-header" style="background-color: var(--title-output);">3. Text Output Node</div>
+        <div class="node-content">
+            <div class="control-group">
+                <span class="label">Streamed Results</span>
+                <div id="output-text">Awaiting execution...</div>
+            </div>
+        </div>
+    </div>
+    <script>
+        // --- 1. Draggable Nodes Logic ---
+        let draggedNode = null;
+        let offsetX = 0, offsetY = 0;
+        document.querySelectorAll('.node-header').forEach(header => {
+            header.addEventListener('mousedown', (e) => {
+                draggedNode = e.target.closest('.node');
+                const rect = draggedNode.getBoundingClientRect();
+                offsetX = e.clientX - rect.left;
+                offsetY = e.clientY - rect.top;
+                draggedNode.style.zIndex = 100; // bring to front
+            });
+        });
+        document.addEventListener('mousemove', (e) => {
+            if (!draggedNode) return;
+            const x = e.clientX - offsetX;
+            const y = e.clientY - offsetY;
+            draggedNode.style.left = `${x}px`;
+            draggedNode.style.top = `${y}px`;
+            updateWires();
+        });
+        document.addEventListener('mouseup', () => {
+            if (draggedNode) {
+                draggedNode.style.zIndex = 10;
+                draggedNode = null;
+            }
+        });
+        // --- 2. Wire Connection Logic ---
+        function getPortCenter(portId) {
+            const el = document.getElementById(portId);
+            const rect = el.getBoundingClientRect();
+            return {
+                x: rect.left + rect.width / 2,
+                y: rect.top + rect.height / 2
+            };
+        }
+        function drawCurve(x1, y1, x2, y2) {
+            // Cubic bezier curve for a "ComfyUI wire" look
+            const cx = (x1 + x2) / 2;
+            return `M ${x1} ${y1} C ${cx} ${y1}, ${cx} ${y2}, ${x2} ${y2}`;
+        }
+        function updateWires() {
+            // Wire 1: Input to Task
+            const p1 = getPortCenter('port-input-out');
+            const p2 = getPortCenter('port-task-in');
+            document.getElementById('wire1').setAttribute('d', drawCurve(p1.x, p1.y, p2.x, p2.y));
+            // Wire 2: Task to Output
+            const p3 = getPortCenter('port-task-out');
+            const p4 = getPortCenter('port-output-in');
+            document.getElementById('wire2').setAttribute('d', drawCurve(p3.x, p3.y, p4.x, p4.y));
+        }
+        // Initialize wires on load
+        window.addEventListener('resize', updateWires);
+        updateWires();
+        // --- 3. App Logic (Placeholders & Previews) ---
+        const placeholders = {
+            "Query": "e.g., Count the total number of boats...",
+            "Caption": "e.g., short, normal, detailed",
+            "Point": "e.g., The gun held by the person.",
+            "Detect": "e.g., The headlight of the car."
+        };
+        const fileInput = document.getElementById('file-input');
+        const imgPreview = document.getElementById('img-preview');
+        const catSelect = document.getElementById('category-select');
+        const promptInput = document.getElementById('prompt-input');
+        const runBtn = document.getElementById('run-btn');
+        const outText = document.getElementById('output-text');
+        let currentFile = null;
+        catSelect.addEventListener('change', (e) => {
+            promptInput.placeholder = placeholders[e.target.value] || "";
+        });
+        fileInput.addEventListener('change', (e) => {
+            const file = e.target.files[0];
+            if (file) {
+                currentFile = file;
+                const url = URL.createObjectURL(file);
+                imgPreview.src = url;
+                imgPreview.style.display = 'block';
+                updateWires(); // Re-adjust lines since node height changed
+            }
+        });
+        // --- 4. Execution & Streaming Logic ---
+        runBtn.addEventListener('click', async () => {
+            if (!currentFile) return alert('Please upload an image first.');
+            if (!promptInput.value.trim()) return alert('Please enter a prompt.');
+            runBtn.disabled = true;
+            runBtn.innerText = "Running...";
+            outText.innerText = "Initializing connection to model...\n";
+            const formData = new FormData();
+            formData.append('image', currentFile);
+            formData.append('category', catSelect.value);
+            formData.append('prompt', promptInput.value);
+            try {
+                const response = await fetch('/api/run', {
+                    method: 'POST',
+                    body: formData
+                });
+                if (!response.ok) {
+                    const errText = await response.text();
+                    throw new Error(`Error: ${response.status} - ${errText}`);
+                }
+                outText.innerText = ""; // Clear loader
+                // Read the streaming response chunks
+                const reader = response.body.getReader();
+                const decoder = new TextDecoder("utf-8");
+                let done = false;
+                while (!done) {
+                    const { value, done: readerDone } = await reader.read();
+                    done = readerDone;
+                    if (value) {
+                        const chunk = decoder.decode(value, { stream: true });
+                        outText.innerText += chunk;
+                        // Auto-scroll to bottom
+                        outText.scrollTop = outText.scrollHeight;
+                    }
+                }
+            } catch (err) {
+                outText.innerText += `\n\n[Execution Failed]\n${err.message}`;
+            } finally {
+                runBtn.disabled = false;
+                runBtn.innerText = "Queue Run";
+            }
+        });
+    </script>
+</body>
+</html>
+"""
+app.launch()