Spaces:

prithivMLmods
/

Multimodal-Edge-Node

Running on Zero

App Files Community

prithivMLmods committed 25 days ago

Commit

3ce64c9

verified ·

1 Parent(s): d750f5a

update app

Browse files

Files changed (1) hide show

app.py +142 -82

app.py CHANGED Viewed

@@ -33,23 +33,37 @@ DTYPE = (
     else torch.float16
 )
-QWEN_MODEL_NAME     = "Qwen/Qwen3.5-2B"
 QWEN_VL_MODEL_NAME  = "Qwen/Qwen3-VL-2B-Instruct"
 LFM_450_MODEL_NAME  = "LiquidAI/LFM2.5-VL-450M"
 LFM_16_MODEL_NAME   = "LiquidAI/LFM2.5-VL-1.6B"
 # ── Qwen3.5-2B ──────────────────────────────────────────
-print(f"Loading Qwen3.5 model: {QWEN_MODEL_NAME} on {DEVICE}...")
 try:
-    qwen_model = Qwen3_5ForConditionalGeneration.from_pretrained(
-        QWEN_MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
     ).eval()
-    qwen_processor = AutoProcessor.from_pretrained(QWEN_MODEL_NAME)
-    print("Qwen3.5 model loaded successfully.")
 except Exception as e:
-    print(f"Warning: Qwen3.5 model loading failed. Error: {e}")
-    qwen_model = None
-    qwen_processor = None
 # ── Qwen3-VL-2B-Instruct ────────────────────────────────
 print(f"Loading Qwen3-VL model: {QWEN_VL_MODEL_NAME} on {DEVICE}...")
@@ -118,7 +132,7 @@ def safe_parse_json(text: str):
 # --- Inference Generator (Streaming) ---
 @spaces.GPU(duration=120)
 def generate_inference_stream(
-    image: Image.Image, category: str, prompt: str, model_id: str = "qwen"
 ):
     if category == "Query":
         full_prompt = prompt
@@ -131,8 +145,72 @@ def generate_inference_stream(
     else:
         full_prompt = prompt
     # ── Qwen3-VL ────────────────────────────────────────
-    if model_id == "qwen_vl":
         if qwen_vl_model is None or qwen_vl_processor is None:
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"
@@ -221,38 +299,6 @@ def generate_inference_stream(
                 yield f"data: {json.dumps({'chunk': tok})}\n\n"
         thread.join()
-    # ── Qwen3.5-2B (default) ────────────────────────────
-    else:
-        if qwen_model is None or qwen_processor is None:
-            yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5 model not loaded.'})}\n\n"
-            yield "data: [DONE]\n\n"
-            return
-        messages = [{"role": "user", "content": [
-            {"type": "image", "image": image},
-            {"type": "text",  "text":  full_prompt},
-        ]}]
-        text_input = qwen_processor.apply_chat_template(
-            messages, tokenize=False, add_generation_prompt=True
-        )
-        inputs = qwen_processor(
-            text=[text_input], images=[image], return_tensors="pt", padding=True
-        ).to(qwen_model.device)
-        streamer = TextIteratorStreamer(
-            qwen_processor.tokenizer,
-            skip_prompt=True, skip_special_tokens=True, timeout=120,
-        )
-        thread = threading.Thread(
-            target=qwen_model.generate,
-            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
-                        use_cache=True, temperature=1.5, min_p=0.1),
-        )
-        thread.start()
-        for tok in streamer:
-            if tok:
-                yield f"data: {json.dumps({'chunk': tok})}\n\n"
-        thread.join()
     yield "data: [DONE]\n\n"
@@ -262,7 +308,7 @@ async def run_inference(
     image:    UploadFile = File(...),
     category: str        = Form(...),
     prompt:   str        = Form(...),
-    model_id: str        = Form("qwen"),
 ):
     try:
         img_bytes = await image.read()
@@ -348,9 +394,9 @@ async def homepage(request: Request):
         /* ── Canvas ── */
         #canvas {
             position: relative;
-            width: 1340px;
             min-height: calc(100vh - 42px);
-            height: 880px;
             margin: 0 auto;
         }
@@ -378,7 +424,7 @@ async def homepage(request: Request):
             background: var(--node-bg);
             border: 1px solid var(--node-border);
             border-radius: 9px;
-            box-shadow: 0 8px 28px rgba(0,0,0,0.5), 0 0 0 0px rgba(124,106,247,0);
             z-index: 10;
             display: flex; flex-direction: column;
             transition: box-shadow 0.2s;
@@ -386,7 +432,6 @@ async def homepage(request: Request):
         .node:hover {
             box-shadow: 0 8px 28px rgba(0,0,0,0.5), 0 0 0 1px rgba(124,106,247,0.3);
         }
-        /* ── reduced to 330px ── */
         .node.fixed-height { height: 330px; }
         .node-header {
@@ -474,7 +519,7 @@ async def homepage(request: Request):
             cursor: pointer;
             transition: opacity 0.2s, transform 0.1s;
             display: flex; justify-content: center; align-items: center; gap: 8px;
-            letter-spacing: 0.04em;
         }
         button.run-btn:hover   { opacity: 0.9; }
         button.run-btn:active  { transform: scale(0.98); }
@@ -526,15 +571,17 @@ async def homepage(request: Request):
             box-shadow: 0 0 5px var(--accent2);
         }
         .model-badge {
             display: inline-block; padding: 2px 7px;
             border-radius: 4px; font-size: 9px; font-weight: 700;
             letter-spacing: 0.06em; text-transform: uppercase;
         }
-        .model-badge.qwen    { background: rgba(124,106,247,0.2); color: var(--accent);  border: 1px solid rgba(124,106,247,0.3); }
-        .model-badge.qwen-vl { background: rgba(255,150,50,0.15);  color: #ff9632;        border: 1px solid rgba(255,150,50,0.35); }
-        .model-badge.lfm450  { background: rgba(78,205,196,0.15);  color: var(--accent2); border: 1px solid rgba(78,205,196,0.3); }
-        .model-badge.lfm16   { background: rgba(107,203,119,0.15); color: #6bcb77;        border: 1px solid rgba(107,203,119,0.35); }
         .model-info-box {
             border-radius: 6px; padding: 9px;
@@ -551,7 +598,7 @@ async def homepage(request: Request):
     <span class="logo">MULTIMODAL EDGE</span>
     <span class="sep">|</span>
     <span class="sub">Node-Based Inference Canvas</span>
-    <span class="badge">v2.2 — QUAD MODEL</span>
 </div>
 <div id="canvas">
@@ -598,17 +645,18 @@ async def homepage(request: Request):
             <div>
                 <label>Active Model</label>
                 <select id="modelSelect">
-                    <option value="qwen">Qwen3.5-2B</option>
                     <option value="qwen_vl">Qwen3-VL-2B-Instruct</option>
                     <option value="lfm_450">LFM2.5-VL-450M (LiquidAI)</option>
                     <option value="lfm_16">LFM2.5-VL-1.6B (LiquidAI)</option>
                 </select>
             </div>
             <div id="modelInfoBox" class="model-info-box"
-                 style="background:rgba(124,106,247,0.07);border:1px solid rgba(124,106,247,0.2);">
-                <span class="model-badge qwen">QWEN 3.5</span><br><br>
-                Qwen3.5 2B multimodal model by Alibaba Cloud.
-                Supports Query, Caption, Point &amp; Detect with streaming output.
             </div>
             <div style="flex:1;"></div>
         </div>
@@ -779,38 +827,50 @@ const dotModel     = document.getElementById('dot-model');
 dotModel.classList.add('active');
 const MODEL_INFO = {
-    qwen: {
-        html: `<span class="model-badge qwen">QWEN 3.5</span><br><br>
-               Qwen3.5 2B multimodal model by Alibaba Cloud.
-               Supports Query, Caption, Point &amp; Detect with streaming output.`,
-        bg:  'rgba(124,106,247,0.07)', border: 'rgba(124,106,247,0.2)'
     },
     qwen_vl: {
-        html: `<span class="model-badge qwen-vl">QWEN3-VL</span><br><br>
-               Qwen3-VL-2B-Instruct — dedicated vision-language model by Alibaba Cloud.
-               Strong spatial grounding, OCR &amp; instruction-following.`,
-        bg:  'rgba(255,150,50,0.07)', border: 'rgba(255,150,50,0.25)'
     },
     lfm_450: {
-        html: `<span class="model-badge lfm450">LFM 450M</span><br><br>
-               LFM2.5-VL 450M by LiquidAI. Ultra-lightweight edge model
-               with strong grounding capabilities.`,
-        bg:  'rgba(78,205,196,0.07)', border: 'rgba(78,205,196,0.2)'
     },
     lfm_16: {
-        html: `<span class="model-badge lfm16">LFM 1.6B</span><br><br>
-               LFM2.5-VL 1.6B by LiquidAI. Larger liquid-state model offering
-               enhanced reasoning and richer visual understanding.`,
-        bg:  'rgba(107,203,119,0.07)', border: 'rgba(107,203,119,0.25)'
     },
 };
 modelSelect.onchange = () => {
     const info = MODEL_INFO[modelSelect.value];
     if (!info) return;
-    modelInfoBox.innerHTML         = info.html;
-    modelInfoBox.style.background  = info.bg;
-    modelInfoBox.style.borderColor = info.border;
 };
 // ══════════════════════════════════════════════
@@ -897,7 +957,7 @@ function drawGrounding(imgSrc, jsonText) {
             // ── Bounding box ──
             let bbox = null;
-            if (item?.bbox_2d?.length === 4) bbox = item.bbox_2d;
             else if (item?.bbox?.length === 4) bbox = item.bbox;
             else if (Array.isArray(item) && item.length === 4 &&
                      item.every(n => typeof n === 'number')) bbox = item;
@@ -907,7 +967,7 @@ function drawGrounding(imgSrc, jsonText) {
                 if (x1 <= 1 && y1 <= 1 && x2 <= 1 && y2 <= 1) {
                     x1*=W; y1*=H; x2*=W; y2*=H;
                 }
-                const bw = x2-x1, bh = y2-y1;
                 const lbl = item?.label || `${i+1}`;
                 gCtx.fillStyle   = hexToRgba(col, 0.18);
@@ -927,7 +987,7 @@ function drawGrounding(imgSrc, jsonText) {
             // ── Point ──
             let pt = null;
-            if (item?.point_2d?.length === 2) pt = item.point_2d;
             else if (item?.point?.length === 2) pt = item.point;
             else if (Array.isArray(item) && item.length === 2 &&
                      item.every(n => typeof n === 'number')) pt = item;

     else torch.float16
 )
+QWEN_4B_MODEL_NAME  = "Qwen/Qwen3.5-4B"
+QWEN_2B_MODEL_NAME  = "Qwen/Qwen3.5-2B"
 QWEN_VL_MODEL_NAME  = "Qwen/Qwen3-VL-2B-Instruct"
 LFM_450_MODEL_NAME  = "LiquidAI/LFM2.5-VL-450M"
 LFM_16_MODEL_NAME   = "LiquidAI/LFM2.5-VL-1.6B"
+# ── Qwen3.5-4B ──────────────────────────────────────────
+print(f"Loading Qwen3.5-4B model: {QWEN_4B_MODEL_NAME} on {DEVICE}...")
+try:
+    qwen_4b_model = Qwen3_5ForConditionalGeneration.from_pretrained(
+        QWEN_4B_MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
+    ).eval()
+    qwen_4b_processor = AutoProcessor.from_pretrained(QWEN_4B_MODEL_NAME)
+    print("Qwen3.5-4B model loaded successfully.")
+except Exception as e:
+    print(f"Warning: Qwen3.5-4B model loading failed. Error: {e}")
+    qwen_4b_model = None
+    qwen_4b_processor = None
 # ── Qwen3.5-2B ──────────────────────────────────────────
+print(f"Loading Qwen3.5-2B model: {QWEN_2B_MODEL_NAME} on {DEVICE}...")
 try:
+    qwen_2b_model = Qwen3_5ForConditionalGeneration.from_pretrained(
+        QWEN_2B_MODEL_NAME, torch_dtype=DTYPE, device_map=DEVICE,
     ).eval()
+    qwen_2b_processor = AutoProcessor.from_pretrained(QWEN_2B_MODEL_NAME)
+    print("Qwen3.5-2B model loaded successfully.")
 except Exception as e:
+    print(f"Warning: Qwen3.5-2B model loading failed. Error: {e}")
+    qwen_2b_model = None
+    qwen_2b_processor = None
 # ── Qwen3-VL-2B-Instruct ────────────────────────────────
 print(f"Loading Qwen3-VL model: {QWEN_VL_MODEL_NAME} on {DEVICE}...")
 # --- Inference Generator (Streaming) ---
 @spaces.GPU(duration=120)
 def generate_inference_stream(
+    image: Image.Image, category: str, prompt: str, model_id: str = "qwen_4b"
 ):
     if category == "Query":
         full_prompt = prompt
     else:
         full_prompt = prompt
+    # ── Qwen3.5-4B ──────────────────────────────────────
+    if model_id == "qwen_4b":
+        if qwen_4b_model is None or qwen_4b_processor is None:
+            yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-4B model not loaded.'})}\n\n"
+            yield "data: [DONE]\n\n"
+            return
+        messages = [{"role": "user", "content": [
+            {"type": "image", "image": image},
+            {"type": "text",  "text":  full_prompt},
+        ]}]
+        text_input = qwen_4b_processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = qwen_4b_processor(
+            text=[text_input], images=[image], return_tensors="pt", padding=True
+        ).to(qwen_4b_model.device)
+        streamer = TextIteratorStreamer(
+            qwen_4b_processor.tokenizer,
+            skip_prompt=True, skip_special_tokens=True, timeout=120,
+        )
+        thread = threading.Thread(
+            target=qwen_4b_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
+                        use_cache=True, temperature=1.5, min_p=0.1),
+        )
+        thread.start()
+        for tok in streamer:
+            if tok:
+                yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
+    # ── Qwen3.5-2B ──────────────────────────────────────
+    elif model_id == "qwen_2b":
+        if qwen_2b_model is None or qwen_2b_processor is None:
+            yield f"data: {json.dumps({'chunk': '[Error] Qwen3.5-2B model not loaded.'})}\n\n"
+            yield "data: [DONE]\n\n"
+            return
+        messages = [{"role": "user", "content": [
+            {"type": "image", "image": image},
+            {"type": "text",  "text":  full_prompt},
+        ]}]
+        text_input = qwen_2b_processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = qwen_2b_processor(
+            text=[text_input], images=[image], return_tensors="pt", padding=True
+        ).to(qwen_2b_model.device)
+        streamer = TextIteratorStreamer(
+            qwen_2b_processor.tokenizer,
+            skip_prompt=True, skip_special_tokens=True, timeout=120,
+        )
+        thread = threading.Thread(
+            target=qwen_2b_model.generate,
+            kwargs=dict(**inputs, streamer=streamer, max_new_tokens=1024,
+                        use_cache=True, temperature=1.5, min_p=0.1),
+        )
+        thread.start()
+        for tok in streamer:
+            if tok:
+                yield f"data: {json.dumps({'chunk': tok})}\n\n"
+        thread.join()
     # ── Qwen3-VL ────────────────────────────────────────
+    elif model_id == "qwen_vl":
         if qwen_vl_model is None or qwen_vl_processor is None:
             yield f"data: {json.dumps({'chunk': '[Error] Qwen3-VL model not loaded.'})}\n\n"
             yield "data: [DONE]\n\n"
                 yield f"data: {json.dumps({'chunk': tok})}\n\n"
         thread.join()
     yield "data: [DONE]\n\n"
     image:    UploadFile = File(...),
     category: str        = Form(...),
     prompt:   str        = Form(...),
+    model_id: str        = Form("qwen_4b"),
 ):
     try:
         img_bytes = await image.read()
         /* ── Canvas ── */
         #canvas {
             position: relative;
+            width: 1360px;
             min-height: calc(100vh - 42px);
+            height: 900px;
             margin: 0 auto;
         }
             background: var(--node-bg);
             border: 1px solid var(--node-border);
             border-radius: 9px;
+            box-shadow: 0 8px 28px rgba(0,0,0,0.5);
             z-index: 10;
             display: flex; flex-direction: column;
             transition: box-shadow 0.2s;
         .node:hover {
             box-shadow: 0 8px 28px rgba(0,0,0,0.5), 0 0 0 1px rgba(124,106,247,0.3);
         }
         .node.fixed-height { height: 330px; }
         .node-header {
             cursor: pointer;
             transition: opacity 0.2s, transform 0.1s;
             display: flex; justify-content: center; align-items: center; gap: 8px;
+            letter-spacing: 0.04em; flex-shrink: 0;
         }
         button.run-btn:hover   { opacity: 0.9; }
         button.run-btn:active  { transform: scale(0.98); }
             box-shadow: 0 0 5px var(--accent2);
         }
+        /* ── Model badges ── */
         .model-badge {
             display: inline-block; padding: 2px 7px;
             border-radius: 4px; font-size: 9px; font-weight: 700;
             letter-spacing: 0.06em; text-transform: uppercase;
         }
+        .model-badge.q4b    { background: rgba(255,200,80,0.15);  color: #ffc850;        border: 1px solid rgba(255,200,80,0.35); }
+        .model-badge.q2b    { background: rgba(124,106,247,0.2);  color: var(--accent);  border: 1px solid rgba(124,106,247,0.3); }
+        .model-badge.qvl    { background: rgba(255,150,50,0.15);  color: #ff9632;        border: 1px solid rgba(255,150,50,0.35); }
+        .model-badge.lfm450 { background: rgba(78,205,196,0.15);  color: var(--accent2); border: 1px solid rgba(78,205,196,0.3); }
+        .model-badge.lfm16  { background: rgba(107,203,119,0.15); color: #6bcb77;        border: 1px solid rgba(107,203,119,0.35); }
         .model-info-box {
             border-radius: 6px; padding: 9px;
     <span class="logo">MULTIMODAL EDGE</span>
     <span class="sep">|</span>
     <span class="sub">Node-Based Inference Canvas</span>
+    <span class="badge">v2.3 — PENTA MODEL</span>
 </div>
 <div id="canvas">
             <div>
                 <label>Active Model</label>
                 <select id="modelSelect">
+                    <option value="qwen_4b">Qwen3.5-4B</option>
+                    <option value="qwen_2b">Qwen3.5-2B</option>
                     <option value="qwen_vl">Qwen3-VL-2B-Instruct</option>
                     <option value="lfm_450">LFM2.5-VL-450M (LiquidAI)</option>
                     <option value="lfm_16">LFM2.5-VL-1.6B (LiquidAI)</option>
                 </select>
             </div>
             <div id="modelInfoBox" class="model-info-box"
+                 style="background:rgba(255,200,80,0.07);border:1px solid rgba(255,200,80,0.3);">
+                <span class="model-badge q4b">QWEN 3.5 · 4B</span><br><br>
+                Qwen3.5 4B multimodal model by Alibaba Cloud.
+                Enhanced capacity over 2B — richer reasoning, better instruction following.
             </div>
             <div style="flex:1;"></div>
         </div>
 dotModel.classList.add('active');
 const MODEL_INFO = {
+    qwen_4b: {
+        html:   `<span class="model-badge q4b">QWEN 3.5 · 4B</span><br><br>
+                 Qwen3.5 4B multimodal model by Alibaba Cloud.
+                 Enhanced capacity over 2B — richer reasoning &amp; better instruction following.`,
+        bg:     'rgba(255,200,80,0.07)',
+        border: 'rgba(255,200,80,0.30)',
+    },
+    qwen_2b: {
+        html:   `<span class="model-badge q2b">QWEN 3.5 · 2B</span><br><br>
+                 Qwen3.5 2B multimodal model by Alibaba Cloud.
+                 Lightweight &amp; fast — ideal for quick Query, Caption, Point &amp; Detect tasks.`,
+        bg:     'rgba(124,106,247,0.07)',
+        border: 'rgba(124,106,247,0.25)',
     },
     qwen_vl: {
+        html:   `<span class="model-badge qvl">QWEN3-VL · 2B</span><br><br>
+                 Qwen3-VL-2B-Instruct — dedicated vision-language model by Alibaba Cloud.
+                 Strong spatial grounding, OCR &amp; instruction-following.`,
+        bg:     'rgba(255,150,50,0.07)',
+        border: 'rgba(255,150,50,0.25)',
     },
     lfm_450: {
+        html:   `<span class="model-badge lfm450">LFM · 450M</span><br><br>
+                 LFM2.5-VL 450M by LiquidAI. Ultra-lightweight edge model
+                 with solid grounding capabilities.`,
+        bg:     'rgba(78,205,196,0.07)',
+        border: 'rgba(78,205,196,0.25)',
     },
     lfm_16: {
+        html:   `<span class="model-badge lfm16">LFM · 1.6B</span><br><br>
+                 LFM2.5-VL 1.6B by LiquidAI. Larger liquid-state model offering
+                 enhanced reasoning &amp; richer visual understanding.`,
+        bg:     'rgba(107,203,119,0.07)',
+        border: 'rgba(107,203,119,0.25)',
     },
 };
 modelSelect.onchange = () => {
     const info = MODEL_INFO[modelSelect.value];
     if (!info) return;
+    modelInfoBox.innerHTML              = info.html;
+    modelInfoBox.style.background       = info.bg;
+    modelInfoBox.style.borderColor      = info.border;
+    modelInfoBox.style.border           = `1px solid ${info.border}`;
 };
 // ══════════════════════════════════════════════
             // ── Bounding box ──
             let bbox = null;
+            if (item?.bbox_2d?.length === 4)  bbox = item.bbox_2d;
             else if (item?.bbox?.length === 4) bbox = item.bbox;
             else if (Array.isArray(item) && item.length === 4 &&
                      item.every(n => typeof n === 'number')) bbox = item;
                 if (x1 <= 1 && y1 <= 1 && x2 <= 1 && y2 <= 1) {
                     x1*=W; y1*=H; x2*=W; y2*=H;
                 }
+                const bw  = x2-x1, bh = y2-y1;
                 const lbl = item?.label || `${i+1}`;
                 gCtx.fillStyle   = hexToRgba(col, 0.18);
             // ── Point ──
             let pt = null;
+            if (item?.point_2d?.length === 2)  pt = item.point_2d;
             else if (item?.point?.length === 2) pt = item.point;
             else if (Array.isArray(item) && item.length === 2 &&
                      item.every(n => typeof n === 'number')) pt = item;