Update app.py
app.py
CHANGED
```diff
@@ -1,15 +1,18 @@
-
-import json
 import os
+import json
 import re
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, Tuple
 
+import torch
 import gradio as gr
 import spaces
-import torch
-from PIL import Image, ImageDraw, ImageFilter, ImageFont, ImageOps
-from qwen_vl_utils import process_vision_info
-from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
+
+from PIL import Image, ImageOps
+
+# Qwen3-VL requires the latest Transformers from source.
+# In your Space requirements, use:
+# pip install git+https://github.com/huggingface/transformers
+from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
 
 # ---------------------------
 # Environment / cache setup
@@ -29,18 +32,8 @@ torch.set_float32_matmul_precision("high")
 
 HF_TOKEN = os.environ.get("HF_TOKEN", "")
 
-#
-
-MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"
-
-# Visual token budget: high enough for label reading, but not absurd for ZeroGPU.
-# Official docs show min_pixels/max_pixels as the supported way to control resolution.
-MIN_PIXELS = 256 * 28 * 28
-MAX_PIXELS = 2048 * 28 * 28
-
-# Image prep knobs.
-FULL_LONG_SIDE = 2200
-TILE_LONG_SIDE = 1600
+# Qwen3-VL upgrade path
+MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
 
 processor = None
 model = None
@@ -55,136 +48,25 @@ def load_model() -> None:
     processor = AutoProcessor.from_pretrained(
         MODEL_ID,
         token=HF_TOKEN if HF_TOKEN else None,
-        min_pixels=MIN_PIXELS,
-        max_pixels=MAX_PIXELS,
     )
 
     print("Loading model...")
-    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+    model = Qwen3VLForConditionalGeneration.from_pretrained(
         MODEL_ID,
         token=HF_TOKEN if HF_TOKEN else None,
         device_map="auto",
-        torch_dtype=
+        torch_dtype=torch.bfloat16,
         low_cpu_mem_usage=True,
     )
 
+    print("Setting eval mode...")
     model.eval()
     print("Model ready")
 
 
-def _resize_long_side(image: Image.Image, target_long_side: int) -> Image.Image:
-    """Cap the image's long side at target_long_side."""
-    long_side = max(image.size)
-    if long_side <= target_long_side:
-        return image
-
-    scale = target_long_side / long_side
-    new_size = (
-        max(1, int(round(image.width * scale))),
-        max(1, int(round(image.height * scale))),
-    )
-    return image.resize(new_size, Image.Resampling.LANCZOS)
-
-
-def prepare_image(image: Image.Image, target_long_side: int = FULL_LONG_SIDE) -> Image.Image:
-    """Upscale/sharpen for tiny pantry text and ingredient panels."""
-    image = ImageOps.exif_transpose(image).convert("RGB")
-    image = _resize_long_side(image, target_long_side)
-    image = ImageOps.autocontrast(image)
-    image = image.filter(ImageFilter.SHARPEN)
-    image = image.filter(ImageFilter.DETAIL)
-    return image
-
-
-def crop_with_padding(
-    image: Image.Image,
-    box: Tuple[int, int, int, int],
-    pad_frac: float = 0.06,
-    target_long_side: int = TILE_LONG_SIDE,
-) -> Image.Image:
-    """Crop a region with some padding, then upscale it for OCR."""
-    w, h = image.size
-    x0, y0, x1, y1 = box
-    pad_x = int(round((x1 - x0) * pad_frac))
-    pad_y = int(round((y1 - y0) * pad_frac))
-
-    x0 = max(0, x0 - pad_x)
-    y0 = max(0, y0 - pad_y)
-    x1 = min(w, x1 + pad_x)
-    y1 = min(h, y1 + pad_y)
-
-    crop = image.crop((x0, y0, x1, y1))
-    crop = prepare_image(crop, target_long_side=target_long_side)
-    return crop
-
-
-def build_panels(image: Image.Image) -> List[Tuple[str, Image.Image]]:
-    """Create a small set of zoom panels to help the VLM read tiny labels."""
-    image = prepare_image(image, target_long_side=FULL_LONG_SIDE)
-    w, h = image.size
-    panels: List[Tuple[str, Image.Image]] = [("full", image)]
-
-    # For larger pantry shots, quadrants usually capture labels better than one huge scene.
-    if max(w, h) >= 1200:
-        mid_x = w // 2
-        mid_y = h // 2
-        overlap_x = int(round(w * 0.10))
-        overlap_y = int(round(h * 0.10))
-
-        quads = {
-            "top_left": (0, 0, mid_x + overlap_x, mid_y + overlap_y),
-            "top_right": (mid_x - overlap_x, 0, w, mid_y + overlap_y),
-            "bottom_left": (0, mid_y - overlap_y, mid_x + overlap_x, h),
-            "bottom_right": (mid_x - overlap_x, mid_y - overlap_y, w, h),
-        }
-        for label, box in quads.items():
-            panels.append((label, crop_with_padding(image, box, pad_frac=0.05)))
-    else:
-        # For smaller images, a centered zoom is often more useful than tiling.
-        cx0 = int(w * 0.15)
-        cy0 = int(h * 0.15)
-        cx1 = int(w * 0.85)
-        cy1 = int(h * 0.85)
-        if cx1 > cx0 and cy1 > cy0:
-            panels.append(("center_zoom", crop_with_padding(image, (cx0, cy0, cx1, cy1), pad_frac=0.03)))
-
-    return panels[:5]
-
-
-def make_contact_sheet(panels: List[Tuple[str, Image.Image]]) -> Image.Image:
-    """Build a single preview image so the user can see what the model saw."""
-    cols = 2
-    tile_w = 720
-    tile_h = 520
-    gap = 16
-    label_h = 28
-
-    rows = (len(panels) + cols - 1) // cols
-    sheet_w = cols * tile_w + (cols + 1) * gap
-    sheet_h = rows * (tile_h + label_h) + (rows + 1) * gap
-
-    canvas = Image.new("RGB", (sheet_w, sheet_h), (245, 245, 245))
-    draw = ImageDraw.Draw(canvas)
-    font = ImageFont.load_default()
-
-    for idx, (label, img) in enumerate(panels):
-        row = idx // cols
-        col = idx % cols
-        x = gap + col * (tile_w + gap)
-        y = gap + row * (tile_h + label_h + gap)
-
-        tile = ImageOps.contain(img, (tile_w, tile_h))
-        tile_bg = Image.new("RGB", (tile_w, tile_h), (255, 255, 255))
-        offset = ((tile_w - tile.width) // 2, (tile_h - tile.height) // 2)
-        tile_bg.paste(tile, offset)
-        canvas.paste(tile_bg, (x, y + label_h))
-
-        draw.rectangle([x, y, x + tile_w, y + label_h], fill=(230, 230, 230))
-        draw.text((x + 8, y + 6), label, fill=(20, 20, 20), font=font)
-
-        draw.rectangle([x, y + label_h, x + tile_w, y + label_h + tile_h], outline=(200, 200, 200), width=1)
-
-    return canvas
+def normalize_image(image: Image.Image) -> Image.Image:
+    """Keep the original image path simple: no cropping, no tiling, no enhancement."""
+    return ImageOps.exif_transpose(image).convert("RGB")
 
 
 def extract_json(text: str) -> Dict[str, Any]:
@@ -210,95 +92,62 @@ def extract_json(text: str) -> Dict[str, Any]:
     return {"raw_output": text}
 
 
-PROMPT = """
-    ...
-      "tiny_text_quality": "clear|partial|unreadable",
-      "confidence": 0.0,
-      "evidence_panels": ["full", "top_left", "top_right", "bottom_left", "bottom_right", "center_zoom"]
-    }
-  ],
-  "warnings": [""],
-  "notes": ""
-}
-""".strip()
-
-
-@spaces.GPU(size="large", duration=90)
-def analyze_pantry(image: Image.Image) -> Tuple[Optional[Image.Image], Dict[str, Any]]:
+PROMPT = (
+    "Inspect this single pantry image and return only JSON. "
+    "Identify the visible brand name, product name, ingredients, and any other clearly readable package text. "
+    "Do not guess tiny text you cannot read. "
+    "Use this schema: {"
+    '"brand": string|null, '
+    '"product_name": string|null, '
+    '"ingredients": [string], '
+    '"visible_text": [string], '
+    '"packaging_notes": string|null, '
+    '"confidence": {"brand": number, "product_name": number, "ingredients": number}, '
+    '"raw_ocr": [string]'
+    "}."
+)
+
+
+@spaces.GPU(size="large", duration=60)
+def analyze_pantry(image: Image.Image) -> Tuple[Image.Image, Dict[str, Any]]:
     if image is None:
         return None, {"error": "Upload an image first."}
 
     load_model()
 
-    panels = build_panels(image)
-    contact_sheet = make_contact_sheet(panels)
+    prepared = normalize_image(image)
 
-    # Qwen chat format: the model receives multiple images plus one instruction block.
     messages = [
         {
             "role": "system",
             "content": [
-                {
-                    "type": "text",
-                    "text": "You are a careful OCR and pantry-label extraction assistant. Return valid JSON only.",
-                }
+                {"type": "text", "text": "You are a precise visual OCR assistant. Return JSON only."}
             ],
         },
         {
            "role": "user",
            "content": [
-                {
-                    "type": "text",
-                    "text": (
-                        "Panel order: full, top_left, top_right, bottom_left, bottom_right, center_zoom. "
-                        f"{PROMPT}"
-                    ),
-                },
-                *[{"type": "image", "image": panel_img} for _, panel_img in panels],
+                {"type": "image", "image": prepared},
+                {"type": "text", "text": PROMPT},
            ],
        },
    ]
 
-    text = processor.apply_chat_template(
+    # Qwen3-VL official Transformers usage.
+    inputs = processor.apply_chat_template(
         messages,
-        tokenize=False,
+        tokenize=True,
         add_generation_prompt=True,
-    )
-
-    image_inputs, video_inputs = process_vision_info(messages)
-
-    inputs = processor(
-        text=[text],
-        images=image_inputs,
-        videos=video_inputs,
-        padding=True,
+        return_dict=True,
         return_tensors="pt",
     )
 
-    # Some model/processor versions include token_type_ids, some do not.
-    inputs.pop("token_type_ids", None)
     inputs = inputs.to(model.device)
 
     with torch.inference_mode():
         output_ids = model.generate(
             **inputs,
-            max_new_tokens=
+            max_new_tokens=512,
             do_sample=False,
         )
 
@@ -313,24 +162,13 @@ def analyze_pantry(image: Image.Image) -> Tuple[Optional[Image.Image], Dict[str, Any]]:
     if isinstance(parsed, dict) and "raw_output" not in parsed:
         parsed["_raw_output"] = generated_text
 
-    return contact_sheet, parsed
-
-
-# Simple helper tests for local sanity checks.
-def _self_test() -> None:
-    blank = Image.new("RGB", (900, 700), "white")
-    panels = build_panels(blank)
-    assert len(panels) >= 2
-    sheet = make_contact_sheet(panels)
-    assert sheet.size[0] > 0 and sheet.size[1] > 0
-    assert extract_json('{"a": 1}') == {"a": 1}
-    assert "raw_output" in extract_json("not json")
+    return prepared, parsed
 
 
 with gr.Blocks() as demo:
     gr.Markdown("# Pantry Scanner")
     gr.Markdown(
-        "
+        "Single-image Qwen3-VL OCR/brand reader. No tiling, no crop pipeline, no manual sharpening."
     )
 
     with gr.Row():
@@ -340,7 +178,7 @@ with gr.Blocks() as demo:
     analyze_btn = gr.Button("Analyze", variant="primary")
 
     with gr.Row():
-        prepared_output = gr.Image(type="pil", label="
+        prepared_output = gr.Image(type="pil", label="Feeding image")
         output_json = gr.JSON(label="Detected items")
 
     analyze_btn.click(
```