airzy1 commited on
Commit
d4de43b
·
verified ·
1 Parent(s): 537537a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -45
app.py CHANGED
@@ -1,11 +1,22 @@
1
  import os
2
  import json
3
  import re
 
4
 
 
 
 
 
 
 
 
 
 
 
5
  os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
6
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:64"
7
 
8
- # Writable cache
9
  os.environ["HF_HOME"] = "/tmp/hf"
10
  os.environ["HF_HUB_CACHE"] = "/tmp/hf/hub"
11
  os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf/transformers"
@@ -13,65 +24,91 @@ os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf/transformers"
13
  os.makedirs("/tmp/hf/hub", exist_ok=True)
14
  os.makedirs("/tmp/hf/transformers", exist_ok=True)
15
 
16
- import spaces
17
- import torch
18
- import gradio as gr
19
- from PIL import Image
20
- from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
21
 
22
  HF_TOKEN = os.environ.get("HF_TOKEN", "")
23
 
24
- # SMALLER MODEL
25
- MODEL_ID = "Qwen/Qwen2.5-VL-1.8B-Instruct"
 
 
 
 
26
 
27
  processor = None
28
  model = None
29
 
30
 
31
- def load_model():
32
  global processor, model
33
- if model is not None:
34
  return
35
 
36
  print("Loading processor...")
37
  processor = AutoProcessor.from_pretrained(
38
  MODEL_ID,
39
  token=HF_TOKEN if HF_TOKEN else None,
40
- min_pixels=256 * 28 * 28,
41
- max_pixels=768 * 28 * 28, # 🔥 lower for memory safety
42
  )
43
 
44
- print("Loading model:", MODEL_ID)
45
  model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
46
  MODEL_ID,
47
  token=HF_TOKEN if HF_TOKEN else None,
48
  device_map="auto",
49
- torch_dtype=torch.float16, # 🔥 force lower memory
 
50
  )
51
 
52
  model.eval()
53
  print("Model ready")
54
 
55
 
56
- def extract_json(text: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  text = (text or "").strip()
58
 
 
 
 
 
59
  try:
60
  return json.loads(text)
61
- except:
62
  pass
63
 
 
64
  match = re.search(r"\{.*\}", text, flags=re.S)
65
  if match:
66
  try:
67
  return json.loads(match.group(0))
68
- except:
69
  pass
70
 
71
  return {"raw_output": text}
72
 
73
 
74
- PROMPT = """Analyze this pantry image.
75
 
76
  Return ONLY valid JSON with this schema:
77
  {
@@ -87,25 +124,23 @@ Return ONLY valid JSON with this schema:
87
  "uncertain_items": []
88
  }
89
 
90
- Focus on:
91
- - canned goods
92
- - labels
93
- - jars
94
- - boxes
95
- - spices
96
-
97
- Be precise. Do NOT hallucinate.
98
  """
99
 
100
 
101
- @spaces.GPU(size="large", duration=120) # 🔥 smaller GPU = faster queue
102
  def analyze_pantry(image: Image.Image):
103
  if image is None:
104
- return {"error": "Upload an image"}
105
 
106
  load_model()
107
 
108
- image = image.convert("RGB")
109
 
110
  messages = [
111
  {
@@ -127,50 +162,46 @@ def analyze_pantry(image: Image.Image):
127
  add_generation_prompt=True,
128
  )
129
 
 
 
130
  inputs = processor(
131
  text=[text],
132
- images=[image],
 
133
  padding=True,
134
  return_tensors="pt",
135
  )
136
 
137
- inputs = {k: v.to(model.device) for k, v in inputs.items()}
138
 
139
  with torch.inference_mode():
140
  output_ids = model.generate(
141
  **inputs,
142
- max_new_tokens=500, # 🔥 reduced
143
  do_sample=False,
144
  )
145
 
146
  prompt_len = inputs["input_ids"].shape[-1]
147
-
148
  generated_text = processor.batch_decode(
149
  [output_ids[0][prompt_len:]],
150
  skip_special_tokens=True,
151
  )[0].strip()
152
 
153
- print("OUTPUT:", generated_text)
154
-
155
  parsed = extract_json(generated_text)
156
-
157
  if isinstance(parsed, dict) and "raw_output" not in parsed:
158
  parsed["_raw_output"] = generated_text
159
 
160
  return parsed
161
 
162
 
163
- @spaces.GPU(size="small", duration=1)
164
- def cloud():
165
- return None
166
-
167
-
168
  with gr.Blocks() as demo:
169
- gr.Markdown("# Pantry Analyzer (ZeroGPU Optimized)")
170
 
171
- image_input = gr.Image(type="pil")
172
- analyze_btn = gr.Button("Analyze")
173
- output_json = gr.JSON()
 
 
174
 
175
  analyze_btn.click(analyze_pantry, inputs=image_input, outputs=output_json)
176
 
 
1
import os
import json
import re
from typing import Any, Dict

# ---------------------------
# Environment / cache setup
# ---------------------------
# NOTE: these must be exported BEFORE torch / transformers are imported.
# Both libraries resolve CUDA- and cache-related environment variables at
# import time, so setting them after the imports silently has no effect.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True,max_split_size_mb:64"

# Writable cache for Spaces (the default HF cache location is read-only there).
os.environ["HF_HOME"] = "/tmp/hf"
os.environ["HF_HUB_CACHE"] = "/tmp/hf/hub"
os.environ["TRANSFORMERS_CACHE"] = "/tmp/hf/transformers"

os.makedirs("/tmp/hf/hub", exist_ok=True)
os.makedirs("/tmp/hf/transformers", exist_ok=True)

import torch
import gradio as gr
import spaces
from PIL import Image, ImageFilter, ImageOps
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
from qwen_vl_utils import process_vision_info

# Allow TF32 matmuls on Ampere+ GPUs; no effect on fp16/bf16 code paths.
torch.set_float32_matmul_precision("high")

HF_TOKEN = os.environ.get("HF_TOKEN", "")

# Heaviest practical choice for a ZeroGPU Space
MODEL_ID = "Qwen/Qwen2.5-VL-72B-Instruct-AWQ"

# Aggressive visual token budget for tiny labels / ingredients
MIN_PIXELS = 1024 * 28 * 28
MAX_PIXELS = 4096 * 28 * 28

# Lazily initialised by load_model(); module-level so repeated GPU calls reuse them.
processor = None
model = None
40
 
41
 
42
def load_model() -> None:
    """Lazily initialise the global processor/model pair (singleton pattern).

    Safe to call repeatedly: once both globals are populated, subsequent
    calls return immediately without touching the network.
    """
    global processor, model
    if processor is not None and model is not None:
        # Already initialised by an earlier call — nothing to do.
        return

    print("Loading processor...")
    processor = AutoProcessor.from_pretrained(
        MODEL_ID,
        token=HF_TOKEN if HF_TOKEN else None,
        min_pixels=MIN_PIXELS,
        max_pixels=MAX_PIXELS,
    )

    print("Loading model...")
    model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
        MODEL_ID,
        token=HF_TOKEN if HF_TOKEN else None,
        device_map="auto",
        torch_dtype="auto",
        low_cpu_mem_usage=True,
    )
    model.eval()
    print("Model ready")
 
67
 
68
def prepare_image(image: Image.Image, target_long_side: int = 2400) -> Image.Image:
    """Normalise a pantry photo so tiny label text survives VLM preprocessing.

    Applies the EXIF orientation tag, converts to RGB, upscales small images
    until the long side reaches ``target_long_side`` pixels, then boosts
    contrast and sharpens.

    Args:
        image: Input photo in any PIL mode.
        target_long_side: Minimum long-side size (px) to upscale to; images
            already at least this large keep their native resolution.
            Default preserves the previous hard-coded behaviour.

    Returns:
        A new RGB ``PIL.Image.Image``; the input image is not modified.
    """
    # Respect camera orientation before measuring dimensions.
    image = ImageOps.exif_transpose(image).convert("RGB")

    # Upscale small images so tiny labels have a better chance of being read.
    long_side = max(image.size)
    if long_side < target_long_side:
        scale = target_long_side / long_side
        new_size = (
            max(1, int(round(image.width * scale))),
            max(1, int(round(image.height * scale))),
        )
        # LANCZOS keeps printed-text edges crisp when upscaling.
        image = image.resize(new_size, Image.Resampling.LANCZOS)

    image = ImageOps.autocontrast(image)
    image = image.filter(ImageFilter.SHARPEN)
    return image
86
+
87
+
88
def extract_json(text: str) -> Dict[str, Any]:
    """Best-effort extraction of a JSON object from raw model output.

    Tries, in order:
      1. the whole string, after stripping markdown code fences;
      2. the greedy ``{...}`` span (first ``{`` to last ``}``);
      3. ``JSONDecoder.raw_decode`` from the first ``{``, which tolerates
         trailing prose or a second concatenated object.

    Always returns a dict: falls back to ``{"raw_output": text}`` when
    nothing parses, so callers never need to handle exceptions.
    """
    text = (text or "").strip()

    # Strip common markdown fences (```json ... ```).
    text = re.sub(r"^\s*```(?:json)?\s*", "", text, flags=re.I)
    text = re.sub(r"\s*```\s*$", "", text, flags=re.I)

    try:
        return json.loads(text)
    except Exception:
        pass

    # Greedy span: works when the model wrapped one object in prose.
    match = re.search(r"\{.*\}", text, flags=re.S)
    if match:
        try:
            return json.loads(match.group(0))
        except Exception:
            pass

    # Tolerant fallback: decode the first complete object and ignore whatever
    # follows it (e.g. '{...} Hope this helps!' or two concatenated objects).
    start = text.find("{")
    if start != -1:
        try:
            obj, _ = json.JSONDecoder().raw_decode(text, start)
            if isinstance(obj, dict):
                return obj
        except Exception:
            pass

    return {"raw_output": text}
109
 
110
 
111
+ PROMPT = """Analyze this pantry image carefully.
112
 
113
  Return ONLY valid JSON with this schema:
114
  {
 
124
  "uncertain_items": []
125
  }
126
 
127
+ Rules:
128
+ - Focus on tiny labels, ingredient names, canned goods, jars, boxes, spices, and packaging text.
129
+ - Prefer exact visible text over guesses.
130
+ - If a brand or quantity is unclear, leave it empty or put it in uncertain_items.
131
+ - Do not hallucinate.
132
+ - Return JSON only. No markdown, no explanation, no code fences.
 
 
133
  """
134
 
135
 
136
+ @spaces.GPU(size="xlarge", duration=120)
137
  def analyze_pantry(image: Image.Image):
138
  if image is None:
139
+ return {"error": "Upload an image first."}
140
 
141
  load_model()
142
 
143
+ image = prepare_image(image)
144
 
145
  messages = [
146
  {
 
162
  add_generation_prompt=True,
163
  )
164
 
165
+ image_inputs, video_inputs = process_vision_info(messages)
166
+
167
  inputs = processor(
168
  text=[text],
169
+ images=image_inputs,
170
+ videos=video_inputs,
171
  padding=True,
172
  return_tensors="pt",
173
  )
174
 
175
+ inputs = inputs.to(model.device)
176
 
177
  with torch.inference_mode():
178
  output_ids = model.generate(
179
  **inputs,
180
+ max_new_tokens=700,
181
  do_sample=False,
182
  )
183
 
184
  prompt_len = inputs["input_ids"].shape[-1]
 
185
  generated_text = processor.batch_decode(
186
  [output_ids[0][prompt_len:]],
187
  skip_special_tokens=True,
188
  )[0].strip()
189
 
 
 
190
  parsed = extract_json(generated_text)
 
191
  if isinstance(parsed, dict) and "raw_output" not in parsed:
192
  parsed["_raw_output"] = generated_text
193
 
194
  return parsed
195
 
196
 
 
 
 
 
 
197
with gr.Blocks() as demo:
    # Page header.
    gr.Markdown("# Pantry Scanner")

    # Input row: the photo to analyse.
    with gr.Row():
        image_input = gr.Image(type="pil", label="Pantry image")

    # Action row: trigger button plus the structured result view.
    with gr.Row():
        analyze_btn = gr.Button("Analyze", variant="primary")
        output_json = gr.JSON(label="Detected items")

    # Wire the button to the GPU-backed analysis function.
    analyze_btn.click(analyze_pantry, inputs=image_input, outputs=output_json)
207