Spaces:

skshimada
/

Hello

Sleeping

App Files Files Community

skshimada committed 11 days ago

Commit

ce0a4da

verified ·

1 Parent(s): 4ba62ef

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -32

app.py CHANGED Viewed

@@ -33,28 +33,25 @@ def get_bottle_crops(image_path):
     found_crops = []
     try:
-        original_img = Image.open(image_path)
         img_w, img_h = original_img.size
         yolo_model = YOLO("yolov8n.pt")
-        # Low confidence to catch everything
         results = yolo_model(image_path, verbose=True, conf=0.1)
         for r in results:
             for box in r.boxes:
-                if int(box.cls) == 39: # Bottle
                     x1, y1, x2, y2 = box.xyxy[0].tolist()
-                    # --- NEW: Dynamic 25% Padding ---
-                    # Calculate width and height of the detected box
                     box_w = x2 - x1
                     box_h = y2 - y1
-                    # Expand by 25% of the box's own size
                     pad_x = int(box_w * 0.25)
                     pad_y = int(box_h * 0.25)
-                    # Apply padding but stay within image bounds
                     x1 = max(0, x1 - pad_x)
                     y1 = max(0, y1 - pad_y)
                     x2 = min(img_w, x2 + pad_x)
@@ -75,7 +72,7 @@ def get_bottle_crops(image_path):
     except Exception as e:
         print(f"❌ YOLO CRASH: {e}")
         try:
-            return [Image.open(image_path)]
         except:
             return []
@@ -113,37 +110,47 @@ def bartend(message, history, img_path, inventory):
         debug_images = crops
         # Start with the best crop
-        target_img = crops[0] if crops else Image.open(img_path)
-        # Helper function to run vision model
         def identify_spirit(image_input):
-            prompt = "User: <image>\nRead the label. What is the specific brand and type of alcohol? Be precise.\nAssistant:"
-            out = vision_pipe(image_input, prompt=prompt, generate_kwargs={"max_new_tokens": 50})
             text = out[0]['generated_text']
             if "Assistant:" in text:
                 return text.split("Assistant:")[-1].strip()
             return text.replace("User: <image>", "").strip()
-        # Run First Pass (Crop)
-        inventory = identify_spirit(target_img)
-        inventory = re.sub(r'<.*?>', '', inventory).strip().split('.')[0]
-        print(f"🔍 Pass 1 Result: {inventory}")
-        # --- NEW: The "Generic Fallback" Logic ---
-        # If the result is just a generic category, we missed the brand.
-        # Force a check on the FULL image.
-        generic_terms = ["vodka", "gin", "rum", "tequila", "whiskey", "whisky", "bourbon", "brandy", "alcohol", "liquor", "spirit", "bottle"]
-        if inventory.lower() in generic_terms or len(inventory) < 4:
-            print("⚠️ Result too generic. Trying FULL IMAGE...")
-            full_img_result = identify_spirit(Image.open(img_path))
-            full_img_result = re.sub(r'<.*?>', '', full_img_result).strip().split('.')[0]
-            # If the full image gave us a longer (more specific) name, use it
-            if len(full_img_result) > len(inventory):
-                inventory = full_img_result
-                print(f"✅ Pass 2 Result: {inventory}")
     # 2. RAG (Recipe Search)
     recipe_context = ""
@@ -165,6 +172,7 @@ def bartend(message, history, img_path, inventory):
     else:
         response = f"I see you have **{inventory}**! I don't have a specific recipe for that in the current library. Should I suggest a classic drink?"
     history.append({"role": "user", "content": message})
     history.append({"role": "assistant", "content": response})

     found_crops = []
     try:
+        original_img = Image.open(image_path).convert("RGB")
         img_w, img_h = original_img.size
         yolo_model = YOLO("yolov8n.pt")
+        # Extremely low confidence to catch anything
         results = yolo_model(image_path, verbose=True, conf=0.1)
         for r in results:
             for box in r.boxes:
+                # Class 39 is bottle. We also check Class 40 (Wine glass) or 41 (Cup) just in case
+                if int(box.cls) in [39, 40, 41]:
                     x1, y1, x2, y2 = box.xyxy[0].tolist()
+                    # Dynamic 25% Padding
                     box_w = x2 - x1
                     box_h = y2 - y1
                     pad_x = int(box_w * 0.25)
                     pad_y = int(box_h * 0.25)
                     x1 = max(0, x1 - pad_x)
                     y1 = max(0, y1 - pad_y)
                     x2 = min(img_w, x2 + pad_x)
     except Exception as e:
         print(f"❌ YOLO CRASH: {e}")
         try:
+            return [Image.open(image_path).convert("RGB")]
         except:
             return []
         debug_images = crops
         # Start with the best crop
+        target_img = crops[0] if crops else Image.open(img_path).convert("RGB")
+        # Helper function with FIXED calling signature
         def identify_spirit(image_input):
+            # Ensure image is RGB to prevent pipeline errors
+            if image_input.mode != "RGB":
+                image_input = image_input.convert("RGB")
+            prompt = "User: <image>\nRead the label on the bottle. What is the specific brand and type of alcohol? Be precise.\nAssistant:"
+            # FIXED: Passing prompt as a positional argument (the second argument)
+            # This fixes the "ValueError: You must provide text" error
+            out = vision_pipe(image_input, prompt, generate_kwargs={"max_new_tokens": 50})
             text = out[0]['generated_text']
             if "Assistant:" in text:
                 return text.split("Assistant:")[-1].strip()
             return text.replace("User: <image>", "").strip()
+        # Run Pass 1
+        try:
+            inventory = identify_spirit(target_img)
+            inventory = re.sub(r'<.*?>', '', inventory).strip().split('.')[0]
+            print(f"🔍 Pass 1 Result: {inventory}")
+            # Generic Fallback Logic
+            generic_terms = ["vodka", "gin", "rum", "tequila", "whiskey", "whisky", "bourbon", "brandy", "alcohol", "liquor", "spirit", "bottle", "drink", "glass"]
+            # If the answer is too short or generic, try the FULL image
+            if inventory.lower() in generic_terms or len(inventory) < 4:
+                print("⚠️ Result too generic. Trying FULL IMAGE...")
+                full_img_result = identify_spirit(Image.open(img_path).convert("RGB"))
+                full_img_result = re.sub(r'<.*?>', '', full_img_result).strip().split('.')[0]
+                if len(full_img_result) > len(inventory):
+                    inventory = full_img_result
+                    print(f"✅ Pass 2 Result: {inventory}")
+        except Exception as e:
+            print(f"❌ Vision Pipeline Failed: {e}")
+            inventory = "Unknown Spirit"
     # 2. RAG (Recipe Search)
     recipe_context = ""
     else:
         response = f"I see you have **{inventory}**! I don't have a specific recipe for that in the current library. Should I suggest a classic drink?"
+    # Gradio 6.0 Dictionary Format
     history.append({"role": "user", "content": message})
     history.append({"role": "assistant", "content": response})