skshimada committed on
Commit
0bae07f
·
verified ·
1 Parent(s): acde7e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -19
app.py CHANGED
@@ -13,17 +13,17 @@ from ultralytics import YOLO
13
 
14
  # --- CONFIGURATION ---
15
  CHROMA_PATH = "/tmp/chroma_db"
16
- # SmolVLM is a very efficient "Vision-Language-Model"
17
  VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"
18
 
19
  # --- SYSTEM INITIALIZATION ---
20
  print("⚙️ Loading Stable Vision Engine...")
21
- # FIXED: Changed task to "image-text-to-text" and torch_dtype to dtype
22
  vision_pipe = pipeline(
23
  "image-text-to-text",
24
  model=VISION_MODEL,
25
- model_kwargs={"dtype": torch.bfloat16},
26
- device_map="auto"
27
  )
28
 
29
  print("📚 Loading Embedding Engine...")
@@ -31,6 +31,7 @@ embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM
31
 
32
  # --- BOTTLE DETECTION ---
33
  def get_bottle_crops(image_path):
 
34
  yolo_model = YOLO("yolov8n.pt")
35
  results = yolo_model(image_path, verbose=False)
36
  found_crops = []
@@ -39,8 +40,9 @@ def get_bottle_crops(image_path):
39
  for box in r.boxes:
40
  if int(box.cls) == 39: # COCO Index 39 = Bottle
41
  x1, y1, x2, y2 = box.xyxy[0].tolist()
42
- # Crop with a tiny bit of padding
43
  found_crops.append(original_img.crop((x1-5, y1-5, x2+5, y2+5)))
 
 
44
  del yolo_model
45
  gc.collect()
46
  return found_crops
@@ -64,7 +66,7 @@ def ingest_recipes(files):
64
  if not docs:
65
  return "❌ Could not extract text from files."
66
 
67
- # Create the vector database in the /tmp folder
68
  vector_store = Chroma.from_documents(
69
  documents=docs,
70
  embedding=embed_model,
@@ -74,44 +76,43 @@ def ingest_recipes(files):
74
 
75
  # --- BARTENDER LOGIC ---
76
  def bartend(message, history, img_path, inventory):
77
- # 1. Vision Scanning
78
  if img_path:
79
  crops = get_bottle_crops(img_path)
80
- # Scan the first detected bottle or the whole image
81
  target = crops[0] if crops else Image.open(img_path)
82
 
83
- # SmolVLM prompt format
84
  messages = [
85
  {
86
  "role": "user",
87
  "content": [
88
  {"type": "image"},
89
- {"type": "text", "text": "What is the exact brand and type of alcohol in this image? Answer with just the name."}
90
  ]
91
  }
92
  ]
93
 
94
- # Generate the label
95
  output = vision_pipe(target, prompt=messages, generate_kwargs={"max_new_tokens": 30})
96
- # Clean up the output string
97
  raw_label = output[0]['generated_text']
 
98
  inventory = raw_label.split("Assistant:")[-1].strip()
99
 
100
- # 2. RAG (Recipe Search)
101
  context = ""
102
  try:
103
  vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
104
  search_query = f"Cocktail recipe using {inventory}"
105
  results = vs.similarity_search(search_query, k=2)
106
  context = "\n---\n".join([d.page_content for d in results])
107
- except:
 
108
  context = ""
109
 
110
- # 3. Formulate Response
111
  if context:
112
- response = f"I see you have **{inventory}**. I found this in your recipe books:\n\n{context}"
113
  else:
114
- response = f"I see you have **{inventory}**, but I couldn't find a specific match in your uploaded recipes. Would you like a classic suggestion for this spirit?"
115
 
116
  history.append((message, response))
117
  return history, inventory
@@ -134,9 +135,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
134
  msg = gr.Textbox(label="3. Ask for a drink", placeholder="Tell me what you feel like drinking...")
135
  send_btn = gr.Button("Mix It Up", variant="primary")
136
 
137
- # Wire up the buttons
138
  ingest_btn.click(ingest_recipes, file_up, status)
139
- # Using 'submit' for the textbox and 'click' for the button
140
  msg.submit(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
141
  send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
142
 
 
13
 
14
  # --- CONFIGURATION ---
15
  CHROMA_PATH = "/tmp/chroma_db"
16
+ # SmolVLM is a very efficient "Vision-Language-Model" for CPU usage
17
  VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"
18
 
19
  # --- SYSTEM INITIALIZATION ---
20
  print("⚙️ Loading Stable Vision Engine...")
21
+ # We use device="cpu" and float32 to avoid the "accelerate" dependency error
22
  vision_pipe = pipeline(
23
  "image-text-to-text",
24
  model=VISION_MODEL,
25
+ model_kwargs={"dtype": torch.float32},
26
+ device="cpu"
27
  )
28
 
29
  print("📚 Loading Embedding Engine...")
 
31
 
32
  # --- BOTTLE DETECTION ---
33
  def get_bottle_crops(image_path):
34
+ # YOLO downloads its weights automatically to the local directory
35
  yolo_model = YOLO("yolov8n.pt")
36
  results = yolo_model(image_path, verbose=False)
37
  found_crops = []
 
40
  for box in r.boxes:
41
  if int(box.cls) == 39: # COCO Index 39 = Bottle
42
  x1, y1, x2, y2 = box.xyxy[0].tolist()
 
43
  found_crops.append(original_img.crop((x1-5, y1-5, x2+5, y2+5)))
44
+
45
+ # Manual cleanup to save RAM on the Free Tier
46
  del yolo_model
47
  gc.collect()
48
  return found_crops
 
66
  if not docs:
67
  return "❌ Could not extract text from files."
68
 
69
+ # Initializing the vector database in /tmp for write access
70
  vector_store = Chroma.from_documents(
71
  documents=docs,
72
  embedding=embed_model,
 
76
 
77
  # --- BARTENDER LOGIC ---
78
  def bartend(message, history, img_path, inventory):
79
+ # 1. Vision Scanning (if image is provided)
80
  if img_path:
81
  crops = get_bottle_crops(img_path)
 
82
  target = crops[0] if crops else Image.open(img_path)
83
 
84
+ # Format for SmolVLM to ensure high accuracy
85
  messages = [
86
  {
87
  "role": "user",
88
  "content": [
89
  {"type": "image"},
90
+ {"type": "text", "text": "Identify the brand and specific alcohol type in this image. Answer briefly."}
91
  ]
92
  }
93
  ]
94
 
 
95
  output = vision_pipe(target, prompt=messages, generate_kwargs={"max_new_tokens": 30})
 
96
  raw_label = output[0]['generated_text']
97
+ # Extract the Assistant's answer from the prompt/response sequence
98
  inventory = raw_label.split("Assistant:")[-1].strip()
99
 
100
+ # 2. RAG (Recipe Search in PDF/TXT)
101
  context = ""
102
  try:
103
  vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
104
  search_query = f"Cocktail recipe using {inventory}"
105
  results = vs.similarity_search(search_query, k=2)
106
  context = "\n---\n".join([d.page_content for d in results])
107
+ except Exception as e:
108
+ print(f"Search error: {e}")
109
  context = ""
110
 
111
+ # 3. Final Response Construction
112
  if context:
113
+ response = f"I see you have **{inventory}**. Based on your recipe books, here is a suggestion:\n\n{context}"
114
  else:
115
+ response = f"I identified **{inventory}** on your shelf! I don't see a specific match in your uploaded books, but I can suggest a classic drink for this spirit if you'd like."
116
 
117
  history.append((message, response))
118
  return history, inventory
 
135
  msg = gr.Textbox(label="3. Ask for a drink", placeholder="Tell me what you feel like drinking...")
136
  send_btn = gr.Button("Mix It Up", variant="primary")
137
 
138
+ # Wire up events
139
  ingest_btn.click(ingest_recipes, file_up, status)
 
140
  msg.submit(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
141
  send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
142