skshimada committed on
Commit
8d2f88f
·
verified ·
1 Parent(s): 06a83ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -35
app.py CHANGED
@@ -13,16 +13,20 @@ from ultralytics import YOLO
13
 
14
  # --- CONFIGURATION ---
15
  CHROMA_PATH = "/tmp/chroma_db"
16
- # Using a native HF Vision model that doesn't need C++ compilation
17
  VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"
18
 
19
  # --- SYSTEM INITIALIZATION ---
20
- # This uses 'transformers', which is pre-installed on HF Spaces
21
  print("⚙️ Loading Stable Vision Engine...")
22
- vision_pipe = pipeline("image-to-text", model=VISION_MODEL, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
 
 
 
 
 
 
23
 
24
  print("📚 Loading Embedding Engine...")
25
- # This replaces the Llama-embeddings to avoid 'Building Wheels'
26
  embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
27
 
28
  # --- BOTTLE DETECTION ---
@@ -33,8 +37,9 @@ def get_bottle_crops(image_path):
33
  original_img = Image.open(image_path)
34
  for r in results:
35
  for box in r.boxes:
36
- if int(box.cls) == 39: # Bottle
37
  x1, y1, x2, y2 = box.xyxy[0].tolist()
 
38
  found_crops.append(original_img.crop((x1-5, y1-5, x2+5, y2+5)))
39
  del yolo_model
40
  gc.collect()
@@ -46,71 +51,93 @@ def ingest_recipes(files):
46
 
47
  docs = []
48
  for f in files:
49
- if f.name.endswith(".txt"):
50
- loader = TextLoader(f.name)
51
- docs.extend(loader.load())
52
- elif f.name.endswith(".pdf"):
53
- loader = PyPDFLoader(f.name)
54
- docs.extend(loader.load())
 
 
 
55
 
 
 
 
 
56
  vector_store = Chroma.from_documents(
57
  documents=docs,
58
  embedding=embed_model,
59
  persist_directory=CHROMA_PATH
60
  )
61
- return f"✅ Ingested {len(docs)} pages/recipes."
62
 
63
  # --- BARTENDER LOGIC ---
64
  def bartend(message, history, img_path, inventory):
65
  # 1. Vision Scanning
66
  if img_path:
67
  crops = get_bottle_crops(img_path)
 
68
  target = crops[0] if crops else Image.open(img_path)
69
- # Use Transformers instead of llama-cpp for the label reading
70
- output = vision_pipe(target, prompt="What brand of alcohol is this?", generate_kwargs={"max_new_tokens": 30})
71
- inventory = output[0]['generated_text'].replace("brand", "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- # 2. RAG (Search your PDFs)
74
  context = ""
75
  try:
76
  vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
77
- search_query = f"{inventory} cocktail"
78
- results = vs.similarity_search(search_query, k=3)
79
- context = "\n".join([d.page_content for d in results])
80
  except:
81
- context = "No PDF recipes loaded yet."
82
 
83
- # 3. Generate Response (Using a fast text pipeline)
84
- # For the free tier, we use a simple text generator or the Vision model's text ability
85
- prompt = f"System: You are a Master Sommelier. Inventory: {inventory}. Source: {context}. User: {message}"
86
-
87
- # Simple response construction for stability
88
- if "No PDF" in context:
89
- response = f"I see you have {inventory}! Since no recipe books are loaded, I recommend a classic pairing. What's your flavor profile?"
90
  else:
91
- response = f"I found a recipe in your books for {inventory}!\n\n{context[:500]}..."
92
 
93
  history.append((message, response))
94
  return history, inventory
95
 
96
  # --- UI LAYOUT ---
97
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
98
- gr.HTML("<h1 style='text-align:center'>🍸 LocalAGI: The Cloud-Stable Sommelier</h1>")
99
  inv_state = gr.State("Empty Shelf")
100
 
101
  with gr.Row():
102
  with gr.Column(scale=1):
103
- file_up = gr.File(label="Upload Recipe PDFs", file_count="multiple")
104
- ingest_btn = gr.Button("📥 Load Recipes")
105
  status = gr.Textbox(label="System Status", value="Ready")
 
 
106
 
107
  with gr.Column(scale=2):
108
- chatbot = gr.Chatbot(height=400)
109
- msg = gr.Textbox(label="Ask the Bartender")
110
- img = gr.Image(type="filepath", label="Bottle Photo")
111
- send_btn = gr.Button("Mix Drink", variant="primary")
112
 
 
113
  ingest_btn.click(ingest_recipes, file_up, status)
 
 
114
  send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
115
 
116
  if __name__ == "__main__":
 
13
 
14
# --- CONFIGURATION ---
# /tmp is the only reliably writable path on a Hugging Face Space, so the
# Chroma database is persisted there (it does not survive a Space restart).
CHROMA_PATH = "/tmp/chroma_db"
# SmolVLM is a very efficient "Vision-Language-Model"
VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"

# --- SYSTEM INITIALIZATION ---
print("⚙️ Loading Stable Vision Engine...")
# FIXED: Changed task to "image-text-to-text" and torch_dtype to dtype
# bfloat16 halves memory versus float32; device_map="auto" lets the loader
# place the model on an accelerator when one is available, CPU otherwise.
# NOTE(review): "dtype" inside model_kwargs follows the newer transformers
# naming ("torch_dtype" is deprecated) — confirm the installed transformers
# version accepts it.
vision_pipe = pipeline(
    "image-text-to-text",
    model=VISION_MODEL,
    model_kwargs={"dtype": torch.bfloat16},
    device_map="auto"
)

print("📚 Loading Embedding Engine...")
# Small sentence-transformers model: fast CPU embeddings for the RAG index.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
32
  # --- BOTTLE DETECTION ---
 
37
  original_img = Image.open(image_path)
38
  for r in results:
39
  for box in r.boxes:
40
+ if int(box.cls) == 39: # COCO Index 39 = Bottle
41
  x1, y1, x2, y2 = box.xyxy[0].tolist()
42
+ # Crop with a tiny bit of padding
43
  found_crops.append(original_img.crop((x1-5, y1-5, x2+5, y2+5)))
44
  del yolo_model
45
  gc.collect()
 
51
 
52
  docs = []
53
  for f in files:
54
+ try:
55
+ if f.name.endswith(".txt"):
56
+ loader = TextLoader(f.name)
57
+ docs.extend(loader.load())
58
+ elif f.name.endswith(".pdf"):
59
+ loader = PyPDFLoader(f.name)
60
+ docs.extend(loader.load())
61
+ except Exception as e:
62
+ print(f"Error loading {f.name}: {e}")
63
 
64
+ if not docs:
65
+ return "❌ Could not extract text from files."
66
+
67
+ # Create the vector database in the /tmp folder
68
  vector_store = Chroma.from_documents(
69
  documents=docs,
70
  embedding=embed_model,
71
  persist_directory=CHROMA_PATH
72
  )
73
+ return f"✅ Ingested {len(docs)} pages/recipes into the bar library."
74
 
75
# --- BARTENDER LOGIC ---
def bartend(message, history, img_path, inventory):
    """Answer one chat turn: optionally identify a bottle photo, then search recipes.

    Args:
        message: The user's chat text.
        history: Gradio chat history — a list of (user, bot) tuples; appended to in place.
        img_path: Filepath of an uploaded bottle photo, or None/"" to skip the vision step.
        inventory: Current inventory label carried in gr.State; replaced when a photo is scanned.

    Returns:
        (history, inventory) so Gradio can refresh the Chatbot and the State together.
    """
    # 1. Vision Scanning
    if img_path:
        crops = get_bottle_crops(img_path)
        # Scan the first detected bottle or the whole image
        target = crops[0] if crops else Image.open(img_path)

        # SmolVLM chat-style prompt format: one user turn with an image slot + question
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is the exact brand and type of alcohol in this image? Answer with just the name."}
                ]
            }
        ]

        # Generate the label
        output = vision_pipe(target, prompt=messages, generate_kwargs={"max_new_tokens": 30})
        # The pipeline echoes the full conversation; keep only the assistant's reply.
        raw_label = output[0]['generated_text']
        inventory = raw_label.split("Assistant:")[-1].strip()

    # 2. RAG (Recipe Search)
    context = ""
    try:
        vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
        search_query = f"Cocktail recipe using {inventory}"
        results = vs.similarity_search(search_query, k=2)
        context = "\n---\n".join([d.page_content for d in results])
    except Exception:
        # FIXED: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt. This path is deliberately best-effort: if no
        # vector store has been ingested yet, we simply answer without context.
        context = ""

    # 3. Formulate Response
    if context:
        response = f"I see you have **{inventory}**. I found this in your recipe books:\n\n{context}"
    else:
        response = f"I see you have **{inventory}**, but I couldn't find a specific match in your uploaded recipes. Would you like a classic suggestion for this spirit?"

    history.append((message, response))
    return history, inventory
118
 
119
# --- UI LAYOUT ---
# NOTE: Gradio renders components in the order they are created inside the
# Blocks context, so statement order here *is* the page layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🍸 LocalAGI: The AI Sommelier")
    # Carries the last scanned bottle label across chat turns.
    inv_state = gr.State("Empty Shelf")

    with gr.Row():
        # Left column: knowledge-base controls and the bottle photo input.
        with gr.Column(scale=1):
            file_up = gr.File(label="1. Upload Recipe Books (PDF/TXT)", file_count="multiple")
            ingest_btn = gr.Button("📥 Load into Memory")
            status = gr.Textbox(label="System Status", value="Ready")
            gr.Markdown("---")
            img = gr.Image(type="filepath", label="2. Photo of your Bottle")

        # Right column: the chat surface.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=500, label="Bartender")
            msg = gr.Textbox(label="3. Ask for a drink", placeholder="Tell me what you feel like drinking...")
            send_btn = gr.Button("Mix It Up", variant="primary")

    # Wire up the buttons
    ingest_btn.click(ingest_recipes, file_up, status)
    # Using 'submit' for the textbox and 'click' for the button — both routes
    # run the same bartend handler with identical inputs/outputs.
    msg.submit(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
    send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
142
 
143
  if __name__ == "__main__":