skshimada committed on
Commit
8d2f88f
·
verified ·
1 Parent(s): 06a83ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +62 -35
app.py CHANGED
@@ -13,16 +13,20 @@ from ultralytics import YOLO
13
 
14
  # --- CONFIGURATION ---
15
  CHROMA_PATH = "/tmp/chroma_db"
16
- # Using a native HF Vision model that doesn't need C++ compilation
17
  VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"
18
 
19
  # --- SYSTEM INITIALIZATION ---
20
- # This uses 'transformers', which is pre-installed on HF Spaces
21
  print("⚙️ Loading Stable Vision Engine...")
22
- vision_pipe = pipeline("image-to-text", model=VISION_MODEL, model_kwargs={"torch_dtype": torch.bfloat16}, device_map="auto")
 
 
 
 
 
 
23
 
24
  print("📚 Loading Embedding Engine...")
25
- # This replaces the Llama-embeddings to avoid 'Building Wheels'
26
  embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
27
 
28
  # --- BOTTLE DETECTION ---
@@ -33,8 +37,9 @@ def get_bottle_crops(image_path):
33
  original_img = Image.open(image_path)
34
  for r in results:
35
  for box in r.boxes:
36
- if int(box.cls) == 39: # Bottle
37
  x1, y1, x2, y2 = box.xyxy[0].tolist()
 
38
  found_crops.append(original_img.crop((x1-5, y1-5, x2+5, y2+5)))
39
  del yolo_model
40
  gc.collect()
@@ -46,71 +51,93 @@ def ingest_recipes(files):
46
 
47
  docs = []
48
  for f in files:
49
- if f.name.endswith(".txt"):
50
- loader = TextLoader(f.name)
51
- docs.extend(loader.load())
52
- elif f.name.endswith(".pdf"):
53
- loader = PyPDFLoader(f.name)
54
- docs.extend(loader.load())
 
 
 
55
 
 
 
 
 
56
  vector_store = Chroma.from_documents(
57
  documents=docs,
58
  embedding=embed_model,
59
  persist_directory=CHROMA_PATH
60
  )
61
- return f"✅ Ingested {len(docs)} pages/recipes."
62
 
63
  # --- BARTENDER LOGIC ---
64
  def bartend(message, history, img_path, inventory):
65
  # 1. Vision Scanning
66
  if img_path:
67
  crops = get_bottle_crops(img_path)
 
68
  target = crops[0] if crops else Image.open(img_path)
69
- # Use Transformers instead of llama-cpp for the label reading
70
- output = vision_pipe(target, prompt="What brand of alcohol is this?", generate_kwargs={"max_new_tokens": 30})
71
- inventory = output[0]['generated_text'].replace("brand", "").strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
- # 2. RAG (Search your PDFs)
74
  context = ""
75
  try:
76
  vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
77
- search_query = f"{inventory} cocktail"
78
- results = vs.similarity_search(search_query, k=3)
79
- context = "\n".join([d.page_content for d in results])
80
  except:
81
- context = "No PDF recipes loaded yet."
82
 
83
- # 3. Generate Response (Using a fast text pipeline)
84
- # For the free tier, we use a simple text generator or the Vision model's text ability
85
- prompt = f"System: You are a Master Sommelier. Inventory: {inventory}. Source: {context}. User: {message}"
86
-
87
- # Simple response construction for stability
88
- if "No PDF" in context:
89
- response = f"I see you have {inventory}! Since no recipe books are loaded, I recommend a classic pairing. What's your flavor profile?"
90
  else:
91
- response = f"I found a recipe in your books for {inventory}!\n\n{context[:500]}..."
92
 
93
  history.append((message, response))
94
  return history, inventory
95
 
96
  # --- UI LAYOUT ---
97
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
98
- gr.HTML("<h1 style='text-align:center'>🍸 LocalAGI: The Cloud-Stable Sommelier</h1>")
99
  inv_state = gr.State("Empty Shelf")
100
 
101
  with gr.Row():
102
  with gr.Column(scale=1):
103
- file_up = gr.File(label="Upload Recipe PDFs", file_count="multiple")
104
- ingest_btn = gr.Button("📥 Load Recipes")
105
  status = gr.Textbox(label="System Status", value="Ready")
 
 
106
 
107
  with gr.Column(scale=2):
108
- chatbot = gr.Chatbot(height=400)
109
- msg = gr.Textbox(label="Ask the Bartender")
110
- img = gr.Image(type="filepath", label="Bottle Photo")
111
- send_btn = gr.Button("Mix Drink", variant="primary")
112
 
 
113
  ingest_btn.click(ingest_recipes, file_up, status)
 
 
114
  send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
115
 
116
  if __name__ == "__main__":
 
13
 
14
# --- CONFIGURATION ---
# /tmp is the only reliably writable path on a Hugging Face Space, so the
# Chroma database is persisted there (it does not survive a Space restart).
CHROMA_PATH = "/tmp/chroma_db"
# SmolVLM is a very efficient "Vision-Language-Model"
VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"

# --- SYSTEM INITIALIZATION ---
print("⚙️ Loading Stable Vision Engine...")
# FIXED: Changed task to "image-text-to-text" and torch_dtype to dtype
# bfloat16 halves memory versus float32; device_map="auto" lets the loader
# place the model on an accelerator when one is available, CPU otherwise.
# NOTE(review): "dtype" inside model_kwargs follows the newer transformers
# naming ("torch_dtype" is deprecated) — confirm the installed transformers
# version accepts it.
vision_pipe = pipeline(
    "image-text-to-text",
    model=VISION_MODEL,
    model_kwargs={"dtype": torch.bfloat16},
    device_map="auto"
)

print("📚 Loading Embedding Engine...")
# Small sentence-transformers model: fast CPU embeddings for the RAG index.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
32
  # --- BOTTLE DETECTION ---
 
37
  original_img = Image.open(image_path)
38
  for r in results:
39
  for box in r.boxes:
40
+ if int(box.cls) == 39: # COCO Index 39 = Bottle
41
  x1, y1, x2, y2 = box.xyxy[0].tolist()
42
+ # Crop with a tiny bit of padding
43
  found_crops.append(original_img.crop((x1-5, y1-5, x2+5, y2+5)))
44
  del yolo_model
45
  gc.collect()
 
51
 
52
  docs = []
53
  for f in files:
54
+ try:
55
+ if f.name.endswith(".txt"):
56
+ loader = TextLoader(f.name)
57
+ docs.extend(loader.load())
58
+ elif f.name.endswith(".pdf"):
59
+ loader = PyPDFLoader(f.name)
60
+ docs.extend(loader.load())
61
+ except Exception as e:
62
+ print(f"Error loading {f.name}: {e}")
63
 
64
+ if not docs:
65
+ return "❌ Could not extract text from files."
66
+
67
+ # Create the vector database in the /tmp folder
68
  vector_store = Chroma.from_documents(
69
  documents=docs,
70
  embedding=embed_model,
71
  persist_directory=CHROMA_PATH
72
  )
73
+ return f"✅ Ingested {len(docs)} pages/recipes into the bar library."
74
 
75
# --- BARTENDER LOGIC ---
def bartend(message, history, img_path, inventory):
    """Answer one chat turn: optionally identify a bottle photo, then search recipes.

    Args:
        message: The user's chat text.
        history: Gradio chat history — a list of (user, bot) tuples; appended to in place.
        img_path: Filepath of an uploaded bottle photo, or None/"" to skip the vision step.
        inventory: Current inventory label carried in gr.State; replaced when a photo is scanned.

    Returns:
        (history, inventory) so Gradio can refresh the Chatbot and the State together.
    """
    # 1. Vision Scanning
    if img_path:
        crops = get_bottle_crops(img_path)
        # Scan the first detected bottle or the whole image
        target = crops[0] if crops else Image.open(img_path)

        # SmolVLM chat-style prompt format: one user turn with an image slot + question
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": "What is the exact brand and type of alcohol in this image? Answer with just the name."}
                ]
            }
        ]

        # Generate the label
        output = vision_pipe(target, prompt=messages, generate_kwargs={"max_new_tokens": 30})
        # The pipeline echoes the full conversation; keep only the assistant's reply.
        raw_label = output[0]['generated_text']
        inventory = raw_label.split("Assistant:")[-1].strip()

    # 2. RAG (Recipe Search)
    context = ""
    try:
        vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
        search_query = f"Cocktail recipe using {inventory}"
        results = vs.similarity_search(search_query, k=2)
        context = "\n---\n".join([d.page_content for d in results])
    except Exception:
        # FIXED: was a bare `except:`, which also swallows SystemExit and
        # KeyboardInterrupt. This path is deliberately best-effort: if no
        # vector store has been ingested yet, we simply answer without context.
        context = ""

    # 3. Formulate Response
    if context:
        response = f"I see you have **{inventory}**. I found this in your recipe books:\n\n{context}"
    else:
        response = f"I see you have **{inventory}**, but I couldn't find a specific match in your uploaded recipes. Would you like a classic suggestion for this spirit?"

    history.append((message, response))
    return history, inventory
118
 
119
# --- UI LAYOUT ---
# NOTE: Gradio renders components in the order they are created inside the
# Blocks context, so statement order here *is* the page layout.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🍸 LocalAGI: The AI Sommelier")
    # Carries the last scanned bottle label across chat turns.
    inv_state = gr.State("Empty Shelf")

    with gr.Row():
        # Left column: knowledge-base controls and the bottle photo input.
        with gr.Column(scale=1):
            file_up = gr.File(label="1. Upload Recipe Books (PDF/TXT)", file_count="multiple")
            ingest_btn = gr.Button("📥 Load into Memory")
            status = gr.Textbox(label="System Status", value="Ready")
            gr.Markdown("---")
            img = gr.Image(type="filepath", label="2. Photo of your Bottle")

        # Right column: the chat surface.
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=500, label="Bartender")
            msg = gr.Textbox(label="3. Ask for a drink", placeholder="Tell me what you feel like drinking...")
            send_btn = gr.Button("Mix It Up", variant="primary")

    # Wire up the buttons
    ingest_btn.click(ingest_recipes, file_up, status)
    # Using 'submit' for the textbox and 'click' for the button — both routes
    # run the same bartend handler with identical inputs/outputs.
    msg.submit(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
    send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
142
 
143
  if __name__ == "__main__":