skshimada committed on
Commit
0bae07f
·
verified ·
1 Parent(s): acde7e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -19
app.py CHANGED
@@ -13,17 +13,17 @@ from ultralytics import YOLO
13
 
14
  # --- CONFIGURATION ---
15
  CHROMA_PATH = "/tmp/chroma_db"
16
- # SmolVLM is a very efficient "Vision-Language-Model"
17
  VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"
18
 
19
  # --- SYSTEM INITIALIZATION ---
20
  print("⚙️ Loading Stable Vision Engine...")
21
- # FIXED: Changed task to "image-text-to-text" and torch_dtype to dtype
22
  vision_pipe = pipeline(
23
  "image-text-to-text",
24
  model=VISION_MODEL,
25
- model_kwargs={"dtype": torch.bfloat16},
26
- device_map="auto"
27
  )
28
 
29
  print("📚 Loading Embedding Engine...")
@@ -31,6 +31,7 @@ embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM
31
 
32
  # --- BOTTLE DETECTION ---
33
  def get_bottle_crops(image_path):
 
34
  yolo_model = YOLO("yolov8n.pt")
35
  results = yolo_model(image_path, verbose=False)
36
  found_crops = []
@@ -39,8 +40,9 @@ def get_bottle_crops(image_path):
39
  for box in r.boxes:
40
  if int(box.cls) == 39: # COCO Index 39 = Bottle
41
  x1, y1, x2, y2 = box.xyxy[0].tolist()
42
- # Crop with a tiny bit of padding
43
  found_crops.append(original_img.crop((x1-5, y1-5, x2+5, y2+5)))
 
 
44
  del yolo_model
45
  gc.collect()
46
  return found_crops
@@ -64,7 +66,7 @@ def ingest_recipes(files):
64
  if not docs:
65
  return "❌ Could not extract text from files."
66
 
67
- # Create the vector database in the /tmp folder
68
  vector_store = Chroma.from_documents(
69
  documents=docs,
70
  embedding=embed_model,
@@ -74,44 +76,43 @@ def ingest_recipes(files):
74
 
75
  # --- BARTENDER LOGIC ---
76
  def bartend(message, history, img_path, inventory):
77
- # 1. Vision Scanning
78
  if img_path:
79
  crops = get_bottle_crops(img_path)
80
- # Scan the first detected bottle or the whole image
81
  target = crops[0] if crops else Image.open(img_path)
82
 
83
- # SmolVLM prompt format
84
  messages = [
85
  {
86
  "role": "user",
87
  "content": [
88
  {"type": "image"},
89
- {"type": "text", "text": "What is the exact brand and type of alcohol in this image? Answer with just the name."}
90
  ]
91
  }
92
  ]
93
 
94
- # Generate the label
95
  output = vision_pipe(target, prompt=messages, generate_kwargs={"max_new_tokens": 30})
96
- # Clean up the output string
97
  raw_label = output[0]['generated_text']
 
98
  inventory = raw_label.split("Assistant:")[-1].strip()
99
 
100
- # 2. RAG (Recipe Search)
101
  context = ""
102
  try:
103
  vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
104
  search_query = f"Cocktail recipe using {inventory}"
105
  results = vs.similarity_search(search_query, k=2)
106
  context = "\n---\n".join([d.page_content for d in results])
107
- except:
 
108
  context = ""
109
 
110
- # 3. Formulate Response
111
  if context:
112
- response = f"I see you have **{inventory}**. I found this in your recipe books:\n\n{context}"
113
  else:
114
- response = f"I see you have **{inventory}**, but I couldn't find a specific match in your uploaded recipes. Would you like a classic suggestion for this spirit?"
115
 
116
  history.append((message, response))
117
  return history, inventory
@@ -134,9 +135,8 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
134
  msg = gr.Textbox(label="3. Ask for a drink", placeholder="Tell me what you feel like drinking...")
135
  send_btn = gr.Button("Mix It Up", variant="primary")
136
 
137
- # Wire up the buttons
138
  ingest_btn.click(ingest_recipes, file_up, status)
139
- # Using 'submit' for the textbox and 'click' for the button
140
  msg.submit(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
141
  send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
142
 
 
13
 
14
  # --- CONFIGURATION ---
15
  CHROMA_PATH = "/tmp/chroma_db"
16
+ # SmolVLM is a very efficient "Vision-Language-Model" for CPU usage
17
  VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"
18
 
19
  # --- SYSTEM INITIALIZATION ---
20
  print("⚙️ Loading Stable Vision Engine...")
21
+ # We use device="cpu" and float32 to avoid the "accelerate" dependency error
22
  vision_pipe = pipeline(
23
  "image-text-to-text",
24
  model=VISION_MODEL,
25
+ model_kwargs={"dtype": torch.float32},
26
+ device="cpu"
27
  )
28
 
29
  print("📚 Loading Embedding Engine...")
 
31
 
32
  # --- BOTTLE DETECTION ---
33
  def get_bottle_crops(image_path):
34
+ # YOLO downloads its weights automatically to the local directory
35
  yolo_model = YOLO("yolov8n.pt")
36
  results = yolo_model(image_path, verbose=False)
37
  found_crops = []
 
40
  for box in r.boxes:
41
  if int(box.cls) == 39: # COCO Index 39 = Bottle
42
  x1, y1, x2, y2 = box.xyxy[0].tolist()
 
43
  found_crops.append(original_img.crop((x1-5, y1-5, x2+5, y2+5)))
44
+
45
+ # Manual cleanup to save RAM on the Free Tier
46
  del yolo_model
47
  gc.collect()
48
  return found_crops
 
66
  if not docs:
67
  return "❌ Could not extract text from files."
68
 
69
+ # Initializing the vector database in /tmp for write access
70
  vector_store = Chroma.from_documents(
71
  documents=docs,
72
  embedding=embed_model,
 
76
 
77
  # --- BARTENDER LOGIC ---
78
  def bartend(message, history, img_path, inventory):
79
+ # 1. Vision Scanning (if image is provided)
80
  if img_path:
81
  crops = get_bottle_crops(img_path)
 
82
  target = crops[0] if crops else Image.open(img_path)
83
 
84
+ # Format for SmolVLM to ensure high accuracy
85
  messages = [
86
  {
87
  "role": "user",
88
  "content": [
89
  {"type": "image"},
90
+ {"type": "text", "text": "Identify the brand and specific alcohol type in this image. Answer briefly."}
91
  ]
92
  }
93
  ]
94
 
 
95
  output = vision_pipe(target, prompt=messages, generate_kwargs={"max_new_tokens": 30})
 
96
  raw_label = output[0]['generated_text']
97
+ # Extract the Assistant's answer from the prompt/response sequence
98
  inventory = raw_label.split("Assistant:")[-1].strip()
99
 
100
+ # 2. RAG (Recipe Search in PDF/TXT)
101
  context = ""
102
  try:
103
  vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
104
  search_query = f"Cocktail recipe using {inventory}"
105
  results = vs.similarity_search(search_query, k=2)
106
  context = "\n---\n".join([d.page_content for d in results])
107
+ except Exception as e:
108
+ print(f"Search error: {e}")
109
  context = ""
110
 
111
+ # 3. Final Response Construction
112
  if context:
113
+ response = f"I see you have **{inventory}**. Based on your recipe books, here is a suggestion:\n\n{context}"
114
  else:
115
+ response = f"I identified **{inventory}** on your shelf! I don't see a specific match in your uploaded books, but I can suggest a classic drink for this spirit if you'd like."
116
 
117
  history.append((message, response))
118
  return history, inventory
 
135
  msg = gr.Textbox(label="3. Ask for a drink", placeholder="Tell me what you feel like drinking...")
136
  send_btn = gr.Button("Mix It Up", variant="primary")
137
 
138
+ # Wire up events
139
  ingest_btn.click(ingest_recipes, file_up, status)
 
140
  msg.submit(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
141
  send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state])
142