Update app.py
Browse files
app.py
CHANGED
|
@@ -7,6 +7,7 @@ from PIL import Image
|
|
| 7 |
from transformers import pipeline
|
| 8 |
from langchain_chroma import Chroma
|
| 9 |
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
|
|
|
| 10 |
from langchain_core.documents import Document
|
| 11 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 12 |
from ultralytics import YOLO
|
|
@@ -27,9 +28,11 @@ vision_pipe = pipeline(
|
|
| 27 |
print("π Loading Embedding Engine...")
|
| 28 |
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 29 |
|
| 30 |
-
# --- BOTTLE DETECTION
|
| 31 |
def get_bottle_crops(image_path):
|
|
|
|
| 32 |
found_crops = []
|
|
|
|
| 33 |
try:
|
| 34 |
original_img = Image.open(image_path).convert("RGB")
|
| 35 |
img_w, img_h = original_img.size
|
|
@@ -41,20 +44,24 @@ def get_bottle_crops(image_path):
|
|
| 41 |
for box in r.boxes:
|
| 42 |
if int(box.cls) in [39, 40, 41]:
|
| 43 |
x1, y1, x2, y2 = box.xyxy[0].tolist()
|
|
|
|
|
|
|
| 44 |
box_w, box_h = x2 - x1, y2 - y1
|
| 45 |
pad_x, pad_y = int(box_w * 0.25), int(box_h * 0.25)
|
| 46 |
|
| 47 |
x1, y1 = max(0, x1 - pad_x), max(0, y1 - pad_y)
|
| 48 |
x2, y2 = min(img_w, x2 + pad_x), min(img_h, y2 + pad_y)
|
|
|
|
| 49 |
found_crops.append(original_img.crop((x1, y1, x2, y2)))
|
| 50 |
|
| 51 |
del yolo_model
|
| 52 |
gc.collect()
|
| 53 |
return found_crops if found_crops else [original_img]
|
| 54 |
-
except Exception:
|
|
|
|
| 55 |
return []
|
| 56 |
|
| 57 |
-
# --- RECIPE INGESTION
|
| 58 |
def ingest_recipes(files):
|
| 59 |
if not files: return "❌ No files uploaded."
|
| 60 |
|
|
@@ -82,37 +89,49 @@ def ingest_recipes(files):
|
|
| 82 |
except Exception as e:
|
| 83 |
return f"❌ Database Error: {e}"
|
| 84 |
|
| 85 |
-
# --- BARTENDER LOGIC
|
| 86 |
def bartend(message, history, img_path, inventory):
|
| 87 |
debug_images = []
|
| 88 |
|
| 89 |
if img_path:
|
| 90 |
-
# Run YOLO just so the user can see what it isolated in the gallery
|
| 91 |
crops = get_bottle_crops(img_path)
|
| 92 |
debug_images = crops
|
| 93 |
|
| 94 |
-
#
|
| 95 |
-
target_img = Image.open(img_path).convert("RGB")
|
| 96 |
|
| 97 |
def identify_spirit(image_input):
|
| 98 |
-
#
|
| 99 |
-
#
|
| 100 |
-
image_input.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
prompt = "User: <image>\nRead the label. What is the specific brand and type of alcohol? Be precise.\nAssistant:"
|
| 103 |
|
| 104 |
-
#
|
| 105 |
-
out = vision_pipe(
|
| 106 |
text = out[0]['generated_text']
|
| 107 |
if "Assistant:" in text: return text.split("Assistant:")[-1].strip()
|
| 108 |
return text.replace("User: <image>", "").strip()
|
| 109 |
|
| 110 |
try:
|
| 111 |
-
# π SPEED FIX 3: Single Pass. No more running the vision model twice.
|
| 112 |
-
print("π Starting Vision Pass (Speed Optimized)...")
|
| 113 |
inventory = identify_spirit(target_img)
|
| 114 |
inventory = re.sub(r'<.*?>', '', inventory).strip().split('.')[0]
|
| 115 |
-
print(f"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
|
| 117 |
except Exception as e:
|
| 118 |
print(f"❌ Vision Failed: {e}")
|
|
@@ -125,7 +144,7 @@ def bartend(message, history, img_path, inventory):
|
|
| 125 |
vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
|
| 126 |
search_query = f"Cocktail recipe using {inventory}"
|
| 127 |
|
| 128 |
-
# Fetch top 4 recipes
|
| 129 |
results = vs.similarity_search(search_query, k=4)
|
| 130 |
recipe_context = "\n\n---\n\n".join([d.page_content for d in results])
|
| 131 |
except Exception as e:
|
|
|
|
| 7 |
from transformers import pipeline
|
| 8 |
from langchain_chroma import Chroma
|
| 9 |
from langchain_community.document_loaders import PyPDFLoader, TextLoader
|
| 10 |
+
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 11 |
from langchain_core.documents import Document
|
| 12 |
from langchain_huggingface import HuggingFaceEmbeddings
|
| 13 |
from ultralytics import YOLO
|
|
|
|
| 28 |
print("π Loading Embedding Engine...")
|
| 29 |
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 30 |
|
| 31 |
+
# --- BOTTLE DETECTION ---
|
| 32 |
def get_bottle_crops(image_path):
|
| 33 |
+
print(f"π DEBUG: Starting YOLO on {image_path}")
|
| 34 |
found_crops = []
|
| 35 |
+
|
| 36 |
try:
|
| 37 |
original_img = Image.open(image_path).convert("RGB")
|
| 38 |
img_w, img_h = original_img.size
|
|
|
|
| 44 |
for box in r.boxes:
|
| 45 |
if int(box.cls) in [39, 40, 41]:
|
| 46 |
x1, y1, x2, y2 = box.xyxy[0].tolist()
|
| 47 |
+
|
| 48 |
+
# 25% Padding to ensure the label isn't cut off
|
| 49 |
box_w, box_h = x2 - x1, y2 - y1
|
| 50 |
pad_x, pad_y = int(box_w * 0.25), int(box_h * 0.25)
|
| 51 |
|
| 52 |
x1, y1 = max(0, x1 - pad_x), max(0, y1 - pad_y)
|
| 53 |
x2, y2 = min(img_w, x2 + pad_x), min(img_h, y2 + pad_y)
|
| 54 |
+
|
| 55 |
found_crops.append(original_img.crop((x1, y1, x2, y2)))
|
| 56 |
|
| 57 |
del yolo_model
|
| 58 |
gc.collect()
|
| 59 |
return found_crops if found_crops else [original_img]
|
| 60 |
+
except Exception as e:
|
| 61 |
+
print(f"❌ YOLO Error: {e}")
|
| 62 |
return []
|
| 63 |
|
| 64 |
+
# --- RECIPE INGESTION ---
|
| 65 |
def ingest_recipes(files):
|
| 66 |
if not files: return "❌ No files uploaded."
|
| 67 |
|
|
|
|
| 89 |
except Exception as e:
|
| 90 |
return f"❌ Database Error: {e}"
|
| 91 |
|
| 92 |
+
# --- BARTENDER LOGIC ---
|
| 93 |
def bartend(message, history, img_path, inventory):
|
| 94 |
debug_images = []
|
| 95 |
|
| 96 |
if img_path:
|
|
|
|
| 97 |
crops = get_bottle_crops(img_path)
|
| 98 |
debug_images = crops
|
| 99 |
|
| 100 |
+
# SPEED FIX 1: We return to using the tight crop, discarding the heavy background!
|
| 101 |
+
target_img = crops[0] if crops else Image.open(img_path).convert("RGB")
|
| 102 |
|
| 103 |
def identify_spirit(image_input):
    """Ask the vision pipeline which brand and type of alcohol an image shows.

    A copy of ``image_input`` is made, converted to RGB when it is in any
    other mode, and shrunk in place to fit inside 384x384, so the image the
    caller passed in is never modified. Generation is capped at 15 new tokens.

    Returns the stripped text after the last "Assistant:" marker in the
    generated output; when the marker is absent, returns the generated text
    with "User: <image>" removed and surrounding whitespace stripped.
    """
    shrunk = image_input.copy()
    if shrunk.mode != "RGB":
        shrunk = shrunk.convert("RGB")
    # thumbnail() resizes the copy in place, bounded by 384x384.
    shrunk.thumbnail((384, 384))

    prompt = "User: <image>\nRead the label. What is the specific brand and type of alcohol? Be precise.\nAssistant:"

    # Only a short answer (the brand name) is requested from the model.
    generated = vision_pipe(shrunk, prompt, generate_kwargs={"max_new_tokens": 15})[0]['generated_text']

    marker = "Assistant:"
    if marker not in generated:
        return generated.replace("User: <image>", "").strip()
    # Keep only what follows the final marker occurrence.
    return generated.rpartition(marker)[2].strip()
|
| 119 |
|
| 120 |
try:
|
|
|
|
|
|
|
| 121 |
inventory = identify_spirit(target_img)
|
| 122 |
inventory = re.sub(r'<.*?>', '', inventory).strip().split('.')[0]
|
| 123 |
+
print(f"π Pass 1 Result: {inventory}")
|
| 124 |
+
|
| 125 |
+
generic_terms = ["vodka", "gin", "rum", "tequila", "whiskey", "whisky", "bourbon", "brandy", "alcohol", "liquor", "spirit", "bottle", "drink"]
|
| 126 |
+
|
| 127 |
+
# ONLY fallback to the heavy full image if the crop failed us
|
| 128 |
+
if inventory.lower() in generic_terms or len(inventory) < 4:
|
| 129 |
+
print("⚠️ Result too generic. Trying FULL IMAGE...")
|
| 130 |
+
full_img_result = identify_spirit(Image.open(img_path))
|
| 131 |
+
full_img_result = re.sub(r'<.*?>', '', full_img_result).strip().split('.')[0]
|
| 132 |
+
if len(full_img_result) > len(inventory):
|
| 133 |
+
inventory = full_img_result
|
| 134 |
+
print(f"✅ Pass 2 Result: {inventory}")
|
| 135 |
|
| 136 |
except Exception as e:
|
| 137 |
print(f"❌ Vision Failed: {e}")
|
|
|
|
| 144 |
vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
|
| 145 |
search_query = f"Cocktail recipe using {inventory}"
|
| 146 |
|
| 147 |
+
# Fetch top 4 distinct recipes
|
| 148 |
results = vs.similarity_search(search_query, k=4)
|
| 149 |
recipe_context = "\n\n---\n\n".join([d.page_content for d in results])
|
| 150 |
except Exception as e:
|