File size: 8,099 Bytes
8edd4f5
 
 
c20bab5
7280e12
8edd4f5
c20bab5
8edd4f5
c20bab5
73d9e71
c20bab5
 
527c0df
8edd4f5
7280e12
 
c20bab5
8edd4f5
c20bab5
 
8d2f88f
 
 
0bae07f
 
8d2f88f
8edd4f5
c20bab5
 
8edd4f5
73d9e71
8edd4f5
73d9e71
ddb1921
73d9e71
1cc7f06
ce0a4da
4ba62ef
ddb1921
 
c4c69b9
ddb1921
1cc7f06
 
c4c69b9
1cc7f06
73d9e71
 
c4c69b9
 
ddb1921
c4c69b9
 
73d9e71
c4c69b9
ddb1921
1cc7f06
 
c4c69b9
73d9e71
 
c4c69b9
ddb1921
73d9e71
7280e12
 
33c5c81
8edd4f5
 
8d2f88f
c4c69b9
 
 
7280e12
4ba62ef
8d2f88f
43779e4
 
 
 
 
 
 
 
 
 
c4c69b9
43779e4
 
 
8edd4f5
73d9e71
c20bab5
ddb1921
 
7280e12
 
4ba62ef
c4c69b9
73d9e71
 
099e0d3
4ba62ef
73d9e71
 
 
 
 
 
 
c4c69b9
33c5c81
c4c69b9
73d9e71
 
4ba62ef
33c5c81
4ba62ef
 
ce0a4da
 
 
73d9e71
 
 
 
 
 
 
 
 
 
 
 
ce0a4da
 
33c5c81
ce0a4da
c20bab5
1cc7f06
993f3d0
1cc7f06
 
 
 
33c5c81
73d9e71
43779e4
 
1cc7f06
 
c20bab5
993f3d0
ddb1921
993f3d0
43779e4
c20bab5
ddb1921
c20bab5
c3f6e08
 
 
ddb1921
c20bab5
 
993f3d0
8d2f88f
c20bab5
8edd4f5
 
 
6b342db
 
c20bab5
ddb1921
8d2f88f
6b342db
7280e12
c4c69b9
ddb1921
 
8edd4f5
993f3d0
6b342db
8d2f88f
7280e12
 
ddb1921
 
8edd4f5
7280e12
993f3d0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
import gradio as gr
import os
import re
import torch
import gc
from PIL import Image
from transformers import pipeline
from langchain_chroma import Chroma
from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document
from langchain_huggingface import HuggingFaceEmbeddings
from ultralytics import YOLO

# --- CONFIGURATION ---
# Persistent Chroma vector-store location. /tmp is wiped on reboot, so the
# recipe library must be re-ingested after every restart.
CHROMA_PATH = "/tmp/chroma_db"
# Small vision-language model used to read bottle labels; chosen to run on CPU.
VISION_MODEL = "HuggingFaceTB/SmolVLM-Instruct"

# --- SYSTEM INITIALIZATION ---
# Both models are loaded once at import time so each chat turn is fast.
print("โš™๏ธ Loading Stable Vision Engine...")
vision_pipe = pipeline(
    "image-text-to-text", 
    model=VISION_MODEL, 
    # float32 on CPU for numerical stability (half precision is GPU-oriented).
    # NOTE(review): "dtype" inside model_kwargs is forwarded to from_pretrained;
    # older transformers versions expect "torch_dtype" — confirm against the
    # pinned transformers version.
    model_kwargs={"dtype": torch.float32}, 
    device="cpu"
)

print("๐Ÿ“š Loading Embedding Engine...")
# Sentence embedder used both for ingestion and for similarity search.
embed_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# --- BOTTLE DETECTION ---
# COCO class ids YOLOv8 reports for drink containers: 39=bottle, 40=wine glass, 41=cup.
_DRINK_CLASS_IDS = {39, 40, 41}

def get_bottle_crops(image_path):
    """Detect drink containers in a photo and return padded PIL crops.

    Runs YOLOv8-nano over ``image_path`` and crops every detected
    bottle/glass/cup, padding each box by 25% per side so the label text
    is not cut off at the edges.

    Args:
        image_path: filesystem path to the uploaded photo.

    Returns:
        list of PIL images: one crop per detection; the full image when
        nothing was detected (or detection itself failed after the image
        loaded); an empty list only when the image cannot be opened.
    """
    print(f"๐Ÿ” DEBUG: Starting YOLO on {image_path}")

    # Image-load failure is the only unrecoverable case — there is nothing
    # to fall back to, so signal it with an empty list.
    try:
        original_img = Image.open(image_path).convert("RGB")
    except Exception as e:
        print(f"โŒ Image load error: {e}")
        return []

    img_w, img_h = original_img.size
    found_crops = []

    try:
        # Loaded per call and freed below to keep steady-state RAM low.
        yolo_model = YOLO("yolov8n.pt")
        # conf=0.1 is deliberately permissive: missing a bottle is worse than
        # an extra crop, since the vision model filters the result later.
        results = yolo_model(image_path, verbose=False, conf=0.1)

        for r in results:
            for box in r.boxes:
                if int(box.cls) in _DRINK_CLASS_IDS:
                    x1, y1, x2, y2 = box.xyxy[0].tolist()

                    # 25% padding on each side so the label isn't cut off.
                    pad_x = int((x2 - x1) * 0.25)
                    pad_y = int((y2 - y1) * 0.25)

                    left = max(0, int(x1) - pad_x)
                    top = max(0, int(y1) - pad_y)
                    right = min(img_w, int(x2) + pad_x)
                    bottom = min(img_h, int(y2) + pad_y)

                    found_crops.append(original_img.crop((left, top, right, bottom)))

        del yolo_model
        gc.collect()
    except Exception as e:
        # Detection failed but the image is fine — fall back to the full
        # frame instead of returning nothing (matches the no-detection path).
        print(f"โŒ YOLO Error: {e}")

    return found_crops if found_crops else [original_img]

# --- RECIPE INGESTION ---
def ingest_recipes(files):
    """Load uploaded recipe files, split them into one-recipe chunks, and
    index the chunks in the Chroma vector store.

    Args:
        files: gradio file objects (each exposing a ``.name`` path), or
            None/empty when nothing was uploaded.

    Returns:
        str: a human-readable status message for the UI.
    """
    if not files:
        return "โŒ No files uploaded."

    docs = []
    for f in files:
        # Case-insensitive suffix check so ".PDF"/".TXT" uploads are not
        # silently skipped (the original exact-case check dropped them).
        suffix = os.path.splitext(f.name)[1].lower()
        try:
            if suffix == ".txt":
                docs.extend(TextLoader(f.name).load())
            elif suffix == ".pdf":
                docs.extend(PyPDFLoader(f.name).load())
        except Exception as e:
            print(f"Error: {e}")

    if not docs:
        return "โŒ Could not extract text."

    # Split strictly at lines beginning with "Recipe:" so every chunk is
    # exactly one recipe (lookahead keeps the marker in the chunk).
    full_text = "\n".join(d.page_content for d in docs)
    raw_chunks = re.split(r'(?m)^(?=Recipe:)', full_text)

    split_docs = []
    for chunk in raw_chunks:
        # Strip the separator glyphs PDF extraction sometimes leaves behind.
        clean_chunk = re.sub(r'โธป+', '', chunk).strip()
        if len(clean_chunk) > 20:  # drop headers and blank fragments
            split_docs.append(Document(page_content=clean_chunk))

    try:
        Chroma.from_documents(split_docs, embed_model, persist_directory=CHROMA_PATH)
        return f"โœ… Bar library updated. Strictly split into {len(split_docs)} individual recipes."
    except Exception as e:
        return f"โŒ Database Error: {e}"

# --- BARTENDER LOGIC ---
def bartend(message, history, img_path, inventory):
    """Main chat handler: identify the bottle in the photo (if any), search
    the recipe store for matching cocktails, and append a reply to the chat.

    Args:
        message: the user's chat message (appended to history verbatim).
        history: "messages"-style chat history (list of role/content dicts);
            mutated in place and also returned.
        img_path: filesystem path of the uploaded photo, or falsy to skip
            vision and reuse the previous inventory.
        inventory: last identified spirit (value of the gr.State).

    Returns:
        tuple: (updated history, updated inventory string, list of crop
        images for the debug gallery).
    """
    debug_images = []
    
    if img_path:
        crops = get_bottle_crops(img_path)
        debug_images = crops 
        
        # SPEED FIX 1: We return to using the tight crop, discarding the heavy background!
        target_img = crops[0] if crops else Image.open(img_path).convert("RGB")
        
        def identify_spirit(image_input):
            # Ask the VLM to read the label off a single PIL image.
            # SPEED FIX 2: Aggressive squishing. 
            # We copy the image so we don't blur the gallery debug version
            fast_img = image_input.copy()
            if fast_img.mode != "RGB": fast_img = fast_img.convert("RGB")
            
            # Shrink down to a max of 384x384. This makes CPU math practically instant.
            fast_img.thumbnail((384, 384))
            
            prompt = "User: <image>\nRead the label. What is the specific brand and type of alcohol? Be precise.\nAssistant:"
            
            # Keep token limit at 15. The 'brain' (Chroma) handles the long text, the 'eyes' just need to read the brand name.
            out = vision_pipe(fast_img, prompt, generate_kwargs={"max_new_tokens": 15})
            text = out[0]['generated_text']
            # Prefer the text after the final "Assistant:" marker; otherwise
            # strip the prompt scaffolding and return what remains.
            if "Assistant:" in text: return text.split("Assistant:")[-1].strip()
            return text.replace("User: <image>", "").strip()

        try:
            # Pass 1: read the tight crop; drop any angle-bracket tags the
            # model emits and keep only the first sentence.
            inventory = identify_spirit(target_img)
            inventory = re.sub(r'<.*?>', '', inventory).strip().split('.')[0]
            print(f"๐Ÿ” Pass 1 Result: {inventory}")
            
            generic_terms = ["vodka", "gin", "rum", "tequila", "whiskey", "whisky", "bourbon", "brandy", "alcohol", "liquor", "spirit", "bottle", "drink"]
            
            # ONLY fallback to the heavy full image if the crop failed us
            if inventory.lower() in generic_terms or len(inventory) < 4:
                print("โš ๏ธ Result too generic. Trying FULL IMAGE...")
                full_img_result = identify_spirit(Image.open(img_path))
                full_img_result = re.sub(r'<.*?>', '', full_img_result).strip().split('.')[0]
                # Keep the longer answer on the assumption it names a brand.
                if len(full_img_result) > len(inventory):
                    inventory = full_img_result
                    print(f"โœ… Pass 2 Result: {inventory}")
                    
        except Exception as e:
            print(f"โŒ Vision Failed: {e}")
            inventory = "Unknown Spirit"

    # RAG step: only query Chroma when vision produced a real spirit name
    # and the store has been created by ingest_recipes().
    recipe_context = ""
    if inventory and inventory not in ["Empty Shelf", "Unknown Spirit", ""]:
        try:
            if os.path.exists(CHROMA_PATH):
                vs = Chroma(persist_directory=CHROMA_PATH, embedding_function=embed_model)
                search_query = f"Cocktail recipe using {inventory}"
                
                # Fetch top 4 distinct recipes
                results = vs.similarity_search(search_query, k=4)
                recipe_context = "\n\n---\n\n".join([d.page_content for d in results])
        except Exception as e:
            print(f"Search error: {e}")

    # Compose the assistant reply based on what vision + retrieval produced.
    if inventory == "Unknown Spirit":
        response = "I'm having trouble reading that label. Check the 'Vision Debug' gallery belowโ€”is the crop clear?"
    elif recipe_context:
        response = f"I see you have **{inventory}**. Here are a few options from your collection:\n\n{recipe_context}"
    else:
        response = f"I see you have **{inventory}**! I don't have a specific recipe for that in the current library. Should I suggest a classic drink?"

    history.append({"role": "user", "content": message})
    history.append({"role": "assistant", "content": response})

    return history, inventory, debug_images

# --- UI LAYOUT ---
# Two-column Gradio app: left column = recipe ingestion + photo input,
# right column = the bartender chat.
with gr.Blocks() as demo:
    gr.Markdown("# ๐Ÿธ LocalAGI: The AI Sommelier")
    # Persists the last identified spirit between chat turns.
    inv_state = gr.State("Empty Shelf")
    
    with gr.Row():
        with gr.Column(scale=1):
            file_up = gr.File(label="1. Upload Recipe PDFs/TXTs", file_count="multiple")
            ingest_btn = gr.Button("๐Ÿ“ฅ Load Recipes into Memory")
            status = gr.Textbox(label="System Status", value="Ready")
            
            gr.Markdown("---")
            # type="filepath" hands bartend() a path so YOLO can re-read the file.
            img = gr.Image(type="filepath", label="2. Photo of your Bottle")
            
            with gr.Accordion("๐Ÿ” Vision Debug", open=False):
                debug_gallery = gr.Gallery(label="YOLO Crops", columns=2, height="auto")
            
        with gr.Column(scale=2):
            chatbot = gr.Chatbot(height=500, label="Bartender Chat")
            msg = gr.Textbox(label="3. Your Message", placeholder="Ask for a drink suggestion...")
            send_btn = gr.Button("Mix It Up", variant="primary")

    # Enter key and the button both trigger the same bartend pipeline.
    ingest_btn.click(ingest_recipes, file_up, status)
    msg.submit(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state, debug_gallery])
    send_btn.click(bartend, [msg, chatbot, img, inv_state], [chatbot, inv_state, debug_gallery])

if __name__ == "__main__":
    # NOTE(review): recent Gradio versions accept `theme=` on gr.Blocks(),
    # not on launch() — confirm the installed version supports this keyword.
    demo.launch(theme=gr.themes.Soft())