Nestor Eduardo Sanchez Ospina committed on
Commit
c627f4d
·
1 Parent(s): 25fab06

Add application file

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
2nd_clean_comida_dogs_enriched_multilingual.pkl ADDED
Binary file (11.2 kB). View file
 
2nd_clean_comida_dogs_filtered.pkl ADDED
Binary file (26.9 kB). View file
 
3rd_clean_comida_dogs_enriched_multilingual_2.pkl ADDED
Binary file (147 kB). View file
 
README.md CHANGED
@@ -1,13 +1,88 @@
1
- ---
2
- title: PetProject
3
- emoji: 📊
4
- colorFrom: red
5
- colorTo: green
6
- sdk: streamlit
7
- sdk_version: 1.42.2
8
- app_file: app.py
9
- pinned: false
10
- license: apache-2.0
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Dog Food Product QA System
2
+
3
+ A hybrid search and question-answering system for dog food products using BM25 and vector search (ChromaDB).
4
+
5
+ ## Core Components
6
+
7
+ ### Essential Scripts
8
+
9
+ 1. `qa_backend.py`
10
+ - Main backend implementation
11
+ - Contains DogFoodQASystem class
12
+ - Implements hybrid search and answer generation
13
+ - Key features: BM25 search, vector search, result combination, multilingual support
14
+
15
+ 2. `app.py`
16
+ - Streamlit frontend interface
17
+ - Displays search results with source indicators
18
+ - Shows statistics and product details
19
+ - Handles user queries in English and Spanish
20
+
21
+ ### Important Notebooks
22
+
23
+ 1. `test_retrieval.ipynb`
24
+ - Reference implementation for hybrid search
25
+ - Used for testing and validating search functionality
26
+ - Contains working examples of both BM25 and ChromaDB searches
27
+ ```python:Initial_trial_RAG/test_retrieval.ipynb
28
+ startLine: 64
29
+ endLine: 117
30
+ ```
31
+
32
+ 2. `diagnose_qa_system.ipynb`
33
+ - Diagnostic tool for system components
34
+ - Tests vector store connectivity
35
+ - Validates search result combination
36
+ - Useful for debugging and system verification
37
+
38
+ ### Supporting Files
39
+
40
+ - `bm25_index.pkl`: Serialized BM25 index and data
41
+ - `chroma_db/`: Directory containing ChromaDB vector store
42
+ - `.env`: Environment variables (OpenAI API key)
43
+
44
+ ### Less Critical Components
45
+
46
+ 1. `trial_enriching_description.ipynb`
47
+ - Used for initial data enrichment
48
+ - Not needed for regular system operation
49
+ - Reference for future data processing
50
+ ```python:Initial_trial_RAG/trial_enriching_description.ipynb
51
+ startLine: 26
52
+ endLine: 37
53
+ ```
54
+
55
+ ## System Architecture
56
+
57
+ 1. **Search Components**
58
+ - BM25 for keyword matching
59
+ - ChromaDB for semantic search
60
+ - Smart result combination with duplicate detection
61
+
62
+ 2. **Result Processing**
63
+ - Source tracking (BM25, Vector, or Both)
64
+ - Score preservation for transparency
65
+ - Metadata-aware result presentation
66
+
67
+ 3. **User Interface**
68
+ - Color-coded results by source:
69
+ - 🔵 Blue: BM25 results
70
+ - 🟢 Green: Vector results
71
+ - 🔄 Purple: Found by both sources
72
+ - Detailed statistics display
73
+ - Bilingual support (EN/ES)
74
+
75
+ ## Usage
76
+
77
+ 1. Start the application:
78
+
79
+ 2. Enter queries in English or Spanish
80
+ 3. View combined results with source indicators
81
+ 4. Check statistics for result distribution
82
+
83
+ ## Development Notes
84
+
85
+ - BM25 and Vector searches each return top 5 results
86
+ - Duplicates are automatically detected and merged
87
+ - All unique results are passed to the LLM for context
88
+ - Scores are displayed but not used for filtering
__pycache__/qa_backend.cpython-311.pyc ADDED
Binary file (16.4 kB). View file
 
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from qa_backend import DogFoodQASystem
3
+ import time
4
+ from typing import Dict, Any, List
5
+ import logging
6
+
7
# Configure logging for the whole app (timestamped INFO-level messages).
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Configure page settings — must run before any other Streamlit call.
st.set_page_config(
    page_title="Dog Food Advisor",
    page_icon="🐕",
    layout="wide"
)

# Custom CSS for better styling. Injected via st.markdown with
# unsafe_allow_html=True; the classes below are referenced by the
# HTML snippets rendered in display_search_result()/main().
st.markdown("""
    <style>
    .stAlert {
        padding: 1rem;
        margin: 1rem 0;
        border-radius: 0.5rem;
    }
    .search-result {
        padding: 1rem;
        margin: 0.5rem 0;
        border: 1px solid #ddd;
        border-radius: 0.5rem;
    }
    .debug-info {
        font-size: small;
        color: gray;
        padding: 0.5rem;
        background-color: #f0f0f0;
        border-radius: 0.3rem;
    }
    </style>
""", unsafe_allow_html=True)
40
+
41
@st.cache_resource
def load_qa_system() -> tuple[DogFoodQASystem, Any]:
    """Initialize the QA system once and cache it across Streamlit reruns.

    Returns:
        tuple: (qa_system, vector_store_status). The status is whatever
        diagnose_vector_store() reports (used as a truthy flag by main()
        — presumably a bool; confirm against qa_backend).

    Note:
        The previous return annotation claimed ``DogFoodQASystem`` but the
        function has always returned a 2-tuple; the annotation now matches
        the actual value callers unpack.
    """
    qa_system = DogFoodQASystem()
    # Probe the vector store up front so the sidebar can warn the user
    # when only BM25 keyword search is available.
    vector_store_status = qa_system.diagnose_vector_store()
    return qa_system, vector_store_status
48
+
49
def display_search_result(result: Dict[str, Any], index: int) -> None:
    """Display a single search result with enhanced source and score information.

    Args:
        result: One combined hit from the hybrid search. Assumed keys, from
            usage below — confirm against the combining step in qa_backend:
            'sources' (list containing 'BM25' and/or 'Vector'),
            'original_scores' (dict of per-source raw scores),
            'metadata' (product fields) and 'text' (full description).
        index: Zero-based position in the rendered list (shown 1-based).
    """
    with st.container():
        # Pick badge/color from the result's provenance; when both engines
        # found the product, both raw scores are shown side by side.
        sources = result['sources']
        if len(sources) > 1:
            source_color = "#9C27B0"  # Purple for both sources
            source_badge = "🔄 Found in Both Sources"
            scores_text = f"BM25: {result['original_scores']['BM25']:.3f}, Vector: {result['original_scores']['Vector']:.3f}"
        elif 'Vector' in sources:
            source_color = "#2E7D32"  # Green for Vector
            source_badge = "🟢 Vector Search"
            scores_text = f"Score: {result['original_scores']['Vector']:.3f}"
        else:
            source_color = "#1565C0"  # Blue for BM25
            source_badge = "🔵 BM25 Search"
            scores_text = f"Score: {result['original_scores']['BM25']:.3f}"

        # Header card (styled by the .search-result CSS class injected at
        # module import time).
        st.markdown(f"""
        <div class="search-result">
            <h4 style="color: {source_color}">
                Result {index + 1} | {source_badge} | {scores_text}
            </h4>
        </div>
        """, unsafe_allow_html=True)

        # Two-column product detail layout.
        col1, col2 = st.columns(2)
        with col1:
            st.write("**Product Details:**")
            st.write(f"• Brand: {result['metadata']['brand']}")
            st.write(f"• Product: {result['metadata']['product_name']}")
            st.write(f"• Price: ${result['metadata']['price']:.2f}")

        with col2:
            st.write("**Additional Information:**")
            st.write(f"• Weight: {result['metadata']['weight']}kg")
            st.write(f"• Dog Type: {result['metadata']['dog_type']}")
            # 'reviews' is optional in the metadata, hence the guard.
            if 'reviews' in result['metadata']:
                st.write(f"• Reviews: {result['metadata']['reviews']}")

        st.markdown("**Description:**")
        st.write(result['text'])
        st.markdown("---")
94
+
95
def display_search_stats(results: List[Dict[str, Any]]) -> None:
    """Render summary metrics describing where the search results came from."""
    # Tally result provenance in a single pass over the hits.
    both_count = vector_count = bm25_count = 0
    for item in results:
        srcs = item['sources']
        if len(srcs) > 1:
            both_count += 1
        elif srcs == ['Vector']:
            vector_count += 1
        elif srcs == ['BM25']:
            bm25_count += 1

    st.markdown("#### Search Results Statistics")
    # One metric per column, laid out left to right.
    columns = st.columns(4)
    specs = [
        ("Total Unique Results", len(results), None),
        ("Found in Both Sources", both_count, "🔄"),
        ("Vector Only", vector_count, "🟢"),
        ("BM25 Only", bm25_count, "🔵"),
    ]
    for column, (label, value, delta) in zip(columns, specs):
        with column:
            if delta is None:
                st.metric(label, value)
            else:
                st.metric(label, value, delta)
112
+
113
def main():
    """Streamlit entry point: render the UI and handle one query/answer cycle."""
    # Header
    st.title("🐕 Dog Food Advisor")
    st.markdown("""
    Ask questions about dog food products in English or Spanish.
    The system will provide relevant recommendations based on your query.
    """)

    # Initialize QA system with diagnostics (cached across reruns via
    # st.cache_resource, so the indices load only once per process).
    qa_system, vector_store_status = load_qa_system()

    # Sidebar: connectivity status for the vector store.
    with st.sidebar:
        st.markdown("### System Status")
        if vector_store_status:
            st.success("Vector Store: Connected")
        else:
            st.error("Vector Store: Not Connected")
            st.warning("Only BM25 search will be available")

    # Query input
    query = st.text_input(
        "Enter your question:",
        placeholder="e.g., 'What's the best food for puppies?' or '¿Cuál es la mejor comida para perros adultos?'"
    )

    # Explicit search button — queries only run on click, not on every rerun.
    search_button = st.button("Search")

    if query and search_button:
        with st.spinner("Processing your query..."):
            try:
                # Process the query and time it for the debug footer.
                start_time = time.time()
                result = qa_system.process_query(query)
                processing_time = time.time() - start_time

                # Display answer
                st.markdown("### Answer")
                st.write(result["answer"])

                # Display search stats
                display_search_stats(result["search_results"])

                # Debug footer: detected language and latency.
                st.markdown(f"""
                <div class='debug-info'>
                    Language detected: {result['language']} |
                    Processing time: {processing_time:.2f}s
                </div>
                """, unsafe_allow_html=True)

                # Individual results are collapsed by default to keep the
                # answer prominent.
                with st.expander("View Relevant Products", expanded=False):
                    st.markdown("### Search Results")
                    for i, search_result in enumerate(result["search_results"]):
                        display_search_result(search_result, i)

            except Exception as e:
                # Surface the failure in the UI; full traceback goes to logs.
                st.error(f"An error occurred: {str(e)}")
                logging.error(f"Error processing query: {str(e)}", exc_info=True)

    # Footer with usage instructions.
    st.markdown("---")
    with st.expander("Usage Tips"):
        st.markdown("""
        - Ask questions in English or Spanish
        - Be specific about your dog's needs (age, size, special requirements)
        - Include price preferences (e.g., 'affordable', 'premium')
        - Results are ranked by relevance and include price, brand, and product details
        - Results are color-coded:
          - 🔵 Blue: BM25 Search Results
          - 🟢 Green: Vector Search Results
        """)


if __name__ == "__main__":
    main()
bm25_index.pkl ADDED
Binary file (280 kB). View file
 
checkpoints/checkpoint_batch_5.pkl ADDED
Binary file (98.6 kB). View file
 
create_vector_stores.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import List, Dict, Any
3
+ import pickle
4
+ import nltk
5
+ from nltk.tokenize import word_tokenize
6
+ from rank_bm25 import BM25Okapi
7
+ import chromadb
8
+ from chromadb.config import Settings
9
+ from openai import OpenAI
10
+ import pandas as pd
11
+ from tqdm import tqdm
12
+ from dotenv import load_dotenv
13
+ import os
14
+
15
# Configure logging: timestamped INFO-level messages for the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
20
+
21
class VectorStoreCreator:
    """Create and manage vector stores for dog food product search.

    Builds two retrieval indices over the bilingual product descriptions:
    a BM25 keyword index (pickled to disk) and a ChromaDB collection of
    OpenAI embeddings.
    """

    def __init__(self, data_path: str):
        """
        Initialize the VectorStoreCreator.

        Args:
            data_path: Path to the pickle file containing the product data.
                The frame must provide 'description_en'/'description_es'
                plus the metadata columns read in prepare_data().
        """
        # Load environment variables (OpenAI API key from .env).
        load_dotenv()

        # OpenAI client; reads OPENAI_API_KEY from the environment.
        self.client = OpenAI()

        # NLTK tokenizer data needed by word_tokenize.
        nltk.download('punkt', quiet=True)

        # Load data
        self.df = pd.read_pickle(data_path)

        # Index state, populated by prepare_data()/create_* methods.
        self.bm25_model = None
        self.chroma_collection = None
        self.chunks: List[str] = []
        self.metadata: List[Dict[str, Any]] = []

    def prepare_data(self) -> None:
        """Build the text chunks and per-product metadata used by both indices."""
        logging.info("Preparing data for vector stores...")

        total_rows = len(self.df)
        logging.info(f"Total rows in DataFrame: {total_rows}")

        for _, row in self.df.iterrows():
            # One chunk per product: English + Spanish descriptions combined,
            # so a single index serves queries in either language.
            combined_text = f"{row['description_en']} {row['description_es']}"
            self.chunks.append(combined_text)

            # Metadata kept as primitives (floats/strings) — ChromaDB
            # metadata values must be scalar types.
            metadata = {
                "product_name": row["product_name"],
                "brand": row["brand"],
                "dog_type": row["dog_type"],
                "food_type": row["food_type"],
                "weight": float(row["weight"]),
                "price": float(row["price"]),
                # Missing review counts are stored as 0.0 rather than NaN.
                "reviews": float(row["reviews"]) if pd.notna(row["reviews"]) else 0.0
            }
            self.metadata.append(metadata)

        logging.info(f"Total chunks created: {len(self.chunks)}")
        if len(self.chunks) != total_rows:
            logging.warning(f"Mismatch between DataFrame rows ({total_rows}) and chunks created ({len(self.chunks)})")

        if self.chunks:
            logging.info(f"Sample of first chunk: {self.chunks[0][:200]}...")

    def create_bm25_index(self, save_path: str = "bm25_index.pkl") -> None:
        """
        Create and save the BM25 index.

        Args:
            save_path: Path for the pickled bundle (model + chunks +
                metadata) consumed by the search backend.
        """
        logging.info("Creating BM25 index...")

        # Lowercase before tokenizing so BM25 matching is case-insensitive.
        tokenized_chunks = [word_tokenize(chunk.lower()) for chunk in self.chunks]

        self.bm25_model = BM25Okapi(tokenized_chunks)

        # Persist chunks/metadata alongside the model so BM25 hits can be
        # mapped back to products at query time.
        with open(save_path, 'wb') as f:
            pickle.dump({
                'model': self.bm25_model,
                'chunks': self.chunks,
                'metadata': self.metadata
            }, f)

        logging.info(f"BM25 index saved to {save_path}")

    def create_chroma_db(self, db_path: str = "chroma_db") -> None:
        """
        Create the ChromaDB vector database.

        Args:
            db_path: Directory in which ChromaDB persists the store.

        NOTE(review): get_or_create_collection means re-running against an
        existing store will try to re-add the same ids — confirm whether a
        fresh build should delete the collection first.
        """
        logging.info("Creating ChromaDB database...")

        # Persistent client (new chromadb client API).
        client = chromadb.PersistentClient(path=db_path)

        self.chroma_collection = client.get_or_create_collection(
            name="dog_food_descriptions"
        )

        batch_size = 10
        for i in tqdm(range(0, len(self.chunks), batch_size)):
            batch_chunks = self.chunks[i:i + batch_size]
            batch_metadata = self.metadata[i:i + batch_size]
            batch_ids = [str(idx) for idx in range(i, min(i + batch_size, len(self.chunks)))]

            # Embed the whole batch in ONE API request instead of one request
            # per chunk: the embeddings endpoint accepts a list input and
            # returns one embedding per item (each tagged with its input
            # index), cutting the request count by ~batch_size.
            response = self.client.embeddings.create(
                model="text-embedding-ada-002",
                input=batch_chunks
            )
            embeddings = [
                item.embedding
                for item in sorted(response.data, key=lambda d: d.index)
            ]

            self.chroma_collection.add(
                embeddings=embeddings,
                metadatas=batch_metadata,
                documents=batch_chunks,
                ids=batch_ids
            )

        logging.info(f"ChromaDB saved to {db_path}")
150
+
151
def main():
    """Build both retrieval indices (BM25 + ChromaDB) from the enriched dataset."""
    try:
        # Source dataset produced by the enrichment pipeline.
        source_file = "3rd_clean_comida_dogs_enriched_multilingual_2.pkl"
        builder = VectorStoreCreator(source_file)

        # Chunk + metadata preparation, then both indices.
        builder.prepare_data()
        builder.create_bm25_index()
        builder.create_chroma_db()

        logging.info("Vector stores created successfully!")

    except Exception as err:
        # Log, then re-raise so the process exits non-zero.
        logging.error(f"An error occurred: {err}")
        raise


if __name__ == "__main__":
    main()
data_cleansing/2nd_clean_comida_dogs_filtered.pkl ADDED
Binary file (26.9 kB). View file
 
data_cleansing/clean_comida_dogs_categoria_cleaned.pkl ADDED
Binary file (89.4 kB). View file
 
diagnostic_notebook.ipynb ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import logging\n",
10
+ "from qa_backend import DogFoodQASystem\n",
11
+ "\n",
12
+ "# Configure logging to show everything\n",
13
+ "logging.basicConfig(\n",
14
+ " level=logging.INFO,\n",
15
+ " format='%(asctime)s - %(levelname)s - %(message)s'\n",
16
+ ")"
17
+ ]
18
+ },
19
+ {
20
+ "cell_type": "code",
21
+ "execution_count": 2,
22
+ "metadata": {},
23
+ "outputs": [
24
+ {
25
+ "name": "stderr",
26
+ "output_type": "stream",
27
+ "text": [
28
+ "2025-01-19 17:56:19,823 - INFO - Anonymized telemetry enabled. See https://docs.trychroma.com/telemetry for more information.\n"
29
+ ]
30
+ },
31
+ {
32
+ "name": "stdout",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "Initializing QA System...\n"
36
+ ]
37
+ },
38
+ {
39
+ "name": "stderr",
40
+ "output_type": "stream",
41
+ "text": [
42
+ "2025-01-19 17:56:20,074 - INFO - \n",
43
+ "Diagnosing Vector Store:\n",
44
+ "2025-01-19 17:56:20,082 - INFO - Collection name: dog_food_descriptions\n",
45
+ "2025-01-19 17:56:20,082 - INFO - Number of documents: 84\n"
46
+ ]
47
+ },
48
+ {
49
+ "name": "stdout",
50
+ "output_type": "stream",
51
+ "text": [
52
+ "\n",
53
+ "Running Vector Store Diagnostics...\n"
54
+ ]
55
+ },
56
+ {
57
+ "name": "stderr",
58
+ "output_type": "stream",
59
+ "text": [
60
+ "2025-01-19 17:56:21,233 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
61
+ "2025-01-19 17:56:21,259 - INFO - ✅ Vector store test query successful\n"
62
+ ]
63
+ }
64
+ ],
65
+ "source": [
66
+ "# Initialize the QA system\n",
67
+ "print(\"Initializing QA System...\")\n",
68
+ "qa_system = DogFoodQASystem()\n",
69
+ "\n",
70
+ "# Run diagnostics\n",
71
+ "print(\"\\nRunning Vector Store Diagnostics...\")\n",
72
+ "vector_store_status = qa_system.diagnose_vector_store()\n"
73
+ ]
74
+ },
75
+ {
76
+ "cell_type": "code",
77
+ "execution_count": 3,
78
+ "metadata": {},
79
+ "outputs": [
80
+ {
81
+ "name": "stdout",
82
+ "output_type": "stream",
83
+ "text": [
84
+ "\n",
85
+ "Testing with query: What's the best premium food for adult dogs?\n"
86
+ ]
87
+ },
88
+ {
89
+ "name": "stderr",
90
+ "output_type": "stream",
91
+ "text": [
92
+ "2025-01-19 17:56:37,332 - INFO - \n",
93
+ "==================================================\n",
94
+ "Starting hybrid search for query: What's the best premium food for adult dogs?\n",
95
+ "2025-01-19 17:56:37,335 - INFO - ChromaDB collection info:\n",
96
+ "2025-01-19 17:56:37,336 - INFO - - Number of documents: 84\n",
97
+ "2025-01-19 17:56:37,336 - INFO - - Collection name: dog_food_descriptions\n",
98
+ "2025-01-19 17:56:37,341 - INFO - \n",
99
+ "BM25 Search Results:\n",
100
+ "2025-01-19 17:56:37,342 - INFO - Found 5 results\n",
101
+ "2025-01-19 17:56:37,342 - INFO - \n",
102
+ "Generating embedding for query...\n",
103
+ "2025-01-19 17:56:38,091 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings \"HTTP/1.1 200 OK\"\n",
104
+ "2025-01-19 17:56:38,093 - INFO - Embedding generated successfully. Dimension: 1536\n",
105
+ "2025-01-19 17:56:38,094 - INFO - \n",
106
+ "Performing ChromaDB search...\n",
107
+ "2025-01-19 17:56:38,099 - INFO - ChromaDB raw results:\n",
108
+ "2025-01-19 17:56:38,100 - INFO - - Number of results: 5\n",
109
+ "2025-01-19 17:56:38,100 - INFO - - Keys in results: dict_keys(['ids', 'distances', 'metadatas', 'embeddings', 'documents', 'uris', 'data'])\n",
110
+ "2025-01-19 17:56:38,100 - INFO - \n",
111
+ "Vector result 1:\n",
112
+ "2025-01-19 17:56:38,100 - INFO - - Score: 0.6637\n",
113
+ "2025-01-19 17:56:38,100 - INFO - - Text preview: **Introducing Dowolf Snack Para Perro Galletas - The Premium Treat for Your Adult Dog!**\n",
114
+ "\n",
115
+ "**Brand:**...\n",
116
+ "2025-01-19 17:56:38,101 - INFO - \n",
117
+ "Vector result 2:\n",
118
+ "2025-01-19 17:56:38,101 - INFO - - Score: 0.6391\n",
119
+ "2025-01-19 17:56:38,101 - INFO - - Text preview: ### Dogourmet Alimento Seco Para Perro Adulto Carne Parrilla 4kg\n",
120
+ "\n",
121
+ "**Elevate Your Dog’s Dining Experi...\n",
122
+ "2025-01-19 17:56:38,102 - INFO - \n",
123
+ "Vector result 3:\n",
124
+ "2025-01-19 17:56:38,102 - INFO - - Score: 0.6388\n",
125
+ "2025-01-19 17:56:38,102 - INFO - - Text preview: ### Discover the Ultimate in Canine Nutrition with Chunky Alimento Seco Para Perro Adulto Nuggets De...\n",
126
+ "2025-01-19 17:56:38,103 - INFO - \n",
127
+ "Vector result 4:\n",
128
+ "2025-01-19 17:56:38,103 - INFO - - Score: 0.6338\n",
129
+ "2025-01-19 17:56:38,103 - INFO - - Text preview: **Unleash the Gourmet Experience with Dogourmet Alimento Seco Para Perros Pavo Y Pollo**\n",
130
+ "\n",
131
+ "Elevate yo...\n",
132
+ "2025-01-19 17:56:38,104 - INFO - \n",
133
+ "Vector result 5:\n",
134
+ "2025-01-19 17:56:38,104 - INFO - - Score: 0.6328\n",
135
+ "2025-01-19 17:56:38,104 - INFO - - Text preview: **Introducing Chunky Snack Para Perro Bombonera Deli Dent – The Ultimate Gourmet Snack for Adult Dog...\n",
136
+ "2025-01-19 17:56:38,105 - INFO - \n",
137
+ "Processed 5 vector results\n",
138
+ "2025-01-19 17:56:38,105 - INFO - \n",
139
+ "Final results distribution:\n",
140
+ "2025-01-19 17:56:38,105 - INFO - - BM25 results: 5\n",
141
+ "2025-01-19 17:56:38,106 - INFO - - Vector results: 0\n",
142
+ "2025-01-19 17:56:38,106 - INFO - ==================================================\n",
143
+ "2025-01-19 17:56:39,662 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions \"HTTP/1.1 200 OK\"\n"
144
+ ]
145
+ },
146
+ {
147
+ "name": "stdout",
148
+ "output_type": "stream",
149
+ "text": [
150
+ "\n",
151
+ "Results Distribution:\n",
152
+ "- BM25 Results: 5\n",
153
+ "- Vector Results: 0\n"
154
+ ]
155
+ }
156
+ ],
157
+ "source": [
158
+ "# Test with a sample query\n",
159
+ "test_query = \"What's the best premium food for adult dogs?\"\n",
160
+ "print(f\"\\nTesting with query: {test_query}\")\n",
161
+ "\n",
162
+ "result = qa_system.process_query(test_query)\n",
163
+ "\n",
164
+ "# Display results statistics\n",
165
+ "bm25_count = sum(1 for r in result['search_results'] if r['source'] == 'BM25')\n",
166
+ "vector_count = sum(1 for r in result['search_results'] if r['source'] == 'Vector')\n",
167
+ "\n",
168
+ "print(f\"\\nResults Distribution:\")\n",
169
+ "print(f\"- BM25 Results: {bm25_count}\")\n",
170
+ "print(f\"- Vector Results: {vector_count}\")"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": null,
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": []
179
+ }
180
+ ],
181
+ "metadata": {
182
+ "kernelspec": {
183
+ "display_name": "chats_langchain",
184
+ "language": "python",
185
+ "name": "python3"
186
+ },
187
+ "language_info": {
188
+ "codemirror_mode": {
189
+ "name": "ipython",
190
+ "version": 3
191
+ },
192
+ "file_extension": ".py",
193
+ "mimetype": "text/x-python",
194
+ "name": "python",
195
+ "nbconvert_exporter": "python",
196
+ "pygments_lexer": "ipython3",
197
+ "version": "3.11.5"
198
+ }
199
+ },
200
+ "nbformat": 4,
201
+ "nbformat_minor": 2
202
+ }
enriching_description.py ADDED
@@ -0,0 +1,237 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import logging
from typing import Dict, Any
import time
import os
import glob

from tqdm import tqdm
import openai
from openai import OpenAI
import pandas as pd
from dotenv import load_dotenv, find_dotenv
from langsmith import traceable
from langsmith import Client
from langsmith.wrappers import wrap_openai
+
14
+ # Configure logging
15
+ logging.basicConfig(
16
+ level=logging.INFO,
17
+ format='%(asctime)s - %(levelname)s - %(message)s'
18
+ )
19
+
20
+ # Load environment variables
21
+ _ = load_dotenv(find_dotenv())
22
+
23
+ # Initialize LangSmith client
24
+ langsmith_client = Client()
25
+ # Wrap OpenAI client with LangSmith
26
+ openai = wrap_openai(openai)
27
+
28
@traceable(run_type="chain")
def create_product_prompt(row: pd.Series, language: str) -> str:
    """
    Create a detailed prompt for product description generation.

    Args:
        row: DataFrame row containing product information. Must supply the
            template fields brand, product_name, dog_type, food_type,
            weight and price (price must be numeric for the ':.2f' spec).
        language: Target language ('en' or 'es').

    Returns:
        str: Formatted prompt for the LLM.

    Raises:
        KeyError: If `language` is not 'en'/'es' or the row lacks one of
            the template fields.
    """
    # One template per supported language; both share the same structure so
    # the generated EN/ES descriptions stay comparable.
    base_prompts = {
        'en': """Create a compelling and detailed marketing description for a premium dog food product.
    Include the following information and expand with your knowledge:

    • Brand: {brand}
    • Product Name: {product_name}
    • Specifically designed for: {dog_type}
    • Type: {food_type}
    • Package Size: {weight} kg
    • Price Point: ${price:.2f}

    Focus on:
    1. Key nutritional benefits
    2. Quality of ingredients
    3. Health advantages
    4. Why it's perfect for the specified dog type
    5. Value proposition

    Make it engaging and persuasive while maintaining accuracy.""",

        'es': """Crea una descripción comercial detallada y convincente para un producto premium de alimentación canina.
    Incluye la siguiente información y expándela con tu conocimiento:

    • Marca: {brand}
    • Nombre del Producto: {product_name}
    • Diseñado específicamente para: {dog_type}
    • Tipo: {food_type}
    • Tamaño del Paquete: {weight} kg
    • Precio: ${price:.2f}

    Enfócate en:
    1. Beneficios nutricionales clave
    2. Calidad de los ingredientes
    3. Ventajas para la salud
    4. Por qué es perfecto para el tipo de perro especificado
    5. Propuesta de valor

    Hazlo atractivo y persuasivo mientras mantienes la precisión."""
    }
    # Substitute the product fields; extra row columns are ignored by
    # str.format, missing ones raise KeyError.
    prompt = base_prompts[language].format(**row.to_dict())
    return prompt
81
+
82
@traceable(run_type="chain")
def generate_description(row: pd.Series, language: str, retry_attempts: int = 3) -> str:
    """
    Generate a product description via the chat API, retrying on failure.

    Transient API errors are retried with exponential backoff; once every
    attempt has failed, an error *string* (not an exception) is returned so
    batch processing can continue past a bad row.

    Args:
        row: DataFrame row containing product information.
        language: Target language ('en' or 'es').
        retry_attempts: Number of attempts before giving up.

    Returns:
        str: Generated description, or an error message after repeated failure.
    """
    prompt = create_product_prompt(row, language)
    last_attempt = retry_attempts - 1

    for attempt in range(retry_attempts):
        try:
            completion = openai.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150,
                temperature=0.7,
                presence_penalty=0.3,
                frequency_penalty=0.3
            )
        except Exception as e:
            logging.error(f"Attempt {attempt + 1} failed: {str(e)}")
            if attempt == last_attempt:
                return f"Error generating {language} description: {e}"
            time.sleep(2 ** attempt)  # Exponential backoff between retries
        else:
            return completion.choices[0].message.content.strip()
114
+
115
def save_checkpoint(df: pd.DataFrame, batch_num: int, checkpoint_dir: str = 'checkpoints') -> None:
    """
    Persist the in-progress DataFrame so a crashed run can resume later.

    Args:
        df: DataFrame to checkpoint.
        batch_num: Current batch number (encoded into the file name).
        checkpoint_dir: Directory for checkpoint files; created if absent.
    """
    # Ensure the target directory exists before writing.
    os.makedirs(checkpoint_dir, exist_ok=True)

    target = os.path.join(checkpoint_dir, f'checkpoint_batch_{batch_num}.pkl')
    df.to_pickle(target)
    logging.info(f"Saved checkpoint at batch {batch_num}")
130
+
131
+ def load_latest_checkpoint(checkpoint_dir: str = 'checkpoints') -> tuple[pd.DataFrame | None, int]:
132
+ """
133
+ Load the most recent checkpoint if it exists.
134
+
135
+ Args:
136
+ checkpoint_dir: Directory containing checkpoints
137
+
138
+ Returns:
139
+ tuple: (DataFrame or None, last completed batch number)
140
+ """
141
+ if not os.path.exists(checkpoint_dir):
142
+ return None, 0
143
+
144
+ checkpoint_files = glob.glob(os.path.join(checkpoint_dir, 'checkpoint_batch_*.pkl'))
145
+ if not checkpoint_files:
146
+ return None, 0
147
+
148
+ latest_checkpoint = max(checkpoint_files)
149
+ batch_num = int(latest_checkpoint.split('_')[-1].split('.')[0])
150
+
151
+ logging.info(f"Loading checkpoint from batch {batch_num}")
152
+ return pd.read_pickle(latest_checkpoint), batch_num
153
+
154
@traceable(run_type="chain")
def enrich_descriptions(df: pd.DataFrame, batch_size: int = 10, checkpoint_frequency: int = 5) -> pd.DataFrame:
    """
    Enrich DataFrame with product descriptions in both languages.

    Processes the frame in batches, generating English and Spanish
    descriptions per row, checkpointing every ``checkpoint_frequency``
    batches and resuming from the latest checkpoint when one exists.

    Args:
        df: Input DataFrame.
        batch_size: Number of items to process in each batch.
        checkpoint_frequency: Number of batches between checkpoints.

    Returns:
        pd.DataFrame: Enriched DataFrame with 'description_en' and
        'description_es' columns populated.

    Raises:
        ValueError: If the row count changes during processing.
    """
    logging.info("Starting description generation process...")

    initial_row_count = len(df)
    df = df.copy()

    # Try to resume from the latest checkpoint.
    checkpoint_df, last_batch = load_latest_checkpoint()
    if checkpoint_df is not None:
        df = checkpoint_df
        # Batch k (1-based) covers rows [(k - 1) * batch_size, k * batch_size),
        # so a checkpoint at batch `last_batch` means rows up to
        # last_batch * batch_size - 1 are done and processing resumes at
        # last_batch * batch_size. The previous code resumed at
        # (last_batch + 1) * batch_size, silently skipping one full batch of
        # rows on every resume.
        start_idx = last_batch * batch_size
        logging.info(f"Resuming from batch {last_batch + 1}")
    else:
        start_idx = 0

    # batch_num continues the 1-based numbering used by the checkpoints.
    for batch_num, i in enumerate(tqdm(range(start_idx, len(df), batch_size)), start=last_batch + 1):
        batch = df.iloc[i:i + batch_size]

        df.loc[batch.index, 'description_en'] = batch.apply(
            lambda row: generate_description(row, 'en'), axis=1
        )
        df.loc[batch.index, 'description_es'] = batch.apply(
            lambda row: generate_description(row, 'es'), axis=1
        )

        if batch_num % checkpoint_frequency == 0:
            save_checkpoint(df, batch_num)

        time.sleep(1)  # Crude rate limiting between batches

    # Validate row counts and description completeness.
    final_row_count = len(df)
    if final_row_count != initial_row_count:
        raise ValueError(f"Row count mismatch: Started with {initial_row_count} rows, ended with {final_row_count} rows")

    # Check for missing descriptions (API failures leave error strings, but
    # skipped/unprocessed rows would be NaN).
    missing_en = df['description_en'].isna().sum()
    missing_es = df['description_es'].isna().sum()
    if missing_en > 0 or missing_es > 0:
        logging.warning(f"Missing descriptions detected: English: {missing_en}, Spanish: {missing_es}")

    return df
210
+
211
def main():
    """Run the enrichment pipeline end to end and persist the result."""
    try:
        # Load the filtered source dataset.
        file_path = '2nd_clean_comida_dogs_filtered.pkl'
        data = pd.read_pickle(file_path)
        initial_count = len(data)
        logging.info(f"Loaded dataset with {initial_count} records")

        # Generate EN/ES descriptions for every product.
        enriched_data = enrich_descriptions(data)

        # Guard against silent row loss before writing the output.
        if len(enriched_data) != initial_count:
            raise ValueError(f"Row count mismatch: Original had {initial_count} rows, enriched has {len(enriched_data)} rows")

        output_path = '3rd_clean_comida_dogs_enriched_multilingual_2.pkl'
        enriched_data.to_pickle(output_path)
        logging.info(f"Enriched dataset saved to {output_path}")

    except Exception as e:
        # Log, then re-raise so the process exits non-zero.
        logging.error(f"An error occurred: {e}")
        raise


if __name__ == "__main__":
    main()
qa_backend.py ADDED
@@ -0,0 +1,296 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Dict, List, Any
3
+ import pickle
4
+ import chromadb
5
+ from chromadb.config import Settings
6
+ from openai import OpenAI
7
+ import numpy as np
8
+ from nltk.tokenize import word_tokenize
9
+ from dotenv import load_dotenv
10
+ import os
11
+ from langsmith import traceable
12
+ from langsmith import Client
13
+ from langsmith.wrappers import wrap_openai
14
+
15
# Configure logging
# Root-level logging so every log line in this module carries a timestamp and level.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Load environment variables
# Reads API credentials (e.g. OPENAI_API_KEY, LangSmith settings) from a local
# .env file before any client is constructed.
load_dotenv()

# Initialize LangSmith client
# NOTE(review): created but not referenced below — presumably needed so the
# @traceable decorators have an active client; confirm before removing.
langsmith_client = Client()
# Wrap OpenAI client with LangSmith
# All embedding/chat calls made through this wrapped client are traced in LangSmith.
openai = wrap_openai(OpenAI())
+
26
def detect_language(text: str) -> str:
    """
    Simple language detection for English/Spanish based on common words.

    Indicator words are matched against *whole words* of the input.
    (The previous substring check misfired on English text: e.g. 'por'
    matched inside 'corporate' and 'para' inside 'paragraph'.)

    Args:
        text: Input text to detect language

    Returns:
        str: 'es' for Spanish, 'en' for English
    """
    import re  # local import keeps this helper self-contained

    # Common Spanish words/characters
    spanish_indicators = {'qué', 'cuál', 'cómo', 'dónde', 'por', 'para', 'perro', 'comida',
                          'mejor', 'precio', 'barato', 'caro', 'cachorro', 'adulto'}

    # Tokenize on word characters (Unicode-aware, so accented words like
    # 'qué' survive) and lowercase for comparison.
    words = set(re.findall(r'\w+', text.lower()))

    # Any overlap with the Spanish lexicon classifies the query as Spanish;
    # otherwise default to English.
    return 'es' if words & spanish_indicators else 'en'
49
class DogFoodQASystem:
    """Question-answering system over dog-food product descriptions.

    Combines lexical retrieval (BM25 over pre-tokenized chunks) with
    semantic retrieval (OpenAI embeddings queried against a persisted
    ChromaDB collection), merges the two result sets, and generates a
    context-grounded answer with an LLM.
    """

    def __init__(self):
        """Initialize the QA system with vector stores and models."""
        self.load_stores()

    def load_stores(self) -> None:
        """Load BM25 and ChromaDB stores.

        Expects:
            - 'bm25_index.pkl': dict with 'model' (object exposing
              get_scores), and parallel lists 'chunks' and 'metadata'.
            - 'chroma_db/': persisted Chroma directory containing the
              'dog_food_descriptions' collection.
        """
        with open('bm25_index.pkl', 'rb') as f:
            self.bm25_data = pickle.load(f)

        self.chroma_client = chromadb.PersistentClient(path="chroma_db")
        self.collection = self.chroma_client.get_collection("dog_food_descriptions")

    @traceable(run_type="chain")
    def hybrid_search(self, query: str, top_k: int = 5) -> List[Dict[str, Any]]:
        """
        Hybrid search that gets top_k results from each source and combines unique results.

        Args:
            query: Free-text user query (English or Spanish).
            top_k: Number of results to fetch from EACH retriever before merging.

        Returns:
            List of result dicts ('score', 'text', 'metadata'), each annotated
            with 'sources' and 'original_scores' by the merge step.
        """
        logging.info(f"\n{'='*50}\nStarting hybrid search for query: {query}")

        # --- BM25 (lexical) search ---
        tokenized_query = word_tokenize(query.lower())
        bm25_scores = self.bm25_data['model'].get_scores(tokenized_query)
        # Descending sort of scores, truncated to the top_k indices.
        bm25_indices = np.argsort(bm25_scores)[::-1][:top_k]

        bm25_results = [
            {
                'score': float(bm25_scores[idx]),
                'text': self.bm25_data['chunks'][idx],
                'metadata': self.bm25_data['metadata'][idx],
                'source': 'BM25'
            }
            for idx in bm25_indices
        ]
        logging.info(f"Retrieved {len(bm25_results)} results from BM25")

        # --- Vector (semantic) search ---
        # A failure here (network, missing API key, empty store) degrades
        # gracefully to BM25-only results instead of aborting the query.
        try:
            embedding_response = openai.embeddings.create(
                model="text-embedding-ada-002",
                input=query
            )
            query_embedding = embedding_response.data[0].embedding

            chroma_results = self.collection.query(
                query_embeddings=[query_embedding],
                n_results=top_k,
                include=["documents", "metadatas", "distances"]
            )

            processed_vector_results = [
                {
                    # Chroma returns distances; 1 - distance gives a
                    # similarity-like score comparable within this source.
                    'score': float(1 - distance),
                    'text': doc,
                    'metadata': meta,
                    'source': 'Vector'
                }
                for doc, meta, distance in zip(
                    chroma_results['documents'][0],
                    chroma_results['metadatas'][0],
                    chroma_results['distances'][0]
                )
            ]
            logging.info(f"Retrieved {len(processed_vector_results)} results from Vector search")

        except Exception as e:
            logging.error(f"Error in vector search: {str(e)}", exc_info=True)
            processed_vector_results = []

        # Merge both lists, de-duplicating on chunk text.
        all_results = self._smart_combine_results(bm25_results, processed_vector_results, query)
        return all_results

    def _smart_combine_results(self, bm25_results: List[Dict], vector_results: List[Dict], query: str) -> List[Dict]:
        """
        Combine results from both sources, tracking duplicates and sources.

        Results are keyed by their chunk text; a chunk found by both
        retrievers appears once, with both sources and both original
        scores recorded.
        """
        logging.info("\nCombining search results...")

        # Keyed by chunk text so duplicates across sources collapse.
        combined_dict = {}

        # Process vector results first so their 'score' field wins when
        # a chunk is found by both sources.
        for result in vector_results:
            text = result['text']
            if text not in combined_dict:
                result['sources'] = ['Vector']
                result['original_scores'] = {'Vector': result['score']}
                combined_dict[text] = result
                logging.info(f"Added Vector result (score: {result['score']:.4f})")
            else:
                combined_dict[text]['sources'].append('Vector')
                combined_dict[text]['original_scores']['Vector'] = result['score']
                logging.info(f"Marked existing result as found by Vector (score: {result['score']:.4f})")

        # Process BM25 results
        for result in bm25_results:
            text = result['text']
            if text not in combined_dict:
                result['sources'] = ['BM25']
                result['original_scores'] = {'BM25': result['score']}
                combined_dict[text] = result
                logging.info(f"Added BM25 result (score: {result['score']:.4f})")
            else:
                combined_dict[text]['sources'].append('BM25')
                combined_dict[text]['original_scores']['BM25'] = result['score']
                logging.info(f"Marked existing result as found by BM25 (score: {result['score']:.4f})")

        all_results = list(combined_dict.values())

        # Summary statistics for observability.
        total_results = len(all_results)
        duplicates = sum(1 for r in all_results if len(r['sources']) > 1)
        vector_only = sum(1 for r in all_results if r['sources'] == ['Vector'])
        bm25_only = sum(1 for r in all_results if r['sources'] == ['BM25'])

        logging.info(f"\nResults Statistics:")
        logging.info(f"- Total unique results: {total_results}")
        logging.info(f"- Duplicates (found by both): {duplicates}")
        logging.info(f"- Vector only: {vector_only}")
        logging.info(f"- BM25 only: {bm25_only}")

        return all_results

    def _adjust_score_with_metadata(self, result: Dict, query: str) -> float:
        """Adjust search score based on metadata relevance.

        NOTE(review): not currently called by hybrid_search in this file;
        kept because external notebooks may use it — confirm before removing.
        """
        base_score = result['score']
        metadata = result['metadata']

        # Multiplicative boost; starts neutral.
        boost = 1.0

        # Boost based on reviews (social proof)
        if metadata.get('reviews', 0) > 20:
            boost *= 1.2

        # Boost based on price range mentions (English/Spanish synonyms).
        query_lower = query.lower()
        if ('affordable' in query_lower or 'barato' in query_lower) and metadata.get('price', 0) < 50:
            boost *= 1.3
        # Fix: the original duplicated 'premium' twice; 'caro' is the Spanish
        # counterpart, mirroring 'barato' in the affordable branch above.
        elif ('premium' in query_lower or 'caro' in query_lower) and metadata.get('price', 0) > 100:
            boost *= 1.3

        # Boost based on specific dog type matches
        dog_types = ['puppy', 'adult', 'senior', 'cachorro', 'adulto']
        for dog_type in dog_types:
            if dog_type in query_lower and dog_type in metadata.get('dog_type', '').lower():
                boost *= 1.25
                break

        return base_score * boost

    @traceable(run_type="chain")
    def generate_answer(self, query: str, search_results: List[Dict]) -> str:
        """Generate a natural language answer based on search results.

        The answer is produced in the query's detected language ('es'/'en'),
        grounded only in the retrieved product context.
        """
        # Detect query language
        query_lang = detect_language(query)

        # Prepare context from search results
        context = self._prepare_context(search_results)

        # Create prompt based on language
        prompts = {
            'es': """Eres un experto en nutrición canina. Responde a la pregunta utilizando solo el contexto proporcionado.
            Si no puedes responder con el contexto dado, indícalo. Incluye información sobre precios y características
            específicas de los productos cuando sea relevante.""",
            'en': """You are a dog nutrition expert. Answer the question using only the provided context.
            If you cannot answer from the given context, say so. Include pricing and specific product
            features when relevant."""
        }
        # Fix: the original fell back to the literal string 'en' as the system
        # prompt for unknown language codes; fall back to the English prompt.
        system_prompt = prompts.get(query_lang, prompts['en'])

        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"}
            ],
            temperature=0.7,
            max_tokens=300
        )

        return response.choices[0].message.content.strip()

    def _prepare_context(self, search_results: List[Dict]) -> str:
        """Prepare search results as context for the LLM.

        Each product is rendered as a labeled block; blocks are separated
        by '---' so the model can distinguish products.
        """
        context_parts = []
        for result in search_results:
            metadata = result['metadata']
            context_parts.append(
                f"Product: {metadata['product_name']}\n"
                f"Brand: {metadata['brand']}\n"
                f"Price: ${metadata['price']}\n"
                f"Weight: {metadata['weight']}kg\n"
                f"Dog Type: {metadata['dog_type']}\n"
                f"Description: {result['text']}\n"
            )
        return "\n---\n".join(context_parts)

    @traceable(run_type="chain")
    def process_query(self, query: str) -> Dict[str, Any]:
        """Process a user query and return both search results and answer.

        Returns:
            Dict with 'answer' (str), 'search_results' (list), and
            'language' ('es' or 'en').
        """
        search_results = self.hybrid_search(query)
        answer = self.generate_answer(query, search_results)

        return {
            "answer": answer,
            "search_results": search_results,
            "language": detect_language(query)
        }

    def diagnose_vector_store(self):
        """Diagnose the vector store setup.

        Logs collection name/size and runs one embedding + query round-trip.

        Returns:
            bool: True if the store is populated and a test query returns
            at least one result; False otherwise (including on errors).
        """
        try:
            logging.info("\nDiagnosing Vector Store:")
            collection_info = self.collection.get()

            # Basic collection info
            doc_count = len(collection_info['ids'])
            logging.info(f"Collection name: {self.collection.name}")
            logging.info(f"Number of documents: {doc_count}")

            # Sample query test: embed a fixed string and query for 1 result.
            if doc_count > 0:
                test_query = "test query for diagnosis"
                test_embedding = openai.embeddings.create(
                    model="text-embedding-ada-002",
                    input=test_query
                ).data[0].embedding

                test_results = self.collection.query(
                    query_embeddings=[test_embedding],
                    n_results=1
                )

                if len(test_results['ids'][0]) > 0:
                    logging.info("✅ Vector store test query successful")
                    return True
                else:
                    logging.error("❌ Vector store returned no results for test query")
                    return False
            else:
                logging.error("❌ Vector store is empty")
                return False

        except Exception as e:
            logging.error(f"❌ Error accessing vector store: {str(e)}")
            return False
raw_data/clean_comida_dogs_categoria.pkl ADDED
Binary file (100 kB). View file
 
raw_data/veterinarias_processed.pkl ADDED
Binary file (12.6 kB). View file