Crcs1225 commited on
Commit
c135be2
Β·
1 Parent(s): 1333c38
Files changed (3) hide show
  1. database.py +53 -10
  2. generate_embeddings.py +49 -0
  3. main.py +26 -133
database.py CHANGED
@@ -14,18 +14,18 @@ class Database:
14
  self.client = motor.motor_asyncio.AsyncIOMotorClient(settings.MONGODB_URI)
15
  self.db = self.client[settings.DATABASE_NAME]
16
  self.collection = self.db[settings.COLLECTION_NAME]
 
17
 
18
  async def similarity_search(self, query_embedding: List[float], limit: int = 3) -> List[Dict]:
19
- """Search for similar products using vector similarity"""
20
  try:
21
- # First try vector search if index exists
22
  pipeline = [
23
  {
24
  "$vectorSearch": {
25
- "index": "vector_index", # Your vector index name
26
  "path": "embedding",
27
  "queryVector": query_embedding,
28
- "numCandidates": 100,
29
  "limit": limit
30
  }
31
  },
@@ -46,7 +46,7 @@ class Database:
46
  async for doc in cursor:
47
  results.append({
48
  "id": str(doc["_id"]),
49
- "content": f"Product: {doc.get('title', 'N/A')}. Description: {doc.get('product_description', 'N/A')}. Category: {doc.get('category', 'N/A')}. Price: {doc.get('final_price', 'N/A')}.",
50
  "source": doc.get('title', 'product_database'),
51
  "metadata": {
52
  "category": doc.get('category', 'N/A'),
@@ -56,12 +56,45 @@ class Database:
56
  })
57
  return results
58
  except Exception as e:
59
- print(f"Vector search failed, falling back to text search: {e}")
60
- # Fallback to text search if vector search fails
61
- return await self.search_by_category("tops", limit)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
 
63
  async def search_by_category(self, category: str, limit: int = 5) -> List[Dict]:
64
- """Search products by category (fallback if vector search fails)"""
65
  cursor = self.collection.find(
66
  {"category": {"$regex": category, "$options": "i"}}
67
  ).limit(limit)
@@ -70,7 +103,7 @@ class Database:
70
  async for doc in cursor:
71
  results.append({
72
  "id": str(doc["_id"]),
73
- "content": f"Product: {doc.get('title', 'N/A')}. Description: {doc.get('product_description', 'N/A')}. Category: {doc.get('category', 'N/A')}. Price: {doc.get('final_price', 'N/A')}.",
74
  "source": doc.get('title', 'product_database'),
75
  "metadata": {
76
  "category": doc.get('category', 'N/A'),
@@ -83,6 +116,16 @@ class Database:
83
  """Insert documents into the collection"""
84
  result = await self.collection.insert_many(documents)
85
  return [str(id) for id in result.inserted_ids]
 
 
 
 
 
 
 
 
 
 
86
 
87
  # Global database instance
88
  db = Database()
 
14
  self.client = motor.motor_asyncio.AsyncIOMotorClient(settings.MONGODB_URI)
15
  self.db = self.client[settings.DATABASE_NAME]
16
  self.collection = self.db[settings.COLLECTION_NAME]
17
+ print(f"βœ… Connected to MongoDB: {settings.DATABASE_NAME}.{settings.COLLECTION_NAME}")
18
 
19
  async def similarity_search(self, query_embedding: List[float], limit: int = 3) -> List[Dict]:
20
+ """Search for similar products using MongoDB Atlas Vector Search"""
21
  try:
 
22
  pipeline = [
23
  {
24
  "$vectorSearch": {
25
+ "index": "vector_index", # Make sure this matches your Atlas index name
26
  "path": "embedding",
27
  "queryVector": query_embedding,
28
+ "numCandidates": 150,
29
  "limit": limit
30
  }
31
  },
 
46
  async for doc in cursor:
47
  results.append({
48
  "id": str(doc["_id"]),
49
+ "content": self._create_product_content(doc),
50
  "source": doc.get('title', 'product_database'),
51
  "metadata": {
52
  "category": doc.get('category', 'N/A'),
 
56
  })
57
  return results
58
  except Exception as e:
59
+ print(f"❌ Vector search error: {e}")
60
+ # Fallback to text search
61
+ return await self.search_by_text("tops", limit)
62
+
63
+ def _create_product_content(self, doc: Dict) -> str:
64
+ """Create formatted product content for the LLM"""
65
+ content_parts = [
66
+ f"Product: {doc.get('title', 'N/A')}",
67
+ f"Description: {doc.get('product_description', 'N/A')}",
68
+ f"Category: {doc.get('category', 'N/A')}",
69
+ f"Price: β‚Ή{doc.get('final_price', 'N/A')}"
70
+ ]
71
+ return ". ".join(content_parts)
72
+
73
+ async def search_by_text(self, query: str, limit: int = 5) -> List[Dict]:
74
+ """Fallback text search if vector search fails"""
75
+ cursor = self.collection.find({
76
+ "$or": [
77
+ {"title": {"$regex": query, "$options": "i"}},
78
+ {"category": {"$regex": query, "$options": "i"}},
79
+ {"product_description": {"$regex": query, "$options": "i"}}
80
+ ]
81
+ }).limit(limit)
82
+
83
+ results = []
84
+ async for doc in cursor:
85
+ results.append({
86
+ "id": str(doc["_id"]),
87
+ "content": self._create_product_content(doc),
88
+ "source": doc.get('title', 'product_database'),
89
+ "metadata": {
90
+ "category": doc.get('category', 'N/A'),
91
+ "price": doc.get('final_price', 'N/A')
92
+ }
93
+ })
94
+ return results
95
 
96
  async def search_by_category(self, category: str, limit: int = 5) -> List[Dict]:
97
+ """Search products by category"""
98
  cursor = self.collection.find(
99
  {"category": {"$regex": category, "$options": "i"}}
100
  ).limit(limit)
 
103
  async for doc in cursor:
104
  results.append({
105
  "id": str(doc["_id"]),
106
+ "content": self._create_product_content(doc),
107
  "source": doc.get('title', 'product_database'),
108
  "metadata": {
109
  "category": doc.get('category', 'N/A'),
 
116
  """Insert documents into the collection"""
117
  result = await self.collection.insert_many(documents)
118
  return [str(id) for id in result.inserted_ids]
119
+
120
+ async def get_collection_stats(self):
121
+ """Get collection statistics"""
122
+ total_docs = await self.collection.count_documents({})
123
+ docs_with_embeddings = await self.collection.count_documents({"embedding": {"$exists": True}})
124
+ return {
125
+ "total_documents": total_docs,
126
+ "documents_with_embeddings": docs_with_embeddings,
127
+ "embedding_coverage": f"{(docs_with_embeddings/total_docs*100):.1f}%" if total_docs > 0 else "0%"
128
+ }
129
 
130
  # Global database instance
131
  db = Database()
generate_embeddings.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import time
3
+ from database import db
4
+ from rag_system import rag_pipeline
5
+
6
+ def build_content_string(doc: dict) -> str:
7
+ parts = []
8
+ if doc.get("title"):
9
+ parts.append(f"Title: {doc['title']}")
10
+ if doc.get("product_description"):
11
+ parts.append(f"Description: {doc['product_description']}")
12
+ if doc.get("category"):
13
+ parts.append(f"Category: {doc['category']}")
14
+
15
+ for key, value in doc.items():
16
+ if key in ["_id", "embedding", "title", "product_description", "category"]:
17
+ continue
18
+ if isinstance(value, (str, int, float)):
19
+ parts.append(f"{key}: {value}")
20
+
21
+ return ". ".join(str(p) for p in parts if p)
22
+
23
+ async def generate_and_store_embeddings():
24
+ await db.connect()
25
+ cursor = db.collection.find({"embedding": {"$exists": False}})
26
+ updated_count = 0
27
+ batch_size = 20
28
+
29
+ async for doc in cursor:
30
+ try:
31
+ content = build_content_string(doc)
32
+ if content.strip():
33
+ embedding = await rag_pipeline.get_embeddings([content])
34
+ await db.collection.update_one(
35
+ {"_id": doc["_id"]},
36
+ {"$set": {"embedding": embedding[0]}}
37
+ )
38
+ updated_count += 1
39
+ if updated_count % 10 == 0:
40
+ print(f"βœ… Processed {updated_count} documents...")
41
+ except Exception as e:
42
+ print(f"❌ Error processing {doc.get('_id')}: {e}")
43
+ continue
44
+ time.sleep(0.2) # small delay to avoid overload
45
+
46
+ print(f"πŸŽ‰ Embedding generation completed! {updated_count} documents updated.")
47
+
48
+ if __name__ == "__main__":
49
+ asyncio.run(generate_and_store_embeddings())
main.py CHANGED
@@ -32,30 +32,22 @@ embeddings_generated = False
32
 
33
  @app.on_event("startup")
34
  async def startup_event():
35
- """Run on application startup"""
36
- global embeddings_generated
37
  try:
38
  print("πŸš€ Starting RAG Chatbot API...")
39
 
40
  # Initialize database connection
41
  await db.connect()
42
 
43
- # Check if we need to generate embeddings
44
- total_docs = await db.collection.count_documents({})
45
- docs_with_embeddings = await db.collection.count_documents({"embedding": {"$exists": True}})
46
 
47
- print(f"πŸ“Š Database status: {total_docs} total documents, {docs_with_embeddings} with embeddings")
48
-
49
- # If we have documents but no embeddings, generate them
50
- if total_docs > 0 and docs_with_embeddings == 0:
51
- print("πŸ”„ No embeddings found. Starting automatic embedding generation...")
52
- await generate_embeddings_on_startup()
53
- embeddings_generated = True
54
- elif docs_with_embeddings > 0:
55
- print(f"βœ… Embeddings already exist for {docs_with_embeddings} documents")
56
- embeddings_generated = True
57
  else:
58
- print("ℹ️ No documents found in database")
59
 
60
  print("βœ… RAG Chatbot API is ready!")
61
 
@@ -63,74 +55,6 @@ async def startup_event():
63
  print(f"❌ Startup error: {e}")
64
  raise
65
 
66
- async def generate_embeddings_on_startup():
67
- """Generate embeddings for all documents on startup"""
68
- try:
69
- # Find all documents without embeddings
70
- cursor = db.collection.find({"embedding": {"$exists": False}})
71
- documents_without_embeddings = []
72
- async for doc in cursor:
73
- documents_without_embeddings.append(doc)
74
-
75
- if not documents_without_embeddings:
76
- print("βœ… All documents already have embeddings")
77
- return
78
-
79
- print(f"πŸ”„ Generating embeddings for {len(documents_without_embeddings)} documents...")
80
-
81
- updated_count = 0
82
- errors = 0
83
-
84
- # Process in smaller batches to avoid timeout
85
- batch_size = 50
86
- for i in range(0, len(documents_without_embeddings), batch_size):
87
- batch = documents_without_embeddings[i:i + batch_size]
88
-
89
- for doc in batch:
90
- try:
91
- # Create meaningful content for embedding
92
- content_parts = []
93
-
94
- # Include all relevant text fields
95
- if doc.get('title'):
96
- content_parts.append(f"Product: {doc['title']}")
97
- if doc.get('product_description'):
98
- content_parts.append(f"Description: {doc['product_description']}")
99
- if doc.get('category'):
100
- content_parts.append(f"Category: {doc['category']}")
101
-
102
- content = ". ".join(content_parts)
103
-
104
- if content.strip():
105
- # Generate embedding
106
- embedding = await rag_pipeline.get_embeddings([content])
107
-
108
- # Update document with embedding
109
- await db.collection.update_one(
110
- {"_id": doc["_id"]},
111
- {"$set": {"embedding": embedding[0]}}
112
- )
113
- updated_count += 1
114
-
115
- # Progress update every 50 documents
116
- if updated_count % 50 == 0:
117
- print(f"βœ… Processed {updated_count}/{len(documents_without_embeddings)} documents...")
118
-
119
- except Exception as e:
120
- errors += 1
121
- print(f"❌ Error processing document: {e}")
122
- continue
123
-
124
- # Small delay between batches
125
- await asyncio.sleep(1)
126
-
127
- # Final status
128
- final_with_embeddings = await db.collection.count_documents({"embedding": {"$exists": True}})
129
- print(f"πŸŽ‰ Embedding generation completed! {final_with_embeddings} documents now have embeddings")
130
-
131
- except Exception as e:
132
- print(f"❌ Embedding generation failed: {e}")
133
-
134
  @app.get("/")
135
  async def root():
136
  return {"message": "RAG Chatbot API is running!", "status": "healthy"}
@@ -171,66 +95,35 @@ async def chat_with_assistant(request: ChatRequest):
171
  except Exception as e:
172
  print(f"❌ Error in /chat endpoint: {traceback.format_exc()}")
173
  raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
174
-
175
- @app.get("/debug/vector-results")
176
- async def debug_vector_results(query: str = "tops"):
177
- """See exactly what vector search returns"""
178
- try:
179
- # Get embeddings for the query
180
- query_embedding = await rag_pipeline.get_embeddings([query])
181
- print(f"πŸ” Testing vector search for: '{query}'")
182
- print(f"πŸ“ Embedding dimensions: {len(query_embedding[0])}")
183
-
184
- # Perform vector search
185
- results = await db.similarity_search(query_embedding[0], limit=5)
186
-
187
- response_data = {
188
- "query": query,
189
- "embedding_dimensions": len(query_embedding[0]),
190
- "results_found": len(results),
191
- "raw_results": []
192
- }
193
-
194
- for i, doc in enumerate(results):
195
- response_data["raw_results"].append({
196
- "rank": i + 1,
197
- "id": doc["id"],
198
- "content": doc["content"],
199
- "source": doc.get("source", "unknown"),
200
- "metadata": doc.get("metadata", {})
201
- })
202
- print(f"πŸ“„ Result {i+1}: {doc['content'][:100]}...")
203
-
204
- return response_data
205
-
206
- except Exception as e:
207
- return {"error": str(e), "traceback": traceback.format_exc()}
208
 
209
- @app.get("/debug/sample-products")
210
- async def debug_sample_products(category: str = "tops", limit: int = 5):
211
- """Get sample products to see what content is available"""
212
  try:
213
- cursor = db.collection.find({"category": {"$regex": category, "$options": "i"}}).limit(limit)
214
- products = []
 
 
 
215
  async for doc in cursor:
216
- product_info = {
217
  "id": str(doc["_id"]),
218
- "name": doc.get("title", "N/A"),
219
  "category": doc.get("category", "N/A"),
220
- "description": doc.get("product_description", "N/A"),
221
- "price": doc.get("final_price", "N/A"),
222
  "has_embedding": "embedding" in doc,
223
- "content_used_for_embedding": f"{doc.get('title', '')} {doc.get('product_description', '')} {doc.get('category', '')}"
224
- }
225
- products.append(product_info)
226
 
227
  return {
228
- "category": category,
229
- "products_found": len(products),
230
- "products": products
 
231
  }
232
  except Exception as e:
233
  return {"error": str(e)}
 
234
 
235
  if __name__ == "__main__":
236
  uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)
 
32
 
33
  @app.on_event("startup")
34
  async def startup_event():
35
+ """Run on application startup - WITHOUT embedding generation"""
 
36
  try:
37
  print("πŸš€ Starting RAG Chatbot API...")
38
 
39
  # Initialize database connection
40
  await db.connect()
41
 
42
+ # Check database status (but don't generate embeddings)
43
+ stats = await db.get_collection_stats()
44
+ print(f"πŸ“Š Database status: {stats}")
45
 
46
+ if stats["documents_with_embeddings"] == 0:
47
+ print("⚠️ No embeddings found in database. Please pre-compute embeddings separately.")
48
+ print("πŸ’‘ Run the embedding generation script locally and upload to MongoDB Atlas.")
 
 
 
 
 
 
 
49
  else:
50
+ print(f"βœ… Ready! Using {stats['documents_with_embeddings']} documents with embeddings from MongoDB Atlas")
51
 
52
  print("βœ… RAG Chatbot API is ready!")
53
 
 
55
  print(f"❌ Startup error: {e}")
56
  raise
57
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
  @app.get("/")
59
  async def root():
60
  return {"message": "RAG Chatbot API is running!", "status": "healthy"}
 
95
  except Exception as e:
96
  print(f"❌ Error in /chat endpoint: {traceback.format_exc()}")
97
  raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
 
99
+ @app.get("/debug/database-stats")
100
+ async def debug_database_stats():
101
+ """Get detailed database statistics"""
102
  try:
103
+ stats = await db.get_collection_stats()
104
+
105
+ # Sample some documents to see their structure
106
+ sample_docs = []
107
+ cursor = db.collection.find({"embedding": {"$exists": True}}).limit(3)
108
  async for doc in cursor:
109
+ sample_docs.append({
110
  "id": str(doc["_id"]),
111
+ "title": doc.get("title", "N/A"),
112
  "category": doc.get("category", "N/A"),
 
 
113
  "has_embedding": "embedding" in doc,
114
+ "embedding_length": len(doc.get("embedding", [])),
115
+ "content_preview": f"{doc.get('title', '')} - {doc.get('product_description', '')[:50]}..."
116
+ })
117
 
118
  return {
119
+ "database": settings.DATABASE_NAME,
120
+ "collection": settings.COLLECTION_NAME,
121
+ "statistics": stats,
122
+ "sample_documents_with_embeddings": sample_docs
123
  }
124
  except Exception as e:
125
  return {"error": str(e)}
126
+
127
 
128
  if __name__ == "__main__":
129
  uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)