Spaces:

miyukicodes
/

d-commerce

Running

App Files Files Community

Crcs1225 committed on Oct 1, 2025

Commit

c135be2

1 Parent(s): 1333c38

hey

Browse files

Files changed (3) hide show

database.py +53 -10
generate_embeddings.py +49 -0
main.py +26 -133

database.py CHANGED Viewed

@@ -14,18 +14,18 @@ class Database:
         self.client = motor.motor_asyncio.AsyncIOMotorClient(settings.MONGODB_URI)
         self.db = self.client[settings.DATABASE_NAME]
         self.collection = self.db[settings.COLLECTION_NAME]
     async def similarity_search(self, query_embedding: List[float], limit: int = 3) -> List[Dict]:
-        """Search for similar products using vector similarity"""
         try:
-            # First try vector search if index exists
             pipeline = [
                 {
                     "$vectorSearch": {
-                        "index": "vector_index",  # Your vector index name
                         "path": "embedding",
                         "queryVector": query_embedding,
-                        "numCandidates": 100,
                         "limit": limit
                     }
                 },
@@ -46,7 +46,7 @@ class Database:
             async for doc in cursor:
                 results.append({
                     "id": str(doc["_id"]),
-                    "content": f"Product: {doc.get('title', 'N/A')}. Description: {doc.get('product_description', 'N/A')}. Category: {doc.get('category', 'N/A')}. Price: {doc.get('final_price', 'N/A')}.",
                     "source": doc.get('title', 'product_database'),
                     "metadata": {
                         "category": doc.get('category', 'N/A'),
@@ -56,12 +56,45 @@ class Database:
                 })
             return results
         except Exception as e:
-            print(f"Vector search failed, falling back to text search: {e}")
-            # Fallback to text search if vector search fails
-            return await self.search_by_category("tops", limit)
     async def search_by_category(self, category: str, limit: int = 5) -> List[Dict]:
-        """Search products by category (fallback if vector search fails)"""
         cursor = self.collection.find(
             {"category": {"$regex": category, "$options": "i"}}
         ).limit(limit)
@@ -70,7 +103,7 @@ class Database:
         async for doc in cursor:
             results.append({
                 "id": str(doc["_id"]),
-                "content": f"Product: {doc.get('title', 'N/A')}. Description: {doc.get('product_description', 'N/A')}. Category: {doc.get('category', 'N/A')}. Price: {doc.get('final_price', 'N/A')}.",
                 "source": doc.get('title', 'product_database'),
                 "metadata": {
                     "category": doc.get('category', 'N/A'),
@@ -83,6 +116,16 @@ class Database:
         """Insert documents into the collection"""
         result = await self.collection.insert_many(documents)
         return [str(id) for id in result.inserted_ids]
 # Global database instance
 db = Database()

         self.client = motor.motor_asyncio.AsyncIOMotorClient(settings.MONGODB_URI)
         self.db = self.client[settings.DATABASE_NAME]
         self.collection = self.db[settings.COLLECTION_NAME]
+        print(f"✅ Connected to MongoDB: {settings.DATABASE_NAME}.{settings.COLLECTION_NAME}")
     async def similarity_search(self, query_embedding: List[float], limit: int = 3) -> List[Dict]:
+        """Search for similar products using MongoDB Atlas Vector Search"""
         try:
             pipeline = [
                 {
                     "$vectorSearch": {
+                        "index": "vector_index",  # Make sure this matches your Atlas index name
                         "path": "embedding",
                         "queryVector": query_embedding,
+                        "numCandidates": 150,
                         "limit": limit
                     }
                 },
             async for doc in cursor:
                 results.append({
                     "id": str(doc["_id"]),
+                    "content": self._create_product_content(doc),
                     "source": doc.get('title', 'product_database'),
                     "metadata": {
                         "category": doc.get('category', 'N/A'),
                 })
             return results
         except Exception as e:
+            print(f"❌ Vector search error: {e}")
+            # Fallback to text search
+            return await self.search_by_text("tops", limit)
+    def _create_product_content(self, doc: Dict) -> str:
+        """Render a product document as a single text line for the LLM.
+        The title, description, category and price are each labelled and
+        joined with ". "; any missing field is shown as 'N/A'. The price is
+        prefixed with the rupee sign.
+        """
+        product = f"Product: {doc.get('title', 'N/A')}"
+        description = f"Description: {doc.get('product_description', 'N/A')}"
+        category = f"Category: {doc.get('category', 'N/A')}"
+        price = f"Price: ₹{doc.get('final_price', 'N/A')}"
+        return ". ".join((product, description, category, price))
+    async def search_by_text(self, query: str, limit: int = 5) -> List[Dict]:
+        """Fallback search used when vector search fails.
+        Matches ``query`` case-insensitively, as a MongoDB ``$regex`` pattern,
+        against the title, category and product_description fields, and
+        returns at most ``limit`` results as id/content/source/metadata dicts.
+        """
+        searchable_fields = ("title", "category", "product_description")
+        text_filter = {
+            "$or": [
+                {field: {"$regex": query, "$options": "i"}}
+                for field in searchable_fields
+            ]
+        }
+        matches = self.collection.find(text_filter).limit(limit)
+        return [
+            {
+                "id": str(doc["_id"]),
+                "content": self._create_product_content(doc),
+                "source": doc.get('title', 'product_database'),
+                "metadata": {
+                    "category": doc.get('category', 'N/A'),
+                    "price": doc.get('final_price', 'N/A'),
+                },
+            }
+            async for doc in matches
+        ]
     async def search_by_category(self, category: str, limit: int = 5) -> List[Dict]:
+        """Search products by category"""
         cursor = self.collection.find(
             {"category": {"$regex": category, "$options": "i"}}
         ).limit(limit)
         async for doc in cursor:
             results.append({
                 "id": str(doc["_id"]),
+                "content": self._create_product_content(doc),
                 "source": doc.get('title', 'product_database'),
                 "metadata": {
                     "category": doc.get('category', 'N/A'),
         """Insert documents into the collection"""
         result = await self.collection.insert_many(documents)
         return [str(id) for id in result.inserted_ids]
+    async def get_collection_stats(self):
+        """Count the documents in the collection and how many have an embedding.
+        Returns a dict with ``total_documents``, ``documents_with_embeddings``
+        and ``embedding_coverage`` (a percentage string with one decimal, or
+        "0%" when the collection is empty).
+        """
+        collection = self.collection
+        total = await collection.count_documents({})
+        embedded = await collection.count_documents({"embedding": {"$exists": True}})
+        # Guard the division: an empty collection reports a fixed "0%".
+        if total > 0:
+            coverage = f"{(embedded/total*100):.1f}%"
+        else:
+            coverage = "0%"
+        return {
+            "total_documents": total,
+            "documents_with_embeddings": embedded,
+            "embedding_coverage": coverage,
+        }
 # Global database instance
 db = Database()

generate_embeddings.py ADDED Viewed

	@@ -0,0 +1,49 @@

+import asyncio
+import time
+from database import db
+from rag_system import rag_pipeline
+def build_content_string(doc: dict) -> str:
+    """Build the text that is embedded for a product document.
+    Title, description and category come first (each only when truthy),
+    followed by every other str/int/float field as "key: value". The ``_id``
+    and ``embedding`` fields are never included. Parts are joined with ". ".
+    """
+    leading_fields = (
+        ("Title", "title"),
+        ("Description", "product_description"),
+        ("Category", "category"),
+    )
+    parts = [
+        f"{label}: {doc[field]}"
+        for label, field in leading_fields
+        if doc.get(field)
+    ]
+    # Fields already emitted above, plus those that must not appear in the text.
+    skipped = {"_id", "embedding", "title", "product_description", "category"}
+    parts.extend(
+        f"{key}: {value}"
+        for key, value in doc.items()
+        if key not in skipped and isinstance(value, (str, int, float))
+    )
+    return ". ".join(parts)
+async def generate_and_store_embeddings():
+    """Backfill the ``embedding`` field on documents that do not have one.
+    Connects to the database, iterates over every document without an
+    ``embedding`` field, builds its content string, requests an embedding
+    from ``rag_pipeline`` and stores the first returned vector on the
+    document. A failure on one document is printed and that document is
+    skipped; progress is printed every 10 updated documents.
+    """
+    await db.connect()
+    cursor = db.collection.find({"embedding": {"$exists": False}})
+    updated_count = 0
+    async for doc in cursor:
+        try:
+            content = build_content_string(doc)
+            if content.strip():
+                embedding = await rag_pipeline.get_embeddings([content])
+                await db.collection.update_one(
+                    {"_id": doc["_id"]},
+                    {"$set": {"embedding": embedding[0]}}
+                )
+                updated_count += 1
+                if updated_count % 10 == 0:
+                    print(f"✅ Processed {updated_count} documents...")
+        except Exception as e:
+            print(f"❌ Error processing {doc.get('_id')}: {e}")
+            continue
+        # Small delay to avoid overload. Use asyncio.sleep, not time.sleep:
+        # a blocking sleep inside a coroutine stalls the whole event loop.
+        await asyncio.sleep(0.2)
+    print(f"🎉 Embedding generation completed! {updated_count} documents updated.")
+# Script entry point: when executed directly, run the embedding backfill
+# coroutine to completion via asyncio.run.
+if __name__ == "__main__":
+    asyncio.run(generate_and_store_embeddings())

main.py CHANGED Viewed

@@ -32,30 +32,22 @@ embeddings_generated = False
 @app.on_event("startup")
 async def startup_event():
-    """Run on application startup"""
-    global embeddings_generated
     try:
         print("🚀 Starting RAG Chatbot API...")
         # Initialize database connection
         await db.connect()
-        # Check if we need to generate embeddings
-        total_docs = await db.collection.count_documents({})
-        docs_with_embeddings = await db.collection.count_documents({"embedding": {"$exists": True}})
-        print(f"📊 Database status: {total_docs} total documents, {docs_with_embeddings} with embeddings")
-        # If we have documents but no embeddings, generate them
-        if total_docs > 0 and docs_with_embeddings == 0:
-            print("🔄 No embeddings found. Starting automatic embedding generation...")
-            await generate_embeddings_on_startup()
-            embeddings_generated = True
-        elif docs_with_embeddings > 0:
-            print(f"✅ Embeddings already exist for {docs_with_embeddings} documents")
-            embeddings_generated = True
         else:
-            print("ℹ️  No documents found in database")
         print("✅ RAG Chatbot API is ready!")
@@ -63,74 +55,6 @@ async def startup_event():
         print(f"❌ Startup error: {e}")
         raise
-async def generate_embeddings_on_startup():
-    """Generate embeddings for all documents on startup"""
-    try:
-        # Find all documents without embeddings
-        cursor = db.collection.find({"embedding": {"$exists": False}})
-        documents_without_embeddings = []
-        async for doc in cursor:
-            documents_without_embeddings.append(doc)
-        if not documents_without_embeddings:
-            print("✅ All documents already have embeddings")
-            return
-        print(f"🔄 Generating embeddings for {len(documents_without_embeddings)} documents...")
-        updated_count = 0
-        errors = 0
-        # Process in smaller batches to avoid timeout
-        batch_size = 50
-        for i in range(0, len(documents_without_embeddings), batch_size):
-            batch = documents_without_embeddings[i:i + batch_size]
-            for doc in batch:
-                try:
-                    # Create meaningful content for embedding
-                    content_parts = []
-                    # Include all relevant text fields
-                    if doc.get('title'):
-                        content_parts.append(f"Product: {doc['title']}")
-                    if doc.get('product_description'):
-                        content_parts.append(f"Description: {doc['product_description']}")
-                    if doc.get('category'):
-                        content_parts.append(f"Category: {doc['category']}")
-                    content = ". ".join(content_parts)
-                    if content.strip():
-                        # Generate embedding
-                        embedding = await rag_pipeline.get_embeddings([content])
-                        # Update document with embedding
-                        await db.collection.update_one(
-                            {"_id": doc["_id"]},
-                            {"$set": {"embedding": embedding[0]}}
-                        )
-                        updated_count += 1
-                        # Progress update every 50 documents
-                        if updated_count % 50 == 0:
-                            print(f"✅ Processed {updated_count}/{len(documents_without_embeddings)} documents...")
-                except Exception as e:
-                    errors += 1
-                    print(f"❌ Error processing document: {e}")
-                    continue
-            # Small delay between batches
-            await asyncio.sleep(1)
-        # Final status
-        final_with_embeddings = await db.collection.count_documents({"embedding": {"$exists": True}})
-        print(f"🎉 Embedding generation completed! {final_with_embeddings} documents now have embeddings")
-    except Exception as e:
-        print(f"❌ Embedding generation failed: {e}")
 @app.get("/")
 async def root():
     return {"message": "RAG Chatbot API is running!", "status": "healthy"}
@@ -171,66 +95,35 @@ async def chat_with_assistant(request: ChatRequest):
     except Exception as e:
         print(f"❌ Error in /chat endpoint: {traceback.format_exc()}")
         raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
-@app.get("/debug/vector-results")
-async def debug_vector_results(query: str = "tops"):
-    """See exactly what vector search returns"""
-    try:
-        # Get embeddings for the query
-        query_embedding = await rag_pipeline.get_embeddings([query])
-        print(f"🔍 Testing vector search for: '{query}'")
-        print(f"📐 Embedding dimensions: {len(query_embedding[0])}")
-        # Perform vector search
-        results = await db.similarity_search(query_embedding[0], limit=5)
-        response_data = {
-            "query": query,
-            "embedding_dimensions": len(query_embedding[0]),
-            "results_found": len(results),
-            "raw_results": []
-        }
-        for i, doc in enumerate(results):
-            response_data["raw_results"].append({
-                "rank": i + 1,
-                "id": doc["id"],
-                "content": doc["content"],
-                "source": doc.get("source", "unknown"),
-                "metadata": doc.get("metadata", {})
-            })
-            print(f"📄 Result {i+1}: {doc['content'][:100]}...")
-        return response_data
-    except Exception as e:
-        return {"error": str(e), "traceback": traceback.format_exc()}
-@app.get("/debug/sample-products")
-async def debug_sample_products(category: str = "tops", limit: int = 5):
-    """Get sample products to see what content is available"""
     try:
-        cursor = db.collection.find({"category": {"$regex": category, "$options": "i"}}).limit(limit)
-        products = []
         async for doc in cursor:
-            product_info = {
                 "id": str(doc["_id"]),
-                "name": doc.get("title", "N/A"),
                 "category": doc.get("category", "N/A"),
-                "description": doc.get("product_description", "N/A"),
-                "price": doc.get("final_price", "N/A"),
                 "has_embedding": "embedding" in doc,
-                "content_used_for_embedding": f"{doc.get('title', '')} {doc.get('product_description', '')} {doc.get('category', '')}"
-            }
-            products.append(product_info)
         return {
-            "category": category,
-            "products_found": len(products),
-            "products": products
         }
     except Exception as e:
         return {"error": str(e)}
 if __name__ == "__main__":
     uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)

 @app.on_event("startup")
 async def startup_event():
+    """Run on application startup - WITHOUT embedding generation"""
     try:
         print("🚀 Starting RAG Chatbot API...")
         # Initialize database connection
         await db.connect()
+        # Check database status (but don't generate embeddings)
+        stats = await db.get_collection_stats()
+        print(f"📊 Database status: {stats}")
+        if stats["documents_with_embeddings"] == 0:
+            print("⚠️  No embeddings found in database. Please pre-compute embeddings separately.")
+            print("💡 Run the embedding generation script locally and upload to MongoDB Atlas.")
         else:
+            print(f"✅ Ready! Using {stats['documents_with_embeddings']} documents with embeddings from MongoDB Atlas")
         print("✅ RAG Chatbot API is ready!")
         print(f"❌ Startup error: {e}")
         raise
 @app.get("/")
 async def root():
     return {"message": "RAG Chatbot API is running!", "status": "healthy"}
     except Exception as e:
         print(f"❌ Error in /chat endpoint: {traceback.format_exc()}")
         raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
+@app.get("/debug/database-stats")
+async def debug_database_stats():
+    """Return collection statistics and a sample of embedded documents.
+    The response contains the database and collection names, the result of
+    ``db.get_collection_stats()`` and up to three documents that have an
+    ``embedding`` field (id, title, category, embedding length and a short
+    content preview). Any exception is returned as ``{"error": ...}``
+    instead of being raised.
+    """
     try:
+        stats = await db.get_collection_stats()
+        # Sample some documents to see their structure
+        sample_docs = []
+        cursor = db.collection.find({"embedding": {"$exists": True}}).limit(3)
         async for doc in cursor:
+            # A description stored as null would make the slice below raise
+            # and turn the whole response into an error; treat it as empty.
+            description = doc.get('product_description') or ''
+            sample_docs.append({
                 "id": str(doc["_id"]),
+                "title": doc.get("title", "N/A"),
                 "category": doc.get("category", "N/A"),
                 "has_embedding": "embedding" in doc,
+                "embedding_length": len(doc.get("embedding", [])),
+                "content_preview": f"{doc.get('title', '')} - {description[:50]}..."
+            })
         return {
+            "database": settings.DATABASE_NAME,
+            "collection": settings.COLLECTION_NAME,
+            "statistics": stats,
+            "sample_documents_with_embeddings": sample_docs
         }
     except Exception as e:
         return {"error": str(e)}
 if __name__ == "__main__":
     uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)