Spaces:

miyukicodes
/

d-commerce

Sleeping

App Files Files Community

Crcs1225 committed on Oct 1, 2025

Commit

1333c38

1 Parent(s): ccabb90

new rag llm

Browse files

Files changed (7) hide show

config.py +14 -9
database.py +80 -109
gemini_service.py +0 -106
main.py +193 -174
models.py +45 -81
rag_system.py +139 -262
run.py +2 -8

config.py CHANGED Viewed

@@ -2,15 +2,20 @@ from pydantic_settings import BaseSettings
 from typing import Optional
 class Settings(BaseSettings):
-    mongodb_atlas_uri: str
-    gemini_api_key: str
-    database_name: str = "product"
-    products_collection: str = "marketplace"
-    conversations_collection: str = "conversations"
-    embeddings_collection: str = "embeddings"
-    port: int = 7860
     class Config:
-        env_file = None
-settings = Settings()

 from typing import Optional
 class Settings(BaseSettings):
+    # MongoDB Atlas
+    MONGODB_URI: str
+    DATABASE_NAME: str = "marketplace"
+    COLLECTION_NAME: str = "product"
+    # Gemini API
+    GEMINI_API_KEY: str
+    # Server
+    HOST: str = "0.0.0.0"
+    PORT: int = 7860
     class Config:
+        env_file = ".env"
+settings = Settings()

database.py CHANGED Viewed

@@ -1,117 +1,88 @@
-from datetime import datetime
 import motor.motor_asyncio
 from bson import ObjectId
-from typing import List, Optional, Dict, Any
 from config import settings
-import json
-class MongoDB:
     def __init__(self):
-        self.client = motor.motor_asyncio.AsyncIOMotorClient(settings.mongodb_atlas_uri)
-        self.db = self.client[settings.database_name]
-        self.products = self.db[settings.products_collection]
-        self.conversations = self.db[settings.conversations_collection]
-        self.embeddings = self.db[settings.embeddings_collection]
-    async def get_product(self, product_id: str) -> Optional[Dict]:
-        return await self.products.find_one({"_id": ObjectId(product_id)})
-    async def get_products_by_category(self, category: str, limit: int = 10) -> List[Dict]:
-        # Your database might have different field names for category
-        cursor = self.products.find({"category": category}).limit(limit)
-        return await cursor.to_list(length=limit)
-    async def search_products(self, query: Dict, limit: int = 10) -> List[Dict]:
-        cursor = self.products.find(query).limit(limit)
-        return await cursor.to_list(length=limit)
-    async def get_all_products(self, limit: int = 50) -> List[Dict]:
-        cursor = self.products.find().limit(limit)
-        return await cursor.to_list(length=limit)
-    async def create_product(self, product_data: Dict) -> str:
-        result = await self.products.insert_one(product_data)
-        return str(result.inserted_id)
-    async def update_product(self, product_id: str, update_data: Dict) -> bool:
-        result = await self.products.update_one(
-            {"_id": ObjectId(product_id)},
-            {"$set": update_data}
-        )
-        return result.modified_count > 0
-    async def delete_product(self, product_id: str) -> bool:
-        result = await self.products.delete_one({"_id": ObjectId(product_id)})
-        return result.deleted_count > 0
-    async def get_conversation(self, conversation_id: str) -> Optional[Dict]:
-        return await self.conversations.find_one({"_id": ObjectId(conversation_id)})
-    async def create_conversation(self, user_id: str) -> str:
-        conversation_data = {
-            "user_id": user_id,
-            "messages": [],
-            "created_at": datetime.now(),
-            "updated_at": datetime.now()
-        }
-        result = await self.conversations.insert_one(conversation_data)
-        return str(result.inserted_id)
-    async def add_message_to_conversation(self, conversation_id: str, message: Dict) -> bool:
-        result = await self.conversations.update_one(
-            {"_id": ObjectId(conversation_id)},
-            {
-                "$push": {"messages": message},
-                "$set": {"updated_at": datetime.now()}
-            }
-        )
-        return result.modified_count > 0
-    async def get_user_conversations(self, user_id: str, limit: int = 10) -> List[Dict]:
-        cursor = self.conversations.find({"user_id": user_id}).sort("updated_at", -1).limit(limit)
-        return await cursor.to_list(length=limit)
-    async def store_embedding(self, text: str, embedding: List[float], metadata: Dict) -> str:
-        doc = {
-            "text": text,
-            "embedding": embedding,
-            "metadata": metadata,
-            "created_at": datetime.now()
-        }
-        result = await self.embeddings.insert_one(doc)
-        return str(result.inserted_id)
-    async def find_similar_embeddings(self, embedding: List[float], limit: int = 5) -> List[Dict]:
-        # This is a simplified version - in production, you'd use vector search
-        pipeline = [
-            {
-                "$addFields": {
-                    "similarity": {
-                        "$sqrt": {
-                            "$sum": {
-                                "$map": {
-                                    "input": {"$range": [0, {"$size": "$embedding"}]},
-                                    "as": "idx",
-                                    "in": {
-                                        "$pow": [
-                                            {"$subtract": [
-                                                {"$arrayElemAt": ["$embedding", "$$idx"]},
-                                                {"$arrayElemAt": [embedding, "$$idx"]}
-                                            ]},
-                                            2
-                                        ]
-                                    }
-                                }
-                            }
-                        }
                     }
                 }
-            },
-            {"$sort": {"similarity": 1}},
-            {"$limit": limit}
-        ]
-        cursor = self.embeddings.aggregate(pipeline)
-        return await cursor.to_list(length=limit)
-# Database instance
-db = MongoDB()

 import motor.motor_asyncio
 from bson import ObjectId
+from typing import List, Dict, Any
+import numpy as np
 from config import settings
+class Database:
     def __init__(self):
+        self.client = None
+        self.db = None
+        self.collection = None
+    async def connect(self):
+        self.client = motor.motor_asyncio.AsyncIOMotorClient(settings.MONGODB_URI)
+        self.db = self.client[settings.DATABASE_NAME]
+        self.collection = self.db[settings.COLLECTION_NAME]
+    async def similarity_search(self, query_embedding: List[float], limit: int = 3) -> List[Dict]:
+        """Search for similar products using vector similarity"""
+        try:
+            # First try vector search if index exists
+            pipeline = [
+                {
+                    "$vectorSearch": {
+                        "index": "vector_index",  # Your vector index name
+                        "path": "embedding",
+                        "queryVector": query_embedding,
+                        "numCandidates": 100,
+                        "limit": limit
                     }
+                },
+                {
+                    "$project": {
+                        "_id": 1,
+                        "title": 1,
+                        "category": 1,
+                        "product_description": 1,
+                        "final_price": 1,
+                        "score": {"$meta": "vectorSearchScore"}
+                    }
+                }
+            ]
+            cursor = self.collection.aggregate(pipeline)
+            results = []
+            async for doc in cursor:
+                results.append({
+                    "id": str(doc["_id"]),
+                    "content": f"Product: {doc.get('title', 'N/A')}. Description: {doc.get('product_description', 'N/A')}. Category: {doc.get('category', 'N/A')}. Price: {doc.get('final_price', 'N/A')}.",
+                    "source": doc.get('title', 'product_database'),
+                    "metadata": {
+                        "category": doc.get('category', 'N/A'),
+                        "price": doc.get('final_price', 'N/A'),
+                        "similarity_score": doc.get('score', 0)
+                    }
+                })
+            return results
+        except Exception as e:
+            print(f"Vector search failed, falling back to text search: {e}")
+            # Fallback to text search if vector search fails
+            return await self.search_by_category("tops", limit)
+    async def search_by_category(self, category: str, limit: int = 5) -> List[Dict]:
+        """Search products by category (fallback if vector search fails)"""
+        cursor = self.collection.find(
+            {"category": {"$regex": category, "$options": "i"}}
+        ).limit(limit)
+        results = []
+        async for doc in cursor:
+            results.append({
+                "id": str(doc["_id"]),
+                "content": f"Product: {doc.get('title', 'N/A')}. Description: {doc.get('product_description', 'N/A')}. Category: {doc.get('category', 'N/A')}. Price: {doc.get('final_price', 'N/A')}.",
+                "source": doc.get('title', 'product_database'),
+                "metadata": {
+                    "category": doc.get('category', 'N/A'),
+                    "price": doc.get('final_price', 'N/A')
                 }
+            })
+        return results
+    async def insert_documents(self, documents: List[Dict]) -> List[str]:
+        """Insert documents into the collection"""
+        result = await self.collection.insert_many(documents)
+        return [str(id) for id in result.inserted_ids]
+# Global database instance
+db = Database()

gemini_service.py DELETED Viewed

@@ -1,106 +0,0 @@
-import google.generativeai as genai
-from typing import List, Dict, Any, Optional
-import asyncio
-import aiohttp
-import json
-from config import settings
-class GeminiService:
-    def __init__(self):
-        genai.configure(api_key=settings.gemini_api_key)
-        self.model = genai.GenerativeModel('gemini-2.5-flash')
-    async def generate_response(self, prompt: str, context: str = "") -> str:
-        """Generate response using Gemini API with context"""
-        try:
-            full_prompt = f"""
-            Context Information:
-            {context}
-            User Question: {prompt}
-            You are a helpful shopping assistant for Daddy's Shop. Use the context information above to answer the user's question accurately and helpfully. If the context doesn't contain relevant information, use your general knowledge but be honest about limitations.
-            Provide a friendly, professional response focused on helping with shopping needs.
-            """
-            # Run in thread pool since Gemini doesn't have native async support
-            loop = asyncio.get_event_loop()
-            response = await loop.run_in_executor(
-                None,
-                lambda: self.model.generate_content(full_prompt)
-            )
-            return response.text
-        except Exception as e:
-            print(f"Gemini API error: {e}")
-            return "I apologize, but I'm having trouble processing your request right now. Please try again later."
-    async def generate_embedding(self, text: str) -> List[float]:
-        """Generate embeddings for text using Gemini"""
-        try:
-            # Note: Gemini doesn't have direct embedding API, so we'll use a workaround
-            # For production, consider using SentenceTransformers or another embedding service
-            embedding_model = genai.GenerativeModel('embedding-001')
-            loop = asyncio.get_event_loop()
-            result = await loop.run_in_executor(
-                None,
-                lambda: genai.embed_content(
-                    model=embedding_model,
-                    content=text,
-                    task_type="retrieval_document"
-                )
-            )
-            return result['embedding']
-        except Exception as e:
-            print(f"Embedding generation error: {e}")
-            # Fallback to simple embedding (in production, use proper embedding model)
-            return [0.0] * 384  # Default dimension
-    async def classify_intent(self, message: str) -> Dict[str, Any]:
-        """Classify user intent using Gemini"""
-        prompt = f"""
-        Classify the following user message into one of these intents:
-        - product_inquiry: Questions about products, features, availability
-        - pricing: Questions about costs, discounts, prices
-        - shipping: Questions about delivery, shipping costs, timelines
-        - returns: Questions about returns, refunds, exchanges
-        - support: General customer support, contact information
-        - greeting: Hello, hi, greetings
-        - unknown: Cannot classify
-        Message: "{message}"
-        Return ONLY a JSON object with:
-        {{
-            "intent": "classified_intent",
-            "confidence": 0.95,
-            "entities": ["extracted_entities", "if_any"]
-        }}
-        """
-        try:
-            loop = asyncio.get_event_loop()
-            response = await loop.run_in_executor(
-                None,
-                lambda: self.model.generate_content(prompt)
-            )
-            # Parse JSON response
-            import re
-            json_match = re.search(r'\{.*\}', response.text, re.DOTALL)
-            if json_match:
-                return json.loads(json_match.group())
-            else:
-                return {"intent": "unknown", "confidence": 0.0, "entities": []}
-        except Exception as e:
-            print(f"Intent classification error: {e}")
-            return {"intent": "unknown", "confidence": 0.0, "entities": []}
-# Gemini service instance
-gemini_service = GeminiService()

main.py CHANGED Viewed

@@ -1,217 +1,236 @@
-from fastapi import FastAPI, HTTPException, Depends
 from fastapi.middleware.cors import CORSMiddleware
-from typing import List, Optional
 import uuid
-from datetime import datetime
-from config import settings
 from database import db
-from rag_system import rag_system
-from models import (
-    Product, ProductCreate, ChatRequest, ChatResponse,
-    SearchRequest, Conversation, ChatMessage
-)
 app = FastAPI(
-    title="Daddy's Shop RAG Chatbot API",
-    description="AI-powered shopping assistant with RAG using MongoDB and Gemini",
     version="1.0.0"
 )
 # CORS middleware
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["http://localhost:3000", "http://127.0.0.1:3000", "https://yourdomain.com"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
-@app.get("/")
-async def root():
-    """Health check endpoint"""
-    return {
-        "message": "Daddy's Shop RAG Chatbot API is running!",
-        "version": "1.0.0",
-        "status": "healthy"
-    }
-@app.post("/chat", response_model=ChatResponse)
-async def chat_endpoint(request: ChatRequest):
-    """
-    Main chatbot endpoint with RAG capabilities
-    """
     try:
-        # Create new conversation if no ID provided
-        if not request.conversation_id:
-            conversation_id = await db.create_conversation(request.user_id)
-        else:
-            conversation_id = request.conversation_id
-            # Verify conversation exists
-            conversation = await db.get_conversation(conversation_id)
-            if not conversation:
-                conversation_id = await db.create_conversation(request.user_id)
-        # Add user message to conversation
-        user_message = ChatMessage(
-            sender="user",
-            text=request.message
-        )
-        await db.add_message_to_conversation(conversation_id, user_message.dict())
-        # Get conversation history for context
-        conversation = await db.get_conversation(conversation_id)
-        history = conversation.get('messages', []) if conversation else []
-        # Generate response using RAG system
-        rag_result = await rag_system.generate_chat_response(request.message, history)
-        # Add bot response to conversation
-        bot_message = ChatMessage(
-            sender="bot",
-            text=rag_result["response"]
-        )
-        await db.add_message_to_conversation(conversation_id, bot_message.dict())
-        return ChatResponse(
-            response=rag_result["response"],
-            conversation_id=conversation_id,
-            suggested_questions=rag_result["suggested_questions"],
-            relevant_products=rag_result["relevant_products"],
-            intent=rag_result["intent"],
-            confidence=rag_result["confidence"]
-        )
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Chat processing error: {str(e)}")
-@app.get("/products", response_model=List[Product])
-async def get_products(category: Optional[str] = None, limit: int = 20):
-    """Get products with optional category filter"""
     try:
-        if category:
-            products = await db.get_products_by_category(category, limit)
-        else:
-            products = await db.get_all_products(limit)
-        return products
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error fetching products: {str(e)}")
-@app.get("/products/{product_id}", response_model=Product)
-async def get_product(product_id: str):
-    """Get specific product by ID"""
-    product = await db.get_product(product_id)
-    if not product:
-        raise HTTPException(status_code=404, detail="Product not found")
-    return product
-@app.post("/products", response_model=dict)
-async def create_product(product: ProductCreate):
-    """Create a new product"""
-    try:
-        product_data = product.dict()
-        product_data["_id"] = str(uuid.uuid4())[:8]  # Simple ID generation
-        product_id = await db.create_product(product_data)
-        return {"message": "Product created successfully", "product_id": product_id}
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error creating product: {str(e)}")
-@app.post("/search", response_model=List[Product])
-async def search_products(request: SearchRequest):
-    """Semantic search for products"""
     try:
-        products = await rag_system.retrieve_relevant_products(
-            request.query,
-            request.category,
-            request.max_results
         )
-        return products
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Search error: {str(e)}")
-@app.get("/conversations/{user_id}", response_model=List[Conversation])
-async def get_user_conversations(user_id: str, limit: int = 10):
-    """Get user's conversation history"""
     try:
-        conversations = await db.get_user_conversations(user_id, limit)
-        return conversations
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Error fetching conversations: {str(e)}")
-@app.get("/intents")
-async def get_available_intents():
-    """Get information about available intents"""
-    return {
-        "intents": [
-            "product_inquiry", "pricing", "shipping",
-            "returns", "support", "greeting", "unknown"
-        ],
-        "description": "Intent classification for user messages"
-    }
-@app.get("/test-products")
-async def test_products(limit: int = 3):
-    """Test endpoint to see transformed products"""
     try:
-        products = await db.get_all_products(limit)
-        transformed_products = [rag_system._transform_product_doc(product) for product in products]
         return {
-            "original_count": len(products),
-            "transformed_count": len(transformed_products),
-            "original_sample": products[0] if products else {},
-            "transformed_sample": transformed_products[0] if transformed_products else {},
-            "all_transformed": transformed_products
         }
     except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Test error: {str(e)}")
-# Sample data initialization
-@app.on_event("startup")
-async def startup_event():
-    """Initialize sample data if needed"""
-    try:
-        # Check if we have any products
-        products = await db.get_all_products(1)
-        if not products:
-            await _initialize_sample_data()
-    except Exception as e:
-        print(f"Startup initialization error: {e}")
-async def _initialize_sample_data():
-    """Initialize sample product data"""
-    sample_products = [
-        {
-            "name": "Wireless Bluetooth Earbuds",
-            "description": "High-quality wireless earbuds with noise cancellation and 24-hour battery life.",
-            "price": 79.99,
-            "category": "electronics",
-            "in_stock": True,
-            "tags": ["wireless", "bluetooth", "audio", "noise-cancellation"],
-            "features": ["Noise Cancellation", "24h Battery", "Water Resistant", "Touch Controls"]
-        },
-        {
-            "name": "Smart Fitness Watch",
-            "description": "Advanced fitness tracker with heart rate monitoring, GPS, and smartphone connectivity.",
-            "price": 199.99,
-            "category": "electronics",
-            "in_stock": True,
-            "tags": ["fitness", "smartwatch", "health", "tracking"],
-            "features": ["Heart Rate Monitor", "GPS", "Sleep Tracking", "Waterproof"]
-        },
-        {
-            "name": "Organic Cotton T-Shirt",
-            "description": "Comfortable and sustainable organic cotton t-shirt available in multiple colors.",
-            "price": 24.99,
-            "category": "clothing",
-            "in_stock": True,
-            "tags": ["cotton", "organic", "sustainable", "casual"],
-            "features": ["Organic Cotton", "Machine Washable", "Multiple Colors"]
-        }
-    ]
-    for product in sample_products:
-        await db.create_product(product)
-    print("Sample data initialized successfully")
 if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=settings.port)

+import asyncio
+from fastapi import FastAPI, HTTPException, Query
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
 import uuid
+import uvicorn
+from typing import List, Optional
+import traceback
+# Import your existing modules
 from database import db
+from models import ChatMessage, ChatRequest, ChatResponse, Product, SearchRequest, Conversation, KnowledgeDocument, Document, SourceInfo
+from config import settings
+from rag_system import rag_pipeline
 app = FastAPI(
+    title="RAG Chatbot API",
+    description="Lightweight RAG Chatbot using MongoDB Atlas and Gemini",
     version="1.0.0"
 )
 # CORS middleware
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
+embeddings_generated = False
+@app.on_event("startup")
+async def startup_event():
+    """Run on application startup"""
+    global embeddings_generated
     try:
+        print("🚀 Starting RAG Chatbot API...")
+        # Initialize database connection
+        await db.connect()
+        # Check if we need to generate embeddings
+        total_docs = await db.collection.count_documents({})
+        docs_with_embeddings = await db.collection.count_documents({"embedding": {"$exists": True}})
+        print(f"📊 Database status: {total_docs} total documents, {docs_with_embeddings} with embeddings")
+        # If we have documents but no embeddings, generate them
+        if total_docs > 0 and docs_with_embeddings == 0:
+            print("🔄 No embeddings found. Starting automatic embedding generation...")
+            await generate_embeddings_on_startup()
+            embeddings_generated = True
+        elif docs_with_embeddings > 0:
+            print(f"✅ Embeddings already exist for {docs_with_embeddings} documents")
+            embeddings_generated = True
+        else:
+            print("ℹ️  No documents found in database")
+        print("✅ RAG Chatbot API is ready!")
     except Exception as e:
+        print(f"❌ Startup error: {e}")
+        raise
+async def generate_embeddings_on_startup():
+    """Generate embeddings for all documents on startup"""
     try:
+        # Find all documents without embeddings
+        cursor = db.collection.find({"embedding": {"$exists": False}})
+        documents_without_embeddings = []
+        async for doc in cursor:
+            documents_without_embeddings.append(doc)
+        if not documents_without_embeddings:
+            print("✅ All documents already have embeddings")
+            return
+        print(f"🔄 Generating embeddings for {len(documents_without_embeddings)} documents...")
+        updated_count = 0
+        errors = 0
+        # Process in smaller batches to avoid timeout
+        batch_size = 50
+        for i in range(0, len(documents_without_embeddings), batch_size):
+            batch = documents_without_embeddings[i:i + batch_size]
+            for doc in batch:
+                try:
+                    # Create meaningful content for embedding
+                    content_parts = []
+                    # Include all relevant text fields
+                    if doc.get('title'):
+                        content_parts.append(f"Product: {doc['title']}")
+                    if doc.get('product_description'):
+                        content_parts.append(f"Description: {doc['product_description']}")
+                    if doc.get('category'):
+                        content_parts.append(f"Category: {doc['category']}")
+                    content = ". ".join(content_parts)
+                    if content.strip():
+                        # Generate embedding
+                        embedding = await rag_pipeline.get_embeddings([content])
+                        # Update document with embedding
+                        await db.collection.update_one(
+                            {"_id": doc["_id"]},
+                            {"$set": {"embedding": embedding[0]}}
+                        )
+                        updated_count += 1
+                        # Progress update every 50 documents
+                        if updated_count % 50 == 0:
+                            print(f"✅ Processed {updated_count}/{len(documents_without_embeddings)} documents...")
+                except Exception as e:
+                    errors += 1
+                    print(f"❌ Error processing document: {e}")
+                    continue
+            # Small delay between batches
+            await asyncio.sleep(1)
+        # Final status
+        final_with_embeddings = await db.collection.count_documents({"embedding": {"$exists": True}})
+        print(f"🎉 Embedding generation completed! {final_with_embeddings} documents now have embeddings")
     except Exception as e:
+        print(f"❌ Embedding generation failed: {e}")
+@app.get("/")
+async def root():
+    return {"message": "RAG Chatbot API is running!", "status": "healthy"}
+@app.get("/health")
+async def health_check():
+    return {"status": "healthy", "service": "rag-chatbot"}
+@app.post("/chat")
+async def chat_with_assistant(request: ChatRequest):
+    """Main chat endpoint for product queries"""
     try:
+        print(f"💬 Received chat request: {request.message}")
+        response, sources = await rag_pipeline.generate_response(request.message)
+        suggested_questions = rag_pipeline.generate_followup_questions(
+            request.message,
+            sources
+        )
+        # Convert to SourceInfo objects
+        source_objects = []
+        for product in sources:
+            source_objects.append(SourceInfo(
+                id=product.get("id", ""),
+                name=product.get("source", "Product"),
+                category=product.get("metadata", {}).get("category", "N/A"),
+                price=str(product.get("metadata", {}).get("price", "N/A")),
+                similarity_score=product.get("metadata", {}).get("similarity_score", 0)
+            ))
+        return ChatResponse(
+            response=response,
+            sources=source_objects,
+            suggested_questions=suggested_questions,
+            conversation_id=request.conversation_id
         )
     except Exception as e:
+        print(f"❌ Error in /chat endpoint: {traceback.format_exc()}")
+        raise HTTPException(status_code=500, detail=f"Error processing request: {str(e)}")
+@app.get("/debug/vector-results")
+async def debug_vector_results(query: str = "tops"):
+    """See exactly what vector search returns"""
     try:
+        # Get embeddings for the query
+        query_embedding = await rag_pipeline.get_embeddings([query])
+        print(f"🔍 Testing vector search for: '{query}'")
+        print(f"📐 Embedding dimensions: {len(query_embedding[0])}")
+        # Perform vector search
+        results = await db.similarity_search(query_embedding[0], limit=5)
+        response_data = {
+            "query": query,
+            "embedding_dimensions": len(query_embedding[0]),
+            "results_found": len(results),
+            "raw_results": []
+        }
+        for i, doc in enumerate(results):
+            response_data["raw_results"].append({
+                "rank": i + 1,
+                "id": doc["id"],
+                "content": doc["content"],
+                "source": doc.get("source", "unknown"),
+                "metadata": doc.get("metadata", {})
+            })
+            print(f"📄 Result {i+1}: {doc['content'][:100]}...")
+        return response_data
     except Exception as e:
+        return {"error": str(e), "traceback": traceback.format_exc()}
+@app.get("/debug/sample-products")
+async def debug_sample_products(category: str = "tops", limit: int = 5):
+    """Get sample products to see what content is available"""
     try:
+        cursor = db.collection.find({"category": {"$regex": category, "$options": "i"}}).limit(limit)
+        products = []
+        async for doc in cursor:
+            product_info = {
+                "id": str(doc["_id"]),
+                "name": doc.get("title", "N/A"),
+                "category": doc.get("category", "N/A"),
+                "description": doc.get("product_description", "N/A"),
+                "price": doc.get("final_price", "N/A"),
+                "has_embedding": "embedding" in doc,
+                "content_used_for_embedding": f"{doc.get('title', '')} {doc.get('product_description', '')} {doc.get('category', '')}"
+            }
+            products.append(product_info)
         return {
+            "category": category,
+            "products_found": len(products),
+            "products": products
         }
     except Exception as e:
+        return {"error": str(e)}
 if __name__ == "__main__":
+    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)

models.py CHANGED Viewed

@@ -1,100 +1,64 @@
-from pydantic import BaseModel, Field, validator
 from typing import List, Optional, Dict, Any
 from datetime import datetime
-from enum import Enum
-class ProductCategory(str, Enum):
-    ELECTRONICS = "electronics"
-    CLOTHING = "clothing"
-    HOME = "home"
-    BEAUTY = "beauty"
-    SPORTS = "sports"
-    BOOKS = "books"
-    OTHER = "other"
-class ProductCreate(BaseModel):
-    name: str
-    description: str = ""
-    price: float = 0.0
-    category: ProductCategory = ProductCategory.OTHER
-    in_stock: bool = True
-    tags: List[str] = []
-    features: List[str] = []
-    image_url: Optional[str] = None
-    @validator('price', pre=True)
-    def validate_price(cls, v):
-        if v is None:
-            return 0.0
-        try:
-            return float(v)
-        except (TypeError, ValueError):
-            return 0.0
-    @validator('category', pre=True)
-    def validate_category(cls, v):
-        if isinstance(v, ProductCategory):
-            return v
-        try:
-            return ProductCategory(v)
-        except ValueError:
-            return ProductCategory.OTHER
-class Product(BaseModel):
     id: str
     name: str
-    description: str = ""
-    price: float = 0.0
-    category: ProductCategory = ProductCategory.OTHER
-    in_stock: bool = True
-    tags: List[str] = []
-    features: List[str] = []
-    image_url: Optional[str] = None
-    @validator('price', pre=True)
-    def validate_price(cls, v):
-        if v is None:
-            return 0.0
-        try:
-            return float(v)
-        except (TypeError, ValueError):
-            return 0.0
-    @validator('category', pre=True)
-    def validate_category(cls, v):
-        if isinstance(v, ProductCategory):
-            return v
-        try:
-            return ProductCategory(v)
-        except ValueError:
-            return ProductCategory.OTHER
-class ChatMessage(BaseModel):
-    sender: str
-    text: str
-    timestamp: datetime = Field(default_factory=datetime.now)
 class ChatRequest(BaseModel):
     message: str
     conversation_id: Optional[str] = None
-    user_id: Optional[str] = "anonymous"
 class ChatResponse(BaseModel):
     response: str
-    conversation_id: str
-    suggested_questions: List[str] = []
-    relevant_products: List[Product] = []
-    intent: str
-    confidence: float
 class SearchRequest(BaseModel):
     query: str
-    category: Optional[ProductCategory] = None
-    max_results: int = 5
 class Conversation(BaseModel):
     id: str
     user_id: str
-    messages: List[ChatMessage] = []
-    created_at: datetime = Field(default_factory=datetime.now)
-    updated_at: datetime = Field(default_factory=datetime.now)

+from pydantic import BaseModel, Field
 from typing import List, Optional, Dict, Any
 from datetime import datetime
+# Source information model
+class SourceInfo(BaseModel):
+    """A single entry of ``ChatResponse.sources``: identifying fields plus a similarity score."""
     id: str
     name: str
+    category: str
+    # Price is carried as a string here, whereas Product.price is a float.
+    price: str
+    similarity_score: float
+# Chat request and response models
 class ChatRequest(BaseModel):
+    """Chat request payload: a message and an optional conversation id (defaults to None)."""
     message: str
     conversation_id: Optional[str] = None
 class ChatResponse(BaseModel):
+    """Chat reply payload: response text, its sources, and suggested follow-up questions."""
     response: str
+    sources: List[SourceInfo]  # structured SourceInfo entries (previously plain strings)
+    suggested_questions: List[str]
+    conversation_id: Optional[str] = None  # optional; defaults to None when not supplied
+# Product models
+class Product(BaseModel):
+    """Product record: id, name, category, description and numeric price are required;
+    image_url and tags are optional."""
+    id: str
+    name: str
+    category: str
+    description: str
+    price: float
+    image_url: Optional[str] = None
+    tags: List[str] = []
 class SearchRequest(BaseModel):
+    """Search request payload: query text, an optional category, and a limit defaulting to 20."""
     query: str
+    category: Optional[str] = None
+    limit: int = 20
+# Conversation models
+class ChatMessage(BaseModel):
+    """One message of a Conversation: who sent it, its text, and when."""
+    # Typed as a plain str, so the two values below are a convention, not validated.
+    role: str  # "user" or "assistant"
+    content: str
+    timestamp: datetime
 class Conversation(BaseModel):
+    """A conversation: its id, the owning user id, the list of ChatMessage entries,
+    and creation/update timestamps (all fields required)."""
     id: str
     user_id: str
+    messages: List[ChatMessage]
+    created_at: datetime
+    updated_at: datetime
+# Knowledge base models
+class Document(BaseModel):
+    """Document content with optional metadata and a source label that defaults to "upload"."""
+    content: str
+    metadata: Dict[str, Any] = {}
+    source: str = "upload"
+class KnowledgeDocument(BaseModel):
+    """Document content together with its id, embedding vector, metadata, source label
+    and creation time (all fields required)."""
+    id: str
+    content: str
+    embedding: List[float]
+    metadata: Dict[str, Any]
+    source: str
+    created_at: datetime

rag_system.py CHANGED Viewed

@@ -1,285 +1,162 @@
-from typing import List, Dict, Any, Optional
-from database import db
-from gemini_service import gemini_service
-from models import Product, ProductCategory
-import numpy as np
 from sentence_transformers import SentenceTransformer
 import asyncio
-class RAGSystem:
     def __init__(self):
         self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
-    def _transform_product_doc(self, product_doc: Dict) -> Dict:
-        """Transform MongoDB document to match Product model using actual field names"""
         try:
-            # Extract fields from your actual database schema
-            product_id = str(product_doc.get('_id', '')) or str(product_doc.get('product_id', ''))
-            # Map to our expected fields
-            transformed = {
-                "id": product_id,
-                "name": product_doc.get('title', 'Unnamed Product'),
-                "description": product_doc.get('product_description', 'No description available'),
-                "price": float(product_doc.get('final_price', 0.0)),
-                "category": self._map_category(product_doc.get('category', 'other')),
-                "in_stock": True,  # Default to True since we don't have this field
-                "tags": self._extract_tags(product_doc),
-                "features": self._extract_features(product_doc),
-                "image_url": self._get_first_image(product_doc.get('images', '')),
-            }
-            return transformed
         except Exception as e:
-            print(f"Error transforming product {product_doc.get('_id')}: {e}")
-            # Return a safe default product
-            return {
-                "id": str(product_doc.get('_id', 'unknown')),
-                "name": product_doc.get('title', 'Unnamed Product'),
-                "description": "Product information unavailable",
-                "price": 0.0,
-                "category": ProductCategory.OTHER,
-                "in_stock": True,
-                "tags": [],
-                "features": [],
-            }
-    def _map_category(self, raw_category: str) -> ProductCategory:
-        """Map raw category string to ProductCategory enum"""
-        if not raw_category:
-            return ProductCategory.OTHER
-        category_lower = raw_category.lower()
-        # Map based on your actual category values
-        if any(keyword in category_lower for keyword in ['electronic', 'tech', 'computer', 'phone']):
-            return ProductCategory.ELECTRONICS
-        elif any(keyword in category_lower for keyword in ['cloth', 'fashion', 'wear', 'top', 'dress', 'shirt', 'jeans']):
-            return ProductCategory.CLOTHING
-        elif any(keyword in category_lower for keyword in ['home', 'garden', 'furniture', 'decor']):
-            return ProductCategory.HOME
-        elif any(keyword in category_lower for keyword in ['beauty', 'cosmetic', 'skin', 'hair', 'cream', 'mask', 'makeup']):
-            return ProductCategory.BEAUTY
-        elif any(keyword in category_lower for keyword in ['sport', 'fitness', 'exercise', 'gym']):
-            return ProductCategory.SPORTS
-        elif any(keyword in category_lower for keyword in ['book', 'literature']):
-            return ProductCategory.BOOKS
-        else:
-            return ProductCategory.OTHER
-    def _extract_tags(self, product_doc: Dict) -> List[str]:
-        """Extract tags from product details"""
-        tags = []
-        # Add category as a tag
-        category = product_doc.get('category')
-        if category:
-            tags.append(category)
-        # Extract from product details if available
-        details = product_doc.get('product_details', '')
-        if 'Cotton' in details:
-            tags.append('Cotton')
-        if 'Polyester' in details:
-            tags.append('Polyester')
-        return tags
-    def _extract_features(self, product_doc: Dict) -> List[str]:
-        """Extract features from product details"""
-        features = []
-        # Extract key features from product details
-        details = product_doc.get('product_details', '')
-        # Look for common features
-        feature_keywords = [
-            'Machine wash', 'Hand wash', 'Dry clean', 'Cotton',
-            'Polyester', 'Elastane', 'Spaghetti', 'Sleeveless',
-            'Solid pattern', 'Sweetheart neck'
-        ]
-        for keyword in feature_keywords:
-            if keyword in details:
-                features.append(keyword)
-        # Add rating as a feature
-        rating = product_doc.get('rating')
-        if rating:
-            features.append(f'{rating}★ Rating')
-        # Add delivery options
-        delivery = product_doc.get('delivery_options', '')
-        if 'Pay on delivery' in delivery:
-            features.append('Pay on Delivery')
-        if 'Easy returns' in delivery:
-            features.append('Easy Returns')
-        return features
-    def _get_first_image(self, images_str: str) -> Optional[str]:
-        """Extract first image URL from comma-separated string"""
-        if not images_str:
-            return None
-        images = images_str.split(',')
-        return images[0].strip() if images else None
-    # ... rest of your existing methods remain the same
-    async def retrieve_relevant_products(self, query: str, category: Optional[str] = None, limit: int = 5) -> List[Dict]:
-        """Retrieve and transform relevant products"""
         try:
-            # Generate query embedding
-            query_embedding = await self._get_embedding(query)
-            # Get all products or filtered by category
-            if category:
-                products = await db.get_products_by_category(category, limit=50)
-            else:
-                products = await db.get_all_products(limit=100)
-            # Transform products first
-            transformed_products = [self._transform_product_doc(product) for product in products]
-            # Calculate similarity scores on transformed products
-            scored_products = []
-            for product in transformed_products:
-                product_text = f"{product.get('name', '')} {product.get('description', '')} {' '.join(product.get('tags', []))}"
-                product_embedding = await self._get_embedding(product_text)
-                similarity = self._cosine_similarity(query_embedding, product_embedding)
-                scored_products.append((product, similarity))
-            # Sort by similarity and return top results
-            scored_products.sort(key=lambda x: x[1], reverse=True)
-            return [product for product, score in scored_products[:limit]]
         except Exception as e:
-            print(f"Product retrieval error: {e}")
-            # Fallback to simple keyword search with transformation
-            return await self._keyword_search_products(query, category, limit)
-    async def _keyword_search_products(self, query: str, category: Optional[str], limit: int) -> List[Dict]:
-        """Fallback keyword-based product search with transformation"""
-        search_terms = query.lower().split()
-        if category:
-            products = await db.get_products_by_category(category, limit=50)
-        else:
-            products = await db.get_all_products(limit=100)
-        # Transform all products first
-        transformed_products = [self._transform_product_doc(product) for product in products]
-        scored_products = []
-        for product in transformed_products:
-            score = 0
-            product_text = f"{product.get('name', '')} {product.get('description', '')} {' '.join(product.get('tags', []))}".lower()
-            for term in search_terms:
-                if term in product_text:
-                    score += 1
-                if term in product.get('name', '').lower():
-                    score += 2  # Higher weight for name matches
-            if score > 0:
-                scored_products.append((product, score))
-        scored_products.sort(key=lambda x: x[1], reverse=True)
-        return [product for product, score in scored_products[:limit]]
-    async def _get_embedding(self, text: str) -> List[float]:
-        """Get embedding for text using available methods"""
-        try:
-            embedding = await gemini_service.generate_embedding(text)
-            if embedding and len(embedding) > 10:
-                return embedding
-        except:
-            pass
-        embedding = self.embedding_model.encode(text)
-        return embedding.tolist()
-    def _cosine_similarity(self, vec1: List[float], vec2: List[float]) -> float:
-        """Calculate cosine similarity between two vectors"""
-        vec1 = np.array(vec1)
-        vec2 = np.array(vec2)
-        dot_product = np.dot(vec1, vec2)
-        norm1 = np.linalg.norm(vec1)
-        norm2 = np.linalg.norm(vec2)
-        if norm1 == 0 or norm2 == 0:
-            return 0.0
-        return dot_product / (norm1 * norm2)
-    async def build_context(self, query: str, relevant_products: List[Dict]) -> str:
-        """Build context string from relevant products for the LLM"""
-        if not relevant_products:
-            return "No specific product information available. Use general knowledge about e-commerce and shopping."
-        context_parts = ["Relevant Products Information:"]
-        for i, product in enumerate(relevant_products, 1):
-            context_parts.append(f"""
-            Product {i}:
-            - Name: {product.get('name', 'N/A')}
-            - Description: {product.get('description', 'N/A')}
-            - Price: ${product.get('price', 'N/A')}
-            - Category: {product.get('category', 'N/A')}
-            - In Stock: {'Yes' if product.get('in_stock') else 'No'}
-            - Features: {', '.join(product.get('features', []))}
-            - Tags: {', '.join(product.get('tags', []))}
-            """)
-        return "\n".join(context_parts)
-    async def generate_chat_response(self, user_message: str, conversation_history: List[Dict] = None) -> Dict[str, Any]:
-        """Generate response using RAG pipeline"""
-        # Classify intent
-        intent_result = await gemini_service.classify_intent(user_message)
-        # Retrieve relevant products (already transformed)
-        relevant_products = await self.retrieve_relevant_products(user_message, limit=3)
-        # Build context
-        context = await self.build_context(user_message, relevant_products)
-        # Generate response using Gemini with context
-        response = await gemini_service.generate_response(user_message, context)
-        # Generate suggested questions based on intent
-        suggested_questions = await self._generate_suggested_questions(intent_result['intent'], relevant_products)
-        return {
-            "response": response,
-            "relevant_products": relevant_products,
-            "suggested_questions": suggested_questions,
-            "intent": intent_result['intent'],
-            "confidence": intent_result['confidence']
-        }
-    async def _generate_suggested_questions(self, intent: str, products: List[Dict]) -> List[str]:
-        """Generate context-aware suggested questions"""
-        base_questions = {
-            "product_inquiry": ["Show me more products", "What are your best sellers?", "Any current deals?"],
-            "pricing": ["Do you offer discounts?", "What's the return policy?", "Any bundle deals?"],
-            "shipping": ["How long does delivery take?", "Do you ship internationally?", "What are shipping costs?"],
-            "returns": ["How do I return an item?", "What's your warranty policy?", "Do you offer exchanges?"],
-            "support": ["Contact customer service", "Store locations", "Business hours"],
-            "default": ["Best sellers", "Current deals", "Shipping info", "Return policy"]
-        }
-        questions = base_questions.get(intent, base_questions["default"])
-        # Add product-specific questions if we have products
-        if products and intent == "product_inquiry":
-            categories = list(set(str(p.get('category')) for p in products))
-            if categories:
-                questions = [f"Show me more {cat} products" for cat in categories[:2]] + questions
-        return questions[:4]
-# RAG system instance
-rag_system = RAGSystem()

+import google.generativeai as genai
 from sentence_transformers import SentenceTransformer
+from typing import List, Tuple, Dict, Any
 import asyncio
+from database import db
+from config import settings
+class ProductRAGPipeline:
+    """Retrieval-augmented shopping assistant.
+    Embeds the user query with a SentenceTransformer model, retrieves matching
+    product documents through ``db``, and asks Gemini to answer using them.
+    """
     def __init__(self):
+        # Initialize embedding model
         self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+        # Initialize Gemini
+        genai.configure(api_key=settings.GEMINI_API_KEY)
+        self.gemini_model = genai.GenerativeModel('gemini-2.5-flash')
+        # Enhanced personality for shopping assistant
+        self.personality_traits = """
+        You are a friendly, knowledgeable shopping assistant for a fashion e-commerce store. Your personality traits:
+        - Warm, approachable, and enthusiastic about fashion
+        - Helpful and patient with customer queries
+        - Knowledgeable about products, styles, and fashion trends
+        - Casual but professional tone, like a friendly store assistant
+        - Use emojis occasionally to express emotion (but don't overdo it)
+        - Ask follow-up questions to better understand customer needs
+        - Be concise but thorough in product recommendations
+        - Always mention key product features, price, and benefits
+        - If you suggest multiple products, compare them briefly
+        """
+    async def get_embeddings(self, texts: List[str]) -> List[List[float]]:
+        """Encode ``texts`` in the default executor and return one vector per text."""
+        # Inside a coroutine the running loop is the one to use; get_event_loop()
+        # is the legacy spelling and is deprecated for this purpose.
+        loop = asyncio.get_running_loop()
+        embeddings = await loop.run_in_executor(
+            None, self.embedding_model.encode, texts
+        )
+        return embeddings.tolist()
+    async def retrieve_relevant_products(self, query: str, limit: int = 3) -> List[Dict]:
+        """Retrieve up to ``limit`` product documents for ``query``.
+        Vector search is tried first; an empty result falls back to a category
+        search derived from the query, and any exception falls back to "tops".
+        """
         try:
+            # Try vector search first
+            query_embedding = await self.get_embeddings([query])
+            print(f"🔍 Performing vector search with embedding dim: {len(query_embedding[0])}")
+            relevant_docs = await db.similarity_search(query_embedding[0], limit=limit)
+            print(f"✅ Vector search returned {len(relevant_docs)} results")
+            if not relevant_docs:
+                print("🔄 No results from vector search, trying category-based search")
+                # Fallback to category-based search
+                category_keywords = self._extract_category_from_query(query)
+                if category_keywords:
+                    relevant_docs = await db.search_by_category(category_keywords[0], limit=limit)
+                    print(f"✅ Category search returned {len(relevant_docs)} results")
+            return relevant_docs
         except Exception as e:
+            print(f"❌ Error in vector search: {e}")
+            # Final fallback to generic product search
+            return await db.search_by_category("tops", limit=limit)
+    def _extract_category_from_query(self, query: str) -> List[str]:
+        """Return the categories whose keywords appear in ``query`` as whole words.
+        Keywords match case-insensitively with an optional plural suffix, so
+        "dresses" and "t-shirts" match while "what" no longer matches "hat".
+        Falls back to ``['tops']`` when nothing matches.
+        """
+        import re  # local import keeps the method self-contained
+        query_lower = query.lower()
+        category_mapping = {
+            'tops': ['top', 'shirt', 'blouse', 't-shirt', 'tshirt', 'crop top', 'spaghetti'],
+            'bottoms': ['pant', 'jeans', 'trouser', 'leggings', 'skirt', 'short'],
+            'dresses': ['dress', 'gown', 'frock'],
+            'outerwear': ['jacket', 'sweater', 'hoodie', 'cardigan', 'coat'],
+            'accessories': ['bag', 'jewelry', 'scarf', 'hat', 'belt']
+        }
+        categories = []
+        for category, keywords in category_mapping.items():
+            # Word boundaries stop keywords firing inside unrelated words
+            # ("hat" in "what", "top" in "laptop"); (?:es|s)? keeps plurals working.
+            alternatives = "|".join(re.escape(keyword) for keyword in keywords)
+            if re.search(r"\b(?:" + alternatives + r")(?:es|s)?\b", query_lower):
+                categories.append(category)
+        return categories if categories else ['tops']  # Default to tops
+    def create_product_prompt(self, query: str, products: List[Dict]) -> str:
+        """Build the Gemini prompt from the persona, the products' ``content`` and the query."""
+        if products:
+            context = "AVAILABLE PRODUCTS:\n"
+            for i, product in enumerate(products, 1):
+                context += f"{i}. {product['content']}\n"
+        else:
+            context = "No specific product information available at the moment."
+        prompt = f"""
+        {self.personality_traits}
+        {context}
+        USER QUESTION: {query}
+        INSTRUCTIONS:
+        1. Answer based primarily on the provided product information
+        2. If suggesting products, mention:
+           - Key features and benefits
+           - Price (if available)
+           - Why it might suit the user's needs
+        3. Be conversational and helpful
+        4. If the exact answer isn't in the products, use your general knowledge but be honest about limitations
+        5. Keep responses concise but complete (2-4 sentences usually)
+        6. Always maintain a friendly, shopping assistant tone
+        7. If multiple products are relevant, compare them briefly
+        SHOPPING ASSISTANT RESPONSE:
+        """
+        return prompt
+    async def generate_response(self, query: str) -> Tuple[str, List[Dict]]:
+        """Answer ``query`` and return ``(response_text, products_used)``.
+        Any failure is logged and converted into an apology message with an
+        empty product list, so this coroutine does not raise.
+        """
         try:
+            # Retrieve relevant products
+            relevant_products = await self.retrieve_relevant_products(query)
+            print(f"📦 Retrieved {len(relevant_products)} relevant products")
+            # Create context-aware prompt
+            prompt = self.create_product_prompt(query, relevant_products)
+            # generate_content is a blocking call; run it in the executor (as
+            # get_embeddings does) so the event loop keeps serving other requests.
+            loop = asyncio.get_running_loop()
+            response = await loop.run_in_executor(
+                None, self.gemini_model.generate_content, prompt
+            )
+            response_text = response.text.strip()
+            print(f"🤖 Generated response: {response_text[:100]}...")
+            return response_text, relevant_products
         except Exception as e:
+            print(f"❌ Error generating response: {e}")
+            fallback_msg = "I apologize, but I'm having trouble accessing our product information right now. Please try again in a moment or contact our customer service for immediate assistance. 😊"
+            return fallback_msg, []
+    def generate_followup_questions(self, query: str, products: List[Dict]) -> List[str]:
+        """Return up to five follow-up questions, query-specific ones first.
+        ``products`` is accepted for interface compatibility and is not used.
+        """
+        base_questions = [
+            "Tell me more about this product",
+            "What are the alternatives in different colors?",
+            "Do you have similar items in different price ranges?",
+            "What's the sizing like for these products?",
+            "Are any of these currently on sale?"
+        ]
+        # Context-aware questions are collected separately and placed first:
+        # appended after the five base questions they were always cut by the slice.
+        contextual_questions = []
+        query_lower = query.lower()
+        if any(word in query_lower for word in ['price', 'cost', 'expensive', 'cheap']):
+            contextual_questions.extend([
+                "What's the price range for similar items?",
+                "Are there any ongoing discounts?"
+            ])
+        if any(word in query_lower for word in ['color', 'colour', 'pattern']):
+            contextual_questions.extend([
+                "What other colors are available?",
+                "Do you have this in solid colors vs patterns?"
+            ])
+        return (contextual_questions + base_questions)[:5]  # Return top 5 questions
+# Global RAG pipeline instance, shared by importers of this module.
+# Constructing it here runs ProductRAGPipeline.__init__ at import time, which loads the
+# SentenceTransformer model and configures Gemini from settings.GEMINI_API_KEY.
+rag_pipeline = ProductRAGPipeline()

run.py CHANGED Viewed

@@ -1,11 +1,5 @@
 import uvicorn
-from config import settings
 if __name__ == "__main__":
-    uvicorn.run(
-        "main:app",
-        host="0.0.0.0",
-        port=settings.port,
-        reload=True,  # Enable auto-reload during development
-        log_level="info"
-    )

+from main import app
 import uvicorn
 if __name__ == "__main__":
+    # The application module is imported above as top-level ``main`` (and main.py
+    # launches itself with the same string), so the import string must be
+    # "main:app"; "app.main:app" points at a package this layout does not provide.
+    uvicorn.run("main:app", host="0.0.0.0", port=7860, reload=True)