SUBHRAJIT MOHANTY committed
Commit · 247920d
Parent(s): d1f7294
initial commit
Browse files
- Dockerfile +0 -0
- app.py +511 -0
- requirements.txt +15 -0
Dockerfile
ADDED
File without changes
app.py
ADDED
@@ -0,0 +1,511 @@
from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import StreamingResponse
from pydantic import BaseModel, Field
from typing import List, Optional, Dict, Any, AsyncGenerator
import asyncio
import json
import uuid
from datetime import datetime
import os
from contextlib import asynccontextmanager

# Third-party imports
from groq import AsyncGroq
from qdrant_client import AsyncQdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct, Filter, FieldCondition, MatchValue
from sentence_transformers import SentenceTransformer
import torch
from concurrent.futures import ThreadPoolExecutor

# Models for OpenAI-compatible API
class Message(BaseModel):
    role: str = Field(..., description="The role of the message author")
    content: str = Field(..., description="The content of the message")

class ChatCompletionRequest(BaseModel):
    model: str = Field(default="mixtral-8x7b-32768", description="Model to use")
    messages: List[Message] = Field(..., description="List of messages")
    max_tokens: Optional[int] = Field(default=1024, description="Maximum tokens to generate")
    temperature: Optional[float] = Field(default=0.7, description="Temperature for sampling")
    stream: Optional[bool] = Field(default=False, description="Whether to stream responses")
    top_p: Optional[float] = Field(default=1.0, description="Top-p sampling parameter")

class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[Dict[str, Any]]
    usage: Optional[Dict[str, int]] = None

class ChatCompletionChunk(BaseModel):
    id: str
    object: str = "chat.completion.chunk"
    created: int
    model: str
    choices: List[Dict[str, Any]]

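# Example (illustrative): a request body that validates against
# ChatCompletionRequest above; all field values are hypothetical.
#
#   {
#     "model": "mixtral-8x7b-32768",
#     "messages": [{"role": "user", "content": "What do the documents say about X?"}],
#     "max_tokens": 512,
#     "temperature": 0.2,
#     "stream": false
#   }
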
# Configuration
class Config:
    GROQ_API_KEY = os.getenv("GROQ_API_KEY")
    QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
    QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
    COLLECTION_NAME = os.getenv("COLLECTION_NAME", "documents")
    EMBEDDING_MODEL = os.getenv("EMBEDDING_MODEL", "sentence-transformers/all-MiniLM-L6-v2")
    TOP_K = int(os.getenv("TOP_K", "5"))
    SIMILARITY_THRESHOLD = float(os.getenv("SIMILARITY_THRESHOLD", "0.7"))
    DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")

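# Example environment (illustrative values; only GROQ_API_KEY is required,
# the rest fall back to the defaults above):
#
#   GROQ_API_KEY=gsk_...             # hypothetical key
#   QDRANT_URL=http://localhost:6333
#   COLLECTION_NAME=documents
#   TOP_K=5
#   SIMILARITY_THRESHOLD=0.7
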
# Global clients
groq_client = None
qdrant_client = None
embedding_service = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    # Startup
    global groq_client, qdrant_client

    if not Config.GROQ_API_KEY:
        raise ValueError("GROQ_API_KEY environment variable is required")

    groq_client = AsyncGroq(api_key=Config.GROQ_API_KEY)
    qdrant_client = AsyncQdrantClient(
        url=Config.QDRANT_URL,
        api_key=Config.QDRANT_API_KEY
    )

    # The embedding service is instantiated at module level below, so the
    # model is already loaded by the time the app starts.

    # Verify connections
    try:
        collections = await qdrant_client.get_collections()
        print(f"Connected to Qdrant. Available collections: {[c.name for c in collections.collections]}")
    except Exception as e:
        print(f"Warning: Could not connect to Qdrant: {e}")

    # Check embedding model
    try:
        print(f"Embedding model loaded: {Config.EMBEDDING_MODEL}")
        print(f"Model device: {Config.DEVICE}")
        print(f"Vector dimension: {embedding_service.dimension}")
    except Exception as e:
        print(f"Warning: Could not load embedding model: {e}")

    yield

    # Shutdown
    if qdrant_client:
        await qdrant_client.close()

# Initialize FastAPI app
app = FastAPI(
    title="RAG API with Groq and Qdrant",
    description="OpenAI-compatible API for RAG using Groq LLM and Qdrant vector database",
    version="1.0.0",
    lifespan=lifespan
)

class EmbeddingService:
    """Service for generating embeddings using sentence-transformers"""

    def __init__(self):
        self.model_name = Config.EMBEDDING_MODEL
        self.device = Config.DEVICE
        self.dimension = 384  # all-MiniLM-L6-v2 dimension
        self.executor = ThreadPoolExecutor(max_workers=4)

        # Load the model
        print(f"Loading embedding model: {self.model_name}")
        self.model = SentenceTransformer(self.model_name, device=self.device)
        print(f"Model loaded successfully on device: {self.device}")

    async def get_embedding(self, text: str) -> List[float]:
        """Generate embedding for given text"""
        try:
            # Run the synchronous model.encode in a thread pool
            loop = asyncio.get_running_loop()
            embedding = await loop.run_in_executor(
                self.executor,
                self._encode_text,
                text
            )
            return embedding.tolist()
        except Exception as e:
            print(f"Error generating embedding: {e}")
            return [0.1] * self.dimension

    def _encode_text(self, text: str):
        """Synchronous text encoding - runs in thread pool"""
        return self.model.encode([text])[0]

    async def get_document_embedding(self, text: str) -> List[float]:
        """Generate embedding for document text"""
        return await self.get_embedding(text)

    async def get_query_embedding(self, text: str) -> List[float]:
        """Generate embedding for query text"""
        return await self.get_embedding(text)

    async def batch_embed(self, texts: List[str]) -> List[List[float]]:
        """Generate embeddings for multiple texts efficiently"""
        try:
            loop = asyncio.get_running_loop()
            embeddings = await loop.run_in_executor(
                self.executor,
                self._batch_encode_texts,
                texts
            )
            return embeddings.tolist()
        except Exception as e:
            print(f"Error in batch embedding: {e}")
            return [[0.1] * self.dimension for _ in texts]

    def _batch_encode_texts(self, texts: List[str]):
        """Synchronous batch encoding - runs in thread pool"""
        return self.model.encode(texts)

    def health_check(self) -> dict:
        """Check embedding service health"""
        try:
            # Test encoding
            test_embedding = self.model.encode(["test"])
            return {
                "status": "healthy",
                "model": self.model_name,
                "device": self.device,
                "dimension": self.dimension,
                "test_embedding_shape": test_embedding.shape
            }
        except Exception as e:
            return {
                "status": "unhealthy",
                "model": self.model_name,
                "error": str(e)
            }

embedding_service = EmbeddingService()

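# Usage sketch (illustrative): embedding a single string outside the app,
# e.g. from a script or test.
#
#   emb = asyncio.run(embedding_service.get_embedding("hello world"))
#   assert len(emb) == 384
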
class RAGService:
    """Service for retrieval-augmented generation"""

    @staticmethod
    async def retrieve_relevant_chunks(query: str, top_k: int = Config.TOP_K) -> List[str]:
        """Retrieve relevant document chunks from Qdrant"""
        try:
            # Get query embedding - all-MiniLM works well without special prefixes
            query_embedding = await embedding_service.get_query_embedding(query)

            # Search in Qdrant
            search_results = await qdrant_client.search(
                collection_name=Config.COLLECTION_NAME,
                query_vector=query_embedding,
                limit=top_k,
                score_threshold=Config.SIMILARITY_THRESHOLD
            )

            # Extract content from results
            chunks = []
            for result in search_results:
                if hasattr(result, 'payload') and 'content' in result.payload:
                    chunks.append(result.payload['content'])
                elif hasattr(result, 'payload') and 'text' in result.payload:
                    chunks.append(result.payload['text'])

            print(f"Retrieved {len(chunks)} relevant chunks for query")
            return chunks

        except Exception as e:
            print(f"Error retrieving chunks: {e}")
            return []

    @staticmethod
    def build_context_prompt(query: str, chunks: List[str]) -> str:
        """Build a context-aware prompt with retrieved chunks"""
        if not chunks:
            return query

        context = "\n\n".join([f"Document {i+1}: {chunk}" for i, chunk in enumerate(chunks)])

        prompt = f"""Based on the following documents, please answer the user's question. If the information is not available in the documents, please say so.

Context Documents:
{context}

User Question: {query}

Please provide a helpful and accurate response based on the context provided."""

        return prompt

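# Example (illustrative): with chunks = ["Paris is the capital of France."],
# build_context_prompt("What is the capital of France?", chunks) embeds the
# chunk as "Document 1: ..." in the context block, followed by the question.
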
@app.get("/")
async def root():
    return {"message": "RAG API with Groq and Qdrant", "status": "running"}

@app.get("/health")
async def health_check():
    """Health check endpoint"""
    try:
        # Test Qdrant connection
        collections = await qdrant_client.get_collections()
        qdrant_status = "connected"
    except Exception as e:
        qdrant_status = f"error: {str(e)}"

    # Test embedding service
    embedding_health = embedding_service.health_check()

    return {
        "status": "healthy",
        "groq": "connected" if groq_client else "not configured",
        "qdrant": qdrant_status,
        "embedding_service": embedding_health,
        "collection": Config.COLLECTION_NAME,
        "embedding_model": Config.EMBEDDING_MODEL
    }

@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat completions endpoint with RAG"""

    if not groq_client:
        raise HTTPException(status_code=500, detail="Groq client not initialized")

    try:
        # Get the last user message for retrieval
        user_messages = [msg for msg in request.messages if msg.role == "user"]
        if not user_messages:
            raise HTTPException(status_code=400, detail="No user message found")

        last_user_message = user_messages[-1].content

        # Retrieve relevant chunks
        relevant_chunks = await RAGService.retrieve_relevant_chunks(last_user_message)

        # Build context-aware prompt
        if relevant_chunks:
            context_prompt = RAGService.build_context_prompt(last_user_message, relevant_chunks)

            # Replace the last user message with the context-enhanced version
            enhanced_messages = request.messages[:-1] + [Message(role="user", content=context_prompt)]
        else:
            enhanced_messages = request.messages

        # Convert to Groq format
        groq_messages = [{"role": msg.role, "content": msg.content} for msg in enhanced_messages]

        if request.stream:
            return StreamingResponse(
                stream_chat_completion(groq_messages, request),
                media_type="text/event-stream"  # server-sent events content type
            )
        else:
            return await create_chat_completion(groq_messages, request)

    except HTTPException:
        # Re-raise intentional HTTP errors (e.g. the 400 above) unchanged
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Internal server error: {str(e)}")

async def create_chat_completion(messages: List[Dict], request: ChatCompletionRequest) -> ChatCompletionResponse:
    """Create a non-streaming chat completion"""
    try:
        response = await groq_client.chat.completions.create(
            model=request.model,
            messages=messages,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stream=False
        )

        # Convert Groq response to OpenAI format
        return ChatCompletionResponse(
            id=f"chatcmpl-{uuid.uuid4().hex}",
            created=int(datetime.now().timestamp()),
            model=request.model,
            choices=[{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": response.choices[0].message.content
                },
                "finish_reason": response.choices[0].finish_reason
            }],
            usage={
                "prompt_tokens": response.usage.prompt_tokens,
                "completion_tokens": response.usage.completion_tokens,
                "total_tokens": response.usage.total_tokens
            }
        )

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error calling Groq API: {str(e)}")

async def stream_chat_completion(messages: List[Dict], request: ChatCompletionRequest) -> AsyncGenerator[str, None]:
    """Stream chat completion responses"""
    try:
        completion_id = f"chatcmpl-{uuid.uuid4().hex}"
        created = int(datetime.now().timestamp())

        stream = await groq_client.chat.completions.create(
            model=request.model,
            messages=messages,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            top_p=request.top_p,
            stream=True
        )

        async for chunk in stream:
            if chunk.choices and chunk.choices[0].delta:
                delta = chunk.choices[0].delta

                chunk_response = ChatCompletionChunk(
                    id=completion_id,
                    created=created,
                    model=request.model,
                    choices=[{
                        "index": 0,
                        "delta": {
                            "role": delta.role if hasattr(delta, 'role') and delta.role else None,
                            "content": delta.content if hasattr(delta, 'content') else None
                        },
                        "finish_reason": chunk.choices[0].finish_reason
                    }]
                )

                yield f"data: {chunk_response.model_dump_json()}\n\n"

        # Send final chunk
        final_chunk = ChatCompletionChunk(
            id=completion_id,
            created=created,
            model=request.model,
            choices=[{
                "index": 0,
                "delta": {},
                "finish_reason": "stop"
            }]
        )

        yield f"data: {final_chunk.model_dump_json()}\n\n"
        yield "data: [DONE]\n\n"

    except Exception as e:
        error_chunk = {
            "error": {
                "message": str(e),
                "type": "internal_error"
            }
        }
        yield f"data: {json.dumps(error_chunk)}\n\n"

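# Consumption sketch (illustrative): reading the SSE stream with httpx,
# assuming the server is reachable at http://localhost:8000.
#
#   import httpx
#   with httpx.stream("POST", "http://localhost:8000/v1/chat/completions",
#                     json={"messages": [{"role": "user", "content": "Hi"}],
#                           "stream": True}, timeout=None) as r:
#       for line in r.iter_lines():
#           if line.startswith("data: ") and line != "data: [DONE]":
#               print(line[len("data: "):])
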
# Additional endpoints for managing the vector database
@app.post("/v1/embeddings/add")
async def add_document(content: str, metadata: Optional[Dict] = None):
    """Add a document to the vector database"""
    try:
        # Generate embedding for document
        embedding = await embedding_service.get_document_embedding(content)

        # Create point
        point = PointStruct(
            id=str(uuid.uuid4()),
            vector=embedding,
            payload={
                "content": content,
                "metadata": metadata or {},
                "timestamp": datetime.now().isoformat()
            }
        )

        # Insert into Qdrant
        await qdrant_client.upsert(
            collection_name=Config.COLLECTION_NAME,
            points=[point]
        )

        return {"message": "Document added successfully", "id": point.id}

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error adding document: {str(e)}")

@app.post("/v1/embeddings/batch_add")
async def batch_add_documents(documents: List[Dict[str, Any]]):
    """Add multiple documents to the vector database"""
    try:
        # Extract texts and metadata
        texts = [doc.get("content", "") for doc in documents]
        metadatas = [doc.get("metadata", {}) for doc in documents]

        # Generate embeddings for all documents
        embeddings = await embedding_service.batch_embed(texts)

        # Create points
        points = []
        for text, embedding, metadata in zip(texts, embeddings, metadatas):
            point = PointStruct(
                id=str(uuid.uuid4()),
                vector=embedding,
                payload={
                    "content": text,
                    "metadata": metadata,
                    "timestamp": datetime.now().isoformat()
                }
            )
            points.append(point)

        # Insert all points into Qdrant
        await qdrant_client.upsert(
            collection_name=Config.COLLECTION_NAME,
            points=points
        )

        return {
            "message": f"Successfully added {len(points)} documents",
            "ids": [point.id for point in points]
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error adding documents: {str(e)}")

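# Example payload for /v1/embeddings/batch_add (illustrative):
#
#   [
#     {"content": "Qdrant is a vector database.", "metadata": {"source": "notes"}},
#     {"content": "Groq serves LLMs with low latency.", "metadata": {"source": "notes"}}
#   ]
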
@app.post("/v1/embeddings/create_collection")
async def create_collection():
    """Create a new collection in Qdrant with the correct vector size"""
    try:
        # VectorParams and Distance are imported at module level
        await qdrant_client.create_collection(
            collection_name=Config.COLLECTION_NAME,
            vectors_config=VectorParams(
                size=embedding_service.dimension,  # 384 for all-MiniLM-L6-v2
                distance=Distance.COSINE
            )
        )

        return {
            "message": f"Collection '{Config.COLLECTION_NAME}' created successfully",
            "vector_size": embedding_service.dimension,
            "distance": "cosine"
        }

    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error creating collection: {str(e)}")

@app.get("/v1/collections/info")
async def get_collection_info():
    """Get information about the collection"""
    try:
        collection_info = await qdrant_client.get_collection(Config.COLLECTION_NAME)
        return {
            "name": Config.COLLECTION_NAME,
            "vectors_count": collection_info.vectors_count,
            "status": collection_info.status
        }
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error getting collection info: {str(e)}")

if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=8000)
requirements.txt
ADDED
@@ -0,0 +1,15 @@
fastapi==0.104.1
uvicorn[standard]==0.24.0
groq==0.4.1
qdrant-client==1.7.0
sentence-transformers==2.2.2
torch==2.1.1
pydantic==2.5.0
httpx==0.25.2
numpy==1.24.3
transformers==4.36.0
tokenizers==0.15.0
huggingface-hub==0.19.4
scipy==1.11.4
scikit-learn==1.3.2
python-multipart==0.0.6
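
A minimal end-to-end client sketch for the API above, assuming the server is running locally on port 8000 (endpoint paths come from app.py; all values and document text are illustrative):

import httpx

BASE = "http://localhost:8000"

# One-time setup: create the collection, then index a document.
# add_document takes `content` as a query parameter, per its signature.
httpx.post(f"{BASE}/v1/embeddings/create_collection")
httpx.post(f"{BASE}/v1/embeddings/add", params={"content": "Qdrant stores vectors."})

# Ask a question; the server retrieves relevant chunks and calls Groq
resp = httpx.post(f"{BASE}/v1/chat/completions", json={
    "messages": [{"role": "user", "content": "What does Qdrant store?"}],
    "stream": False,
}, timeout=60.0)
print(resp.json()["choices"][0]["message"]["content"])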