#!/usr/bin/env python3
"""
Real LLM integration for RAG system.
Uses HuggingFace transformers with CPU optimizations.
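
Example (minimal sketch; the first call downloads the model from the Hub):

    llm = CPUOptimizedLLM("microsoft/phi-2")
    llm.initialize()
    answer = llm.generate_response("What is RAG?", "RAG pairs retrieval with generation.")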
"""
import sys
from pathlib import Path
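# Make the project root importable so the local config module resolves when run as a script.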
sys.path.insert(0, str(Path(__file__).parent.parent))

from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
import time
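# Assumed to be defined in the project's config.py, e.g. MAX_TOKENS = 256 and
# TEMPERATURE = 0.7 (an int token budget and a float sampling temperature).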
from config import MAX_TOKENS, TEMPERATURE

class CPUOptimizedLLM:
    """CPU-optimized LLM for RAG responses."""
    
    def __init__(self, model_name="microsoft/phi-2"):
        """
        Initialize a CPU-friendly model.
        Options: microsoft/phi-2, TinyLlama/TinyLlama-1.1B-Chat-v1.0, Qwen/Qwen2.5-0.5B
        """
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self._initialized = False
        
        # CPU optimization settings
        self.torch_dtype = torch.float32  # Use float32 for CPU
        self.device = "cpu"
        self.load_in_8bit = False  # Can't use 8-bit on CPU without special setup
        
    def initialize(self):
        """Lazy initialization of the model."""
        if self._initialized:
            return
            
        print(f"Loading LLM model: {self.model_name} (CPU optimized)...")
        start_time = time.time()
        
        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )
            
            # Add padding token if missing
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            
            # Load model with CPU optimizations
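            # Note: phi-2 has ~2.7B parameters, so its float32 weights alone
            # need roughly 11 GB of RAM; pick a smaller model if memory is tight.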
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=self.torch_dtype,
                device_map="cpu",
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
            
            # Create text generation pipeline
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=-1,  # CPU
                torch_dtype=self.torch_dtype
            )
            
            load_time = time.time() - start_time
            print(f"LLM loaded in {load_time:.1f}s")
            self._initialized = True
            
        except Exception as e:
            print(f"Error loading model {self.model_name}: {e}")
            print("Falling back to simulated LLM...")
            self._initialized = False
    
    def generate_response(self, question: str, context: str) -> str:
        """
        Generate a response using the LLM.
        
        Args:
            question: User's question
            context: Retrieved context chunks
            
        Returns:
            Generated answer
        """
        if not self._initialized:
            # Fallback to simulated response
            return self._generate_simulated_response(question, context)
        
        # Create prompt
        prompt = f"""Context information:
{context}

Based on the context above, answer the following question:
Question: {question}

Answer: """
        
        try:
            # Generate response
            start_time = time.perf_counter()
            
            outputs = self.pipeline(
                prompt,
                max_new_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                do_sample=True,
                top_p=0.95,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
            
            generation_time = (time.perf_counter() - start_time) * 1000
            
            # Extract response
            response = outputs[0]['generated_text']
            
            # The text-generation pipeline returns the prompt plus the completion
            # by default (return_full_text=True), so strip the prompt back off
            if response.startswith(prompt):
                response = response[len(prompt):].strip()
            
            print(f"  [Real LLM] Generation: {generation_time:.1f}ms")
            return response
            
        except Exception as e:
            print(f"  [Real LLM Error] {e}, falling back to simulated...")
            return self._generate_simulated_response(question, context)
    
    def _generate_simulated_response(self, question: str, context: str) -> str:
        """Fallback simulated response."""
        # Simulate generation latency: ~80ms for short contexts, ~200ms for longer ones
        time.sleep(0.08 if len(context) < 1000 else 0.2)
        
        if context:
            return f"Based on the context: {context[:300]}..."
        else:
            return "I don't have enough information to answer that question."
    
    def close(self):
        """Clean up model resources."""
        # Drop references instead of using `del` so close() is safe to call
        # more than once and the instance can be re-initialized later.
        self.model = None
        self.pipeline = None
        self.tokenizer = None
        if torch.cuda.is_available():
            torch.cuda.empty_cache()  # no-op on this CPU-only setup
        self._initialized = False

# Test the LLM integration
if __name__ == "__main__":
    llm = CPUOptimizedLLM("microsoft/phi-2")
    llm.initialize()
    
    test_context = """Machine learning is a subset of artificial intelligence that enables systems to learn and improve from experience without being explicitly programmed. There are three main types: supervised learning, unsupervised learning, and reinforcement learning."""
    
    test_question = "What is machine learning?"
    
    response = llm.generate_response(test_question, test_context)
    print(f"\nQuestion: {test_question}")
    print(f"Response: {response[:200]}...")
    
    llm.close()
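
# How this might plug into a retrieval pipeline (hypothetical sketch; `retriever`
# and its `search` method are assumed interfaces, not part of this module):
#
#     chunks = retriever.search(question, top_k=3)
#     context = "\n\n".join(chunk["text"] for chunk in chunks)
#     answer = llm.generate_response(question, context)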