# rag-latency-optimization / app/llm_integration.py
# Author: Ariyan-Pro — "Deploy RAG Latency Optimization v1.0" (commit 04ab625)
#!/usr/bin/env python3
"""
Real LLM integration for RAG system.
Uses HuggingFace transformers with CPU optimizations.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from typing import List, Dict, Any
import time
from config import MAX_TOKENS, TEMPERATURE
class CPUOptimizedLLM:
    """CPU-optimized LLM wrapper for generating RAG answers.

    Loads a HuggingFace causal LM lazily on first use. If loading or
    generation fails for any reason, the wrapper degrades gracefully to a
    fast simulated response instead of raising, so the RAG pipeline keeps
    serving answers.
    """

    def __init__(self, model_name: str = "microsoft/phi-2") -> None:
        """
        Initialize a CPU-friendly model (nothing is loaded yet).

        Args:
            model_name: HuggingFace model id. Known CPU-friendly options:
                microsoft/phi-2, TinyLlama/TinyLlama-1.1B, Qwen/Qwen2.5-0.5B
        """
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self._initialized = False
        # CPU optimization settings
        self.torch_dtype = torch.float32  # float32: CPUs lack fast fp16 kernels
        self.device = "cpu"
        self.load_in_8bit = False  # 8-bit quantization needs special setup on CPU

    def initialize(self) -> None:
        """Lazily load tokenizer, model, and generation pipeline (idempotent).

        On failure the instance stays uninitialized and generate_response()
        will serve simulated answers instead of raising.
        """
        if self._initialized:
            return
        print(f"Loading LLM model: {self.model_name} (CPU optimized)...")
        start_time = time.time()
        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )
            # Some causal LMs ship without a pad token; reuse EOS for padding.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            # Load model with CPU optimizations
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=self.torch_dtype,
                device_map="cpu",
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
            # Create text generation pipeline
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=-1,  # -1 selects CPU in transformers pipelines
                torch_dtype=self.torch_dtype
            )
            load_time = time.time() - start_time
            print(f"LLM loaded in {load_time:.1f}s")
            self._initialized = True
        except Exception as e:
            # Best-effort: report the failure and fall back to simulation.
            print(f"Error loading model {self.model_name}: {e}")
            print("Falling back to simulated LLM...")
            self._initialized = False

    def generate_response(self, question: str, context: str) -> str:
        """
        Generate a response using the LLM.

        Args:
            question: User's question
            context: Retrieved context chunks

        Returns:
            Generated answer (simulated if the real model is unavailable).
        """
        # Also guard on the pipeline itself: close() releases it, and a stale
        # _initialized flag must not lead to a None dereference.
        if not self._initialized or self.pipeline is None:
            # Fallback to simulated response
            return self._generate_simulated_response(question, context)
        # Create prompt
        prompt = f"""Context information:
{context}
Based on the context above, answer the following question:
Question: {question}
Answer: """
        try:
            # Generate response
            start_time = time.perf_counter()
            outputs = self.pipeline(
                prompt,
                max_new_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                do_sample=True,
                top_p=0.95,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
            generation_time = (time.perf_counter() - start_time) * 1000
            # Extract response
            response = outputs[0]['generated_text']
            # The pipeline echoes the prompt; strip it to return only the answer.
            if response.startswith(prompt):
                response = response[len(prompt):].strip()
            print(f"  [Real LLM] Generation: {generation_time:.1f}ms")
            return response
        except Exception as e:
            print(f"  [Real LLM Error] {e}, falling back to simulated...")
            return self._generate_simulated_response(question, context)

    def _generate_simulated_response(self, question: str, context: str) -> str:
        """Fallback simulated response."""
        # Simulate generation time (80ms for optimized, 200ms for naive)
        time.sleep(0.08 if len(context) < 1000 else 0.2)
        if context:
            return f"Based on the context: {context[:300]}..."
        else:
            return "I don't have enough information to answer that question."

    def close(self) -> None:
        """Clean up model resources.

        Fix: the pipeline and tokenizer kept references to the model, so
        deleting self.model alone never actually freed the weights. Drop
        every reference before releasing the model.
        """
        self.pipeline = None
        self.tokenizer = None
        if self.model is not None:
            del self.model
            self.model = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        self._initialized = False
# Smoke test: exercise the LLM wrapper end-to-end from the command line.
if __name__ == "__main__":
    engine = CPUOptimizedLLM("microsoft/phi-2")
    engine.initialize()

    sample_context = (
        "Machine learning is a subset of artificial intelligence that enables "
        "systems to learn and improve from experience without being explicitly "
        "programmed. There are three main types: supervised learning, "
        "unsupervised learning, and reinforcement learning."
    )
    sample_question = "What is machine learning?"

    answer = engine.generate_response(sample_question, sample_context)
    print(f"\nQuestion: {sample_question}")
    print(f"Response: {answer[:200]}...")
    engine.close()