# rag-latency-optimization / app/llm_integration.py
# Author: Ariyan-Pro — "Deploy RAG Latency Optimization v1.0" (commit 04ab625)
#!/usr/bin/env python3
"""
Real LLM integration for RAG system.
Uses HuggingFace transformers with CPU optimizations.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from typing import List, Dict, Any
import time
from config import MAX_TOKENS, TEMPERATURE
class CPUOptimizedLLM:
    """CPU-optimized LLM wrapper for generating RAG answers.

    Loads a HuggingFace causal LM lazily on first use. If loading or
    generation fails for any reason, the wrapper degrades gracefully to a
    fast simulated response instead of raising, so the RAG pipeline keeps
    serving answers.
    """

    def __init__(self, model_name: str = "microsoft/phi-2") -> None:
        """
        Initialize a CPU-friendly model (nothing is loaded yet).

        Args:
            model_name: HuggingFace model id. Known CPU-friendly options:
                microsoft/phi-2, TinyLlama/TinyLlama-1.1B, Qwen/Qwen2.5-0.5B
        """
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self._initialized = False
        # CPU optimization settings
        self.torch_dtype = torch.float32  # float32: CPUs lack fast fp16 kernels
        self.device = "cpu"
        self.load_in_8bit = False  # 8-bit quantization needs special setup on CPU

    def initialize(self) -> None:
        """Lazily load tokenizer, model, and generation pipeline (idempotent).

        On failure the instance stays uninitialized and generate_response()
        will serve simulated answers instead of raising.
        """
        if self._initialized:
            return
        print(f"Loading LLM model: {self.model_name} (CPU optimized)...")
        start_time = time.time()
        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )
            # Some causal LMs ship without a pad token; reuse EOS for padding.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            # Load model with CPU optimizations
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=self.torch_dtype,
                device_map="cpu",
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
            # Create text generation pipeline
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=-1,  # -1 selects CPU in transformers pipelines
                torch_dtype=self.torch_dtype
            )
            load_time = time.time() - start_time
            print(f"LLM loaded in {load_time:.1f}s")
            self._initialized = True
        except Exception as e:
            # Best-effort: report the failure and fall back to simulation.
            print(f"Error loading model {self.model_name}: {e}")
            print("Falling back to simulated LLM...")
            self._initialized = False

    def generate_response(self, question: str, context: str) -> str:
        """
        Generate a response using the LLM.

        Args:
            question: User's question
            context: Retrieved context chunks

        Returns:
            Generated answer (simulated if the real model is unavailable).
        """
        # Also guard on the pipeline itself: close() releases it, and a stale
        # _initialized flag must not lead to a None dereference.
        if not self._initialized or self.pipeline is None:
            # Fallback to simulated response
            return self._generate_simulated_response(question, context)
        # Create prompt
        prompt = f"""Context information:
{context}
Based on the context above, answer the following question:
Question: {question}
Answer: """
        try:
            # Generate response
            start_time = time.perf_counter()
            outputs = self.pipeline(
                prompt,
                max_new_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                do_sample=True,
                top_p=0.95,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
            generation_time = (time.perf_counter() - start_time) * 1000
            # Extract response
            response = outputs[0]['generated_text']
            # The pipeline echoes the prompt; strip it to return only the answer.
            if response.startswith(prompt):
                response = response[len(prompt):].strip()
            print(f"  [Real LLM] Generation: {generation_time:.1f}ms")
            return response
        except Exception as e:
            print(f"  [Real LLM Error] {e}, falling back to simulated...")
            return self._generate_simulated_response(question, context)

    def _generate_simulated_response(self, question: str, context: str) -> str:
        """Fallback simulated response."""
        # Simulate generation time (80ms for optimized, 200ms for naive)
        time.sleep(0.08 if len(context) < 1000 else 0.2)
        if context:
            return f"Based on the context: {context[:300]}..."
        else:
            return "I don't have enough information to answer that question."

    def close(self) -> None:
        """Clean up model resources.

        Fix: the pipeline and tokenizer kept references to the model, so
        deleting self.model alone never actually freed the weights. Drop
        every reference before releasing the model.
        """
        self.pipeline = None
        self.tokenizer = None
        if self.model is not None:
            del self.model
            self.model = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        self._initialized = False
# Smoke test: exercise the LLM wrapper end-to-end from the command line.
if __name__ == "__main__":
    engine = CPUOptimizedLLM("microsoft/phi-2")
    engine.initialize()

    sample_context = (
        "Machine learning is a subset of artificial intelligence that enables "
        "systems to learn and improve from experience without being explicitly "
        "programmed. There are three main types: supervised learning, "
        "unsupervised learning, and reinforcement learning."
    )
    sample_question = "What is machine learning?"

    answer = engine.generate_response(sample_question, sample_context)
    print(f"\nQuestion: {sample_question}")
    print(f"Response: {answer[:200]}...")
    engine.close()