Spaces:
Sleeping
Sleeping
File size: 5,713 Bytes
04ab625 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
#!/usr/bin/env python3
"""
Real LLM integration for RAG system.
Uses HuggingFace transformers with CPU optimizations.
"""
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import torch
from typing import List, Dict, Any
import time
from config import MAX_TOKENS, TEMPERATURE
class CPUOptimizedLLM:
    """CPU-optimized LLM for RAG responses.

    Loads a small causal LM via HuggingFace transformers with CPU-friendly
    settings. If loading fails (no network, missing weights, etc.) the
    instance stays uninitialized and falls back to a fast simulated answer,
    so callers never have to handle load errors themselves.
    """

    def __init__(self, model_name="microsoft/phi-2"):
        """
        Initialize a CPU-friendly model.

        Args:
            model_name: HuggingFace model id. Known-good CPU options:
                microsoft/phi-2, TinyLlama/TinyLlama-1.1B, Qwen/Qwen2.5-0.5B

        Note: no model weights are touched here — loading is deferred to
        initialize() so construction is always cheap.
        """
        self.model_name = model_name
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self._initialized = False
        # CPU optimization settings
        self.torch_dtype = torch.float32  # float32: CPUs lack fast fp16 kernels
        self.device = "cpu"
        self.load_in_8bit = False  # 8-bit needs bitsandbytes/GPU; unsupported on plain CPU

    def initialize(self):
        """Lazily load tokenizer, model and text-generation pipeline.

        Safe to call repeatedly; subsequent calls are no-ops once loaded.
        On any load failure the error is reported and the instance remains
        uninitialized, which routes generate_response() to the simulated
        fallback instead of raising.
        """
        if self._initialized:
            return
        print(f"Loading LLM model: {self.model_name} (CPU optimized)...")
        start_time = time.time()
        try:
            # Load tokenizer
            self.tokenizer = AutoTokenizer.from_pretrained(
                self.model_name,
                trust_remote_code=True
            )
            # Some causal-LM tokenizers ship without a pad token; reuse EOS
            # so batching/padding in the pipeline does not fail.
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token
            # Load model with CPU optimizations
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=self.torch_dtype,
                device_map="cpu",
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
            # Create text generation pipeline
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                device=-1,  # -1 selects CPU in transformers pipelines
                torch_dtype=self.torch_dtype
            )
            load_time = time.time() - start_time
            print(f"LLM loaded in {load_time:.1f}s")
            self._initialized = True
        except Exception as e:
            # Deliberate best-effort: report and fall back rather than crash.
            print(f"Error loading model {self.model_name}: {e}")
            print("Falling back to simulated LLM...")
            self._initialized = False

    def generate_response(self, question: str, context: str) -> str:
        """
        Generate a response using the LLM.

        Args:
            question: User's question
            context: Retrieved context chunks

        Returns:
            Generated answer (simulated text if the real model is unavailable
            or generation raises).
        """
        if not self._initialized:
            # Fallback to simulated response
            return self._generate_simulated_response(question, context)
        # Build the grounding prompt. Keep the literal text stable: the
        # startswith() strip below depends on it matching the model echo.
        prompt = f"""Context information:
{context}
Based on the context above, answer the following question:
Question: {question}
Answer: """
        try:
            start_time = time.perf_counter()
            outputs = self.pipeline(
                prompt,
                max_new_tokens=MAX_TOKENS,
                temperature=TEMPERATURE,
                do_sample=True,
                top_p=0.95,
                pad_token_id=self.tokenizer.pad_token_id,
                eos_token_id=self.tokenizer.eos_token_id,
                num_return_sequences=1
            )
            generation_time = (time.perf_counter() - start_time) * 1000
            # The pipeline echoes the prompt in generated_text; drop it so
            # callers receive only the answer.
            response = outputs[0]['generated_text']
            if response.startswith(prompt):
                response = response[len(prompt):].strip()
            print(f"  [Real LLM] Generation: {generation_time:.1f}ms")
            return response
        except Exception as e:
            print(f"  [Real LLM Error] {e}, falling back to simulated...")
            return self._generate_simulated_response(question, context)

    def _generate_simulated_response(self, question: str, context: str) -> str:
        """Fallback simulated response.

        Mimics real latency so callers' timing instrumentation stays sane:
        80ms for short contexts, 200ms for long ones.
        """
        time.sleep(0.08 if len(context) < 1000 else 0.2)
        if context:
            return f"Based on the context: {context[:300]}..."
        else:
            return "I don't have enough information to answer that question."

    def close(self):
        """Release model resources. Idempotent: safe to call repeatedly.

        Drops the model AND the pipeline references (the pipeline holds its
        own strong reference to the model, so clearing only self.model would
        leak the weights).
        """
        if self.model is not None:
            self.model = None
            self.pipeline = None
            if torch.cuda.is_available():
                torch.cuda.empty_cache()
        self._initialized = False
# Manual smoke test: run one question through the real (or simulated) LLM.
if __name__ == "__main__":
    engine = CPUOptimizedLLM("microsoft/phi-2")
    engine.initialize()
    demo_context = (
        "Machine learning is a subset of artificial intelligence that enables "
        "systems to learn and improve from experience without being explicitly "
        "programmed. There are three main types: supervised learning, "
        "unsupervised learning, and reinforcement learning."
    )
    demo_question = "What is machine learning?"
    answer = engine.generate_response(demo_question, demo_context)
    print(f"\nQuestion: {demo_question}")
    print(f"Response: {answer[:200]}...")
    engine.close()
|