# models/llm_handler.py
import json
import logging
import os
from typing import Any, Dict, List, Optional

import requests
import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM

from config.settings import Settings

logger = logging.getLogger(__name__)

load_dotenv()
class OpenRouterLLMHandler:
    def __init__(self, api_key: str = "", model: str = "mistralai/mistral-7b-instruct"):
        # Prefer the key from the environment; fall back to the constructor argument.
        env_key = os.getenv("OPENROUTER_API_KEY")
        self.api_key = env_key if env_key else api_key
        self.model = model
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.conversation_history: List[Dict[str, str]] = []
        print(f"🔌 Initialized OpenRouter handler with model: {model}")
def generate_response(self, prompt: str, context: Optional[str] = None, tools_output: Optional[str] = None) -> str:
try:
full_prompt = self._build_simple_prompt(prompt, context, tools_output)
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
payload = {
"model": self.model,
"messages": [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": full_prompt}
],
"temperature": 0.7,
"max_tokens": 200
}
            # OpenRouter returns the OpenAI-compatible chat completions shape:
            # {"choices": [{"message": {"content": "..."}}], ...}
            response = requests.post(self.base_url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error generating response: {str(e)}"
def _build_simple_prompt(self, user_input: str, context: Optional[str] = None, tools_output: Optional[str] = None) -> str:
prompt_parts = []
if context and len(context) < 300:
prompt_parts.append(f"Context: {context}")
if tools_output and len(tools_output) < 200:
prompt_parts.append(f"Additional info: {tools_output}")
prompt_parts.append(f"User query: {user_input}")
return "\n\n".join(prompt_parts)
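    # Illustrative example (hypothetical values) of the string the builder above
    # returns when both optional fields fit the length limits:
    #
    #   Context: Ticket 42 is a login failure on the staging server.
    #
    #   Additional info: {"status": "open"}
    #
    #   User query: Summarize the ticket for the on-call engineer.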
def add_to_history(self, user_input: str, assistant_response: str):
"""
Add exchange to conversation history
"""
self.conversation_history.append({
'user': user_input,
'assistant': assistant_response
})
# Keep only recent history
if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]
def clear_history(self):
"""
Clear conversation history
"""
self.conversation_history = []
    def get_available_models(self) -> List[str]:
        """
        Get the list of model ids available through OpenRouter.
        """
        try:
            # OpenRouter publishes its model catalogue at /api/v1/models.
            response = requests.get(
                "https://openrouter.ai/api/v1/models",
                headers={"Authorization": f"Bearer {self.api_key}"},
                timeout=30
            )
            response.raise_for_status()
            return [model["id"] for model in response.json()["data"]]
        except Exception as e:
            logger.error(f"Error getting models: {e}")
            return [self.model]
    def switch_model(self, model_name: str) -> bool:
        """
        Switch to a different OpenRouter model.
        """
        try:
            self.model = model_name
            logger.info(f"Switched to model: {model_name}")
            return True
        except Exception as e:
            logger.error(f"Error switching to model {model_name}: {e}")
            return False
    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate an embedding for the given text.

        This handler only talks to OpenRouter's chat completions endpoint;
        embedding generation is not implemented, so an empty list is returned.
        """
        logger.warning("generate_embedding is not supported by OpenRouterLLMHandler")
        return []
# HuggingFace LLM Handler for Microsoft Phi-3 Mini
class HuggingFaceLLMHandler:
def __init__(self):
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import psutil
        self.model_name = "microsoft/Phi-3-mini-4k-instruct"
        self.conversation_history: List[Dict[str, str]] = []
        print("Loading model... this may take a moment on first run")
# Choose device and dtype intelligently
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
print(f"Using device: {device}, dtype: {torch_dtype}")
print(f"Available RAM: {psutil.virtual_memory().available / 1e6:.2f} MB")
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
trust_remote_code=True
)
# Load model safely
try:
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch_dtype,
device_map="auto" if device.type == "cuda" else None,
low_cpu_mem_usage=True, # Helps reduce RAM footprint during init
trust_remote_code=True
)
# Explicitly move to CPU if needed
if device.type == "cpu":
self.model = self.model.to(device)
print("Model loaded successfully!")
        except RuntimeError as e:
            print(f"❌ Error loading model: {e}")
            print("Tip: Try switching to a smaller model or free up RAM.")
            # Re-raise so callers don't receive a half-initialized handler.
            raise
def generate_response(self, prompt: str, context: Optional[str] = None,
tools_output: Optional[str] = None) -> str:
"""
Generate response using Phi-3 - should be under 10 seconds
"""
try:
# Build simple prompt
full_prompt = self._build_simple_prompt(prompt, context, tools_output)
# Tokenize and move to same device as model
inputs = self.tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=1024)
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
# Generate
with torch.no_grad():
outputs = self.model.generate(
inputs["input_ids"],
max_new_tokens=200, # Limit response length
temperature=0.7,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
attention_mask=inputs["attention_mask"]
)
# Decode response
response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
return response.strip()
except Exception as e:
logger.error(f"Error generating response: {e}")
return f"Error generating response: {str(e)}"
def _build_simple_prompt(self, user_input: str, context: Optional[str] = None,
tools_output: Optional[str] = None) -> str:
"""Simple prompt builder"""
prompt_parts = ["You are a helpful AI assistant."]
if context and len(context) < 300:
prompt_parts.append(f"Context: {context}")
if tools_output and len(tools_output) < 200:
prompt_parts.append(f"Additional info: {tools_output}")
prompt_parts.append(f"User: {user_input}")
prompt_parts.append("Assistant:")
return "\n\n".join(prompt_parts)
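    # Optional sketch (not wired in by default): Phi-3-mini is an instruction-tuned
    # chat model, so rendering the turns with the tokenizer's chat template is
    # usually closer to the format the model was trained on than the plain-text
    # prompt above. The single-user-turn layout below is an illustrative choice,
    # not taken from the original code.
    def _build_chat_prompt(self, user_input: str, context: Optional[str] = None,
                           tools_output: Optional[str] = None) -> str:
        parts = []
        if context and len(context) < 300:
            parts.append(f"Context: {context}")
        if tools_output and len(tools_output) < 200:
            parts.append(f"Additional info: {tools_output}")
        parts.append(user_input)
        messages = [{"role": "user", "content": "\n\n".join(parts)}]
        # add_generation_prompt=True appends the assistant marker so generation
        # continues as the assistant's reply.
        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )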
def add_to_history(self, user_input: str, assistant_response: str):
"""
Add exchange to conversation history
"""
self.conversation_history.append({
'user': user_input,
'assistant': assistant_response
})
# Keep only recent history
if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]
def clear_history(self):
"""
Clear conversation history
"""
self.conversation_history = []
    def get_available_models(self) -> List[str]:
        """
        Get the list of models this handler knows about.

        Only the currently loaded Hugging Face model is tracked locally,
        so that single name is returned.
        """
        return [self.model_name]
    def switch_model(self, model_name: str) -> bool:
        """
        Switch to a different Hugging Face model by reloading the tokenizer and weights.
        """
        try:
            torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                device_map="auto" if torch.cuda.is_available() else None,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
            self.model_name = model_name
            logger.info(f"Switched to model: {model_name}")
            return True
        except Exception as e:
            logger.error(f"Error switching to model {model_name}: {e}")
            return False
    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate an embedding for the text.

        There is no separate embedding model in this handler, so a simple
        mean-pooled last hidden state from the loaded causal LM is used.
        """
        try:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)
            # Mean-pool the final hidden layer across the sequence dimension.
            embedding = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
            return embedding.float().cpu().tolist()
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return []
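# Minimal usage sketch (illustrative only): assumes OPENROUTER_API_KEY is set in the
# environment and that the Phi-3 weights can be downloaded on first use.
if __name__ == "__main__":
    handler = OpenRouterLLMHandler()
    print(handler.generate_response(
        "Summarize why unit tests matter.",
        context="The team is onboarding new contributors.",
    ))
    # Uncomment to exercise the local Hugging Face handler instead
    # (downloads several GB of weights on first run):
    # hf_handler = HuggingFaceLLMHandler()
    # print(hf_handler.generate_response("Summarize why unit tests matter."))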