# models/llm_handler.py
import json
import logging
import os
from typing import Any, Dict, List, Optional

import requests
import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM

from config.settings import Settings

logger = logging.getLogger(__name__)

load_dotenv()
class OpenRouterLLMHandler:
    def __init__(self, api_key: str = "", model: str = "mistralai/mistral-7b-instruct"):
        # Prefer the key from the environment; fall back to the constructor argument.
        env_key = os.getenv("OPENROUTER_API_KEY")
        self.api_key = env_key if env_key else api_key
        self.model = model
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.conversation_history: List[Dict[str, str]] = []
        print(f"🔌 Initialized OpenRouter handler with model: {model}")
def generate_response(self, prompt: str, context: Optional[str] = None, tools_output: Optional[str] = None) -> str:
try:
full_prompt = self._build_simple_prompt(prompt, context, tools_output)
headers = {
"Authorization": f"Bearer {self.api_key}",
"Content-Type": "application/json"
}
payload = {
"model": self.model,
"messages": [
{"role": "system", "content": "You are a helpful AI assistant."},
{"role": "user", "content": full_prompt}
],
"temperature": 0.7,
"max_tokens": 200
}
            # OpenRouter returns the OpenAI-compatible chat completions shape:
            # {"choices": [{"message": {"content": "..."}}], ...}
            response = requests.post(self.base_url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error generating response: {str(e)}"
def _build_simple_prompt(self, user_input: str, context: Optional[str] = None, tools_output: Optional[str] = None) -> str:
prompt_parts = []
if context and len(context) < 300:
prompt_parts.append(f"Context: {context}")
if tools_output and len(tools_output) < 200:
prompt_parts.append(f"Additional info: {tools_output}")
prompt_parts.append(f"User query: {user_input}")
return "\n\n".join(prompt_parts)
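    # Illustrative example (hypothetical values) of the string the builder above
    # returns when both optional fields fit the length limits:
    #
    #   Context: Ticket 42 is a login failure on the staging server.
    #
    #   Additional info: {"status": "open"}
    #
    #   User query: Summarize the ticket for the on-call engineer.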
def add_to_history(self, user_input: str, assistant_response: str):
"""
Add exchange to conversation history
"""
self.conversation_history.append({
'user': user_input,
'assistant': assistant_response
})
# Keep only recent history
if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]
def clear_history(self):
"""
Clear conversation history
"""
self.conversation_history = []
    def get_available_models(self) -> List[str]:
        """
        Get the list of model ids available through OpenRouter.
        """
        try:
            # OpenRouter publishes its model catalogue at /api/v1/models.
            response = requests.get(
                "https://openrouter.ai/api/v1/models",
                headers={"Authorization": f"Bearer {self.api_key}"},
                timeout=30
            )
            response.raise_for_status()
            return [model["id"] for model in response.json()["data"]]
        except Exception as e:
            logger.error(f"Error getting models: {e}")
            return [self.model]
    def switch_model(self, model_name: str) -> bool:
        """
        Switch to a different OpenRouter model.
        """
        try:
            self.model = model_name
            logger.info(f"Switched to model: {model_name}")
            return True
        except Exception as e:
            logger.error(f"Error switching to model {model_name}: {e}")
            return False
    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate an embedding for the given text.

        This handler only talks to OpenRouter's chat completions endpoint;
        embedding generation is not implemented, so an empty list is returned.
        """
        logger.warning("generate_embedding is not supported by OpenRouterLLMHandler")
        return []
# HuggingFace LLM Handler for Microsoft Phi-3 Mini
class HuggingFaceLLMHandler:
def __init__(self):
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import psutil
        self.model_name = "microsoft/Phi-3-mini-4k-instruct"
        self.conversation_history: List[Dict[str, str]] = []
        print("Loading model... this may take a moment on first run")
# Choose device and dtype intelligently
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
print(f"Using device: {device}, dtype: {torch_dtype}")
print(f"Available RAM: {psutil.virtual_memory().available / 1e6:.2f} MB")
# Load tokenizer
self.tokenizer = AutoTokenizer.from_pretrained(
self.model_name,
trust_remote_code=True
)
# Load model safely
try:
self.model = AutoModelForCausalLM.from_pretrained(
self.model_name,
torch_dtype=torch_dtype,
device_map="auto" if device.type == "cuda" else None,
low_cpu_mem_usage=True, # Helps reduce RAM footprint during init
trust_remote_code=True
)
# Explicitly move to CPU if needed
if device.type == "cpu":
self.model = self.model.to(device)
print("Model loaded successfully!")
        except RuntimeError as e:
            print(f"❌ Error loading model: {e}")
            print("Tip: Try switching to a smaller model or free up RAM.")
            # Re-raise so callers don't receive a half-initialized handler.
            raise
def generate_response(self, prompt: str, context: Optional[str] = None,
tools_output: Optional[str] = None) -> str:
"""
Generate response using Phi-3 - should be under 10 seconds
"""
try:
# Build simple prompt
full_prompt = self._build_simple_prompt(prompt, context, tools_output)
# Tokenize and move to same device as model
inputs = self.tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=1024)
inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
# Generate
with torch.no_grad():
outputs = self.model.generate(
inputs["input_ids"],
max_new_tokens=200, # Limit response length
temperature=0.7,
do_sample=True,
pad_token_id=self.tokenizer.eos_token_id,
attention_mask=inputs["attention_mask"]
)
# Decode response
response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
return response.strip()
except Exception as e:
logger.error(f"Error generating response: {e}")
return f"Error generating response: {str(e)}"
def _build_simple_prompt(self, user_input: str, context: Optional[str] = None,
tools_output: Optional[str] = None) -> str:
"""Simple prompt builder"""
prompt_parts = ["You are a helpful AI assistant."]
if context and len(context) < 300:
prompt_parts.append(f"Context: {context}")
if tools_output and len(tools_output) < 200:
prompt_parts.append(f"Additional info: {tools_output}")
prompt_parts.append(f"User: {user_input}")
prompt_parts.append("Assistant:")
return "\n\n".join(prompt_parts)
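    # Optional sketch (not wired in by default): Phi-3-mini is an instruction-tuned
    # chat model, so rendering the turns with the tokenizer's chat template is
    # usually closer to the format the model was trained on than the plain-text
    # prompt above. The single-user-turn layout below is an illustrative choice,
    # not taken from the original code.
    def _build_chat_prompt(self, user_input: str, context: Optional[str] = None,
                           tools_output: Optional[str] = None) -> str:
        parts = []
        if context and len(context) < 300:
            parts.append(f"Context: {context}")
        if tools_output and len(tools_output) < 200:
            parts.append(f"Additional info: {tools_output}")
        parts.append(user_input)
        messages = [{"role": "user", "content": "\n\n".join(parts)}]
        # add_generation_prompt=True appends the assistant marker so generation
        # continues as the assistant's reply.
        return self.tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )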
def add_to_history(self, user_input: str, assistant_response: str):
"""
Add exchange to conversation history
"""
self.conversation_history.append({
'user': user_input,
'assistant': assistant_response
})
# Keep only recent history
if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]
def clear_history(self):
"""
Clear conversation history
"""
self.conversation_history = []
    def get_available_models(self) -> List[str]:
        """
        Get the list of models this handler knows about.

        Only the currently loaded Hugging Face model is tracked locally,
        so that single name is returned.
        """
        return [self.model_name]
    def switch_model(self, model_name: str) -> bool:
        """
        Switch to a different Hugging Face model by reloading the tokenizer and weights.
        """
        try:
            torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch_dtype,
                device_map="auto" if torch.cuda.is_available() else None,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
            self.model_name = model_name
            logger.info(f"Switched to model: {model_name}")
            return True
        except Exception as e:
            logger.error(f"Error switching to model {model_name}: {e}")
            return False
    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate an embedding for the text.

        There is no separate embedding model in this handler, so a simple
        mean-pooled last hidden state from the loaded causal LM is used.
        """
        try:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)
            # Mean-pool the final hidden layer across the sequence dimension.
            embedding = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
            return embedding.float().cpu().tolist()
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return []
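# Minimal usage sketch (illustrative only): assumes OPENROUTER_API_KEY is set in the
# environment and that the Phi-3 weights can be downloaded on first use.
if __name__ == "__main__":
    handler = OpenRouterLLMHandler()
    print(handler.generate_response(
        "Summarize why unit tests matter.",
        context="The team is onboarding new contributors.",
    ))
    # Uncomment to exercise the local Hugging Face handler instead
    # (downloads several GB of weights on first run):
    # hf_handler = HuggingFaceLLMHandler()
    # print(hf_handler.generate_response("Summarize why unit tests matter."))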