import logging
import os
from typing import Dict, List, Optional

import psutil
import requests
import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM

from config.settings import Settings

logger = logging.getLogger(__name__)

load_dotenv()
class OpenRouterLLMHandler:
    def __init__(self, api_key: str = "", model: str = "mistralai/mistral-7b-instruct"):
        # Prefer the key from the environment; fall back to the constructor argument.
        env_key = os.getenv("OPENROUTER_API_KEY")
        self.api_key = env_key if env_key else api_key
        self.model = model
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        self.conversation_history: List[Dict[str, str]] = []
        print(f"🔌 Initialized OpenRouter handler with model: {model}")
    def generate_response(self, prompt: str, context: Optional[str] = None,
                          tools_output: Optional[str] = None) -> str:
        """Generate a chat completion via the OpenRouter API."""
        try:
            full_prompt = self._build_simple_prompt(prompt, context, tools_output)
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
            payload = {
                "model": self.model,
                "messages": [
                    {"role": "system", "content": "You are a helpful AI assistant."},
                    {"role": "user", "content": full_prompt}
                ],
                "temperature": 0.7,
                "max_tokens": 200
            }
            response = requests.post(self.base_url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error generating response: {str(e)}"
    def _build_simple_prompt(self, user_input: str, context: Optional[str] = None,
                             tools_output: Optional[str] = None) -> str:
        """Assemble a compact prompt from the query plus optional context."""
        prompt_parts = []
        if context and len(context) < 300:
            prompt_parts.append(f"Context: {context}")
        if tools_output and len(tools_output) < 200:
            prompt_parts.append(f"Additional info: {tools_output}")
        prompt_parts.append(f"User query: {user_input}")
        return "\n\n".join(prompt_parts)
    def add_to_history(self, user_input: str, assistant_response: str):
        """Add an exchange to the conversation history."""
        self.conversation_history.append({
            'user': user_input,
            'assistant': assistant_response
        })
        # Keep only recent history
        if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
            self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]

    def clear_history(self):
        """Clear the conversation history."""
        self.conversation_history = []
    def get_available_models(self) -> List[str]:
        """Get the list of models available through the OpenRouter API."""
        try:
            # OpenRouter publishes its model catalog at GET /api/v1/models.
            response = requests.get("https://openrouter.ai/api/v1/models", timeout=30)
            response.raise_for_status()
            return [model["id"] for model in response.json()["data"]]
        except Exception as e:
            logger.error(f"Error getting models: {e}")
            return [Settings.DEFAULT_MODEL]
    def switch_model(self, model_name: str) -> bool:
        """Switch to a different OpenRouter model (validated on the next request)."""
        self.model = model_name
        logger.info(f"Switched to model: {model_name}")
        return True
    def generate_embedding(self, text: str) -> List[float]:
        """Generate embeddings for text.

        This handler only talks to OpenRouter's chat completions endpoint and
        has no embedding backend, so it logs a warning and returns an empty list.
        """
        logger.warning("generate_embedding is not supported by OpenRouterLLMHandler")
        return []
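# Example usage (a minimal sketch; assumes OPENROUTER_API_KEY is set in the
# environment or in a .env file):
#
#   handler = OpenRouterLLMHandler()
#   reply = handler.generate_response("Summarize retrieval-augmented generation.")
#   handler.add_to_history("Summarize retrieval-augmented generation.", reply)
#   print(reply)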
# HuggingFace LLM handler for Microsoft Phi-3 Mini.
class HuggingFaceLLMHandler:
    def __init__(self):
        self.model_name = "microsoft/Phi-3-mini-4k-instruct"
        self.conversation_history: List[Dict[str, str]] = []
        print("Loading model... this may take a moment on first run")

        # Choose device and dtype intelligently
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
        print(f"Using device: {device}, dtype: {torch_dtype}")
        print(f"Available RAM: {psutil.virtual_memory().available / 1e6:.2f} MB")

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True
        )

        # Load model safely
        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch_dtype,
                device_map="auto" if device.type == "cuda" else None,
                low_cpu_mem_usage=True,  # Helps reduce RAM footprint during init
                trust_remote_code=True
            )
            # Explicitly move to CPU if needed
            if device.type == "cpu":
                self.model = self.model.to(device)
            print("Model loaded successfully!")
        except RuntimeError as e:
            print(f"❌ Error loading model: {e}")
            print("Tip: Try switching to a smaller model or free up RAM.")
            raise  # Re-raise so callers never get a half-initialized handler
    def generate_response(self, prompt: str, context: Optional[str] = None,
                          tools_output: Optional[str] = None) -> str:
        """Generate a response with Phi-3; typically completes in a few seconds."""
        try:
            # Build a simple prompt
            full_prompt = self._build_simple_prompt(prompt, context, tools_output)

            # Tokenize and move tensors to the same device as the model
            inputs = self.tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=1024)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs["input_ids"],
                    max_new_tokens=200,  # Limit response length
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    attention_mask=inputs["attention_mask"]
                )

            # Decode only the newly generated tokens
            response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
            return response.strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error generating response: {str(e)}"
    def _build_simple_prompt(self, user_input: str, context: Optional[str] = None,
                             tools_output: Optional[str] = None) -> str:
        """Simple prompt builder"""
        prompt_parts = ["You are a helpful AI assistant."]
        if context and len(context) < 300:
            prompt_parts.append(f"Context: {context}")
        if tools_output and len(tools_output) < 200:
            prompt_parts.append(f"Additional info: {tools_output}")
        prompt_parts.append(f"User: {user_input}")
        prompt_parts.append("Assistant:")
        return "\n\n".join(prompt_parts)
    def add_to_history(self, user_input: str, assistant_response: str):
        """Add an exchange to the conversation history."""
        self.conversation_history.append({
            'user': user_input,
            'assistant': assistant_response
        })
        # Keep only recent history
        if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
            self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]

    def clear_history(self):
        """Clear the conversation history."""
        self.conversation_history = []
    def get_available_models(self) -> List[str]:
        """Get the list of models this handler can serve."""
        # Only the locally loaded HuggingFace model is available here.
        return [self.model_name]
    def switch_model(self, model_name: str) -> bool:
        """Switch to a different HuggingFace model by reloading it."""
        try:
            # Reload tokenizer and weights for the requested model
            self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                low_cpu_mem_usage=True,
                trust_remote_code=True
            )
            self.model_name = model_name
            logger.info(f"Switched to model: {model_name}")
            return True
        except Exception as e:
            logger.error(f"Error switching to model {model_name}: {e}")
            return False
    def generate_embedding(self, text: str) -> List[float]:
        """Generate a text embedding by mean-pooling the final hidden layer.

        A causal LM is not a dedicated embedding model, so this is only a
        lightweight approximation.
        """
        try:
            inputs = self.tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
            with torch.no_grad():
                outputs = self.model(**inputs, output_hidden_states=True)
            # Mean-pool token vectors from the last hidden layer
            embedding = outputs.hidden_states[-1].mean(dim=1).squeeze(0)
            return embedding.float().cpu().tolist()
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return []
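
if __name__ == "__main__":
    # Minimal smoke test (a sketch; assumes OPENROUTER_API_KEY is configured
    # and that enough RAM/VRAM is available for Phi-3 Mini).
    openrouter = OpenRouterLLMHandler()
    print(openrouter.generate_response("Say hello in one short sentence."))

    hf = HuggingFaceLLMHandler()
    print(hf.generate_response("Say hello in one short sentence."))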