# import ollama
import json
import logging
import os
from typing import List, Dict, Any, Optional

import requests
import torch
from dotenv import load_dotenv
from transformers import AutoTokenizer, AutoModelForCausalLM

from config.settings import Settings

logger = logging.getLogger(__name__)

load_dotenv()
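
# OpenRouterLLMHandler sends chat requests to OpenRouter's OpenAI-compatible
# /chat/completions endpoint. It expects OPENROUTER_API_KEY in the environment
# (loaded via dotenv above) or an explicit api_key argument.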
class OpenRouterLLMHandler:
    def __init__(self, api_key: str = "", model: str = "mistralai/mistral-7b-instruct"):
        # Prefer the key from the environment; fall back to the explicit argument.
        env_key = os.getenv("OPENROUTER_API_KEY")
        self.api_key = env_key if env_key else api_key
        self.model = model
        self.base_url = "https://openrouter.ai/api/v1/chat/completions"
        # Initialized here so add_to_history() below has somewhere to append.
        self.conversation_history: List[Dict[str, str]] = []
        print(f"🔌 Initialized OpenRouter handler with model: {model}")

    def generate_response(self, prompt: str, context: Optional[str] = None,
                          tools_output: Optional[str] = None) -> str:
        """Generate a response via the OpenRouter chat completions API."""
        try:
            full_prompt = self._build_simple_prompt(prompt, context, tools_output)
            headers = {
                "Authorization": f"Bearer {self.api_key}",
                "Content-Type": "application/json"
            }
            payload = {
                "model": self.model,
                "messages": [
                    {"role": "system", "content": "You are a helpful AI assistant."},
                    {"role": "user", "content": full_prompt}
                ],
                "temperature": 0.7,
                "max_tokens": 200
            }
            response = requests.post(self.base_url, headers=headers, json=payload, timeout=60)
            response.raise_for_status()
            result = response.json()
            return result["choices"][0]["message"]["content"].strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error generating response: {str(e)}"

    def _build_simple_prompt(self, user_input: str, context: Optional[str] = None,
                             tools_output: Optional[str] = None) -> str:
        """Build a compact prompt, skipping context blocks that are too long."""
        prompt_parts = []
        if context and len(context) < 300:
            prompt_parts.append(f"Context: {context}")
        if tools_output and len(tools_output) < 200:
            prompt_parts.append(f"Additional info: {tools_output}")
        prompt_parts.append(f"User query: {user_input}")
        return "\n\n".join(prompt_parts)

    def add_to_history(self, user_input: str, assistant_response: str):
        """
        Add exchange to conversation history
        """
        self.conversation_history.append({
            'user': user_input,
            'assistant': assistant_response
        })
        # Keep only recent history
        if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
            self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]

    def clear_history(self):
        """
        Clear conversation history
        """
        self.conversation_history = []

    def get_available_models(self) -> List[str]:
        """
        Get list of available Ollama models
        """
        # Note: carried over from an Ollama handler; self.client is never set on
        # this class, so the call fails and the default model is returned.
        try:
            models = self.client.list()
            return [model['name'] for model in models['models']]
        except Exception as e:
            logger.error(f"Error getting models: {e}")
            return [Settings.DEFAULT_MODEL]

    def switch_model(self, model_name: str) -> bool:
        """
        Switch to a different model
        """
        try:
            self.model = model_name
            logger.info(f"Switched to model: {model_name}")
            return True
        except Exception as e:
            logger.error(f"Error switching to model {model_name}: {e}")
            return False

    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate embeddings for text using Ollama
        """
        # Note: also depends on an Ollama client; returns [] when none is attached.
        try:
            response = self.client.embeddings(
                model=Settings.EMBEDDING_MODEL,
                prompt=text
            )
            return response['embedding']
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return []
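

# HuggingFaceLLMHandler runs microsoft/Phi-3-mini-4k-instruct locally via
# transformers, picking GPU float16 when CUDA is available and CPU float32
# otherwise.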
class HuggingFaceLLMHandler:
    def __init__(self):
        import psutil  # Local import: only needed for the RAM report below

        self.model_name = "microsoft/Phi-3-mini-4k-instruct"
        # Initialized here so add_to_history() below has somewhere to append.
        self.conversation_history: List[Dict[str, str]] = []
        print("Loading model... this may take a moment on first run")

        # Choose device and dtype intelligently
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        torch_dtype = torch.float16 if device.type == "cuda" else torch.float32
        print(f"Using device: {device}, dtype: {torch_dtype}")
        print(f"Available RAM: {psutil.virtual_memory().available / 1e6:.2f} MB")

        # Load tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(
            self.model_name,
            trust_remote_code=True
        )

        # Load model safely
        try:
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                torch_dtype=torch_dtype,
                device_map="auto" if device.type == "cuda" else None,
                low_cpu_mem_usage=True,  # Helps reduce RAM footprint during init
                trust_remote_code=True
            )
            # Explicitly move to CPU if needed
            if device.type == "cpu":
                self.model = self.model.to(device)
            print("Model loaded successfully!")
        except RuntimeError as e:
            print(f"❌ Error loading model: {e}")
            print("Tip: Try switching to a smaller model or free up RAM.")

    def generate_response(self, prompt: str, context: Optional[str] = None,
                          tools_output: Optional[str] = None) -> str:
        """
        Generate response using Phi-3 - should be under 10 seconds
        """
        try:
            # Build simple prompt
            full_prompt = self._build_simple_prompt(prompt, context, tools_output)

            # Tokenize and move to same device as model
            inputs = self.tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=1024)
            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

            # Generate
            with torch.no_grad():
                outputs = self.model.generate(
                    inputs["input_ids"],
                    max_new_tokens=200,  # Limit response length
                    temperature=0.7,
                    do_sample=True,
                    pad_token_id=self.tokenizer.eos_token_id,
                    attention_mask=inputs["attention_mask"]
                )

            # Decode only the newly generated tokens, skipping the prompt
            response = self.tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
            return response.strip()
        except Exception as e:
            logger.error(f"Error generating response: {e}")
            return f"Error generating response: {str(e)}"

    def _build_simple_prompt(self, user_input: str, context: Optional[str] = None,
                             tools_output: Optional[str] = None) -> str:
        """Simple prompt builder"""
        prompt_parts = ["You are a helpful AI assistant."]
        if context and len(context) < 300:
            prompt_parts.append(f"Context: {context}")
        if tools_output and len(tools_output) < 200:
            prompt_parts.append(f"Additional info: {tools_output}")
        prompt_parts.append(f"User: {user_input}")
        prompt_parts.append("Assistant:")
        return "\n\n".join(prompt_parts)
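
    # Note: this plain "User:/Assistant:" format works, but Phi-3 ships a chat
    # template; self.tokenizer.apply_chat_template(messages, tokenize=False)
    # would be the more idiomatic way to format the prompt.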

    def add_to_history(self, user_input: str, assistant_response: str):
        """
        Add exchange to conversation history
        """
        self.conversation_history.append({
            'user': user_input,
            'assistant': assistant_response
        })
        # Keep only recent history
        if len(self.conversation_history) > Settings.MAX_CONVERSATION_HISTORY:
            self.conversation_history = self.conversation_history[-Settings.MAX_CONVERSATION_HISTORY:]

    def clear_history(self):
        """
        Clear conversation history
        """
        self.conversation_history = []

    def get_available_models(self) -> List[str]:
        """
        Get list of available Ollama models
        """
        # Note: carried over from an Ollama handler; self.client is never set on
        # this class, so the call fails and the default model is returned.
        try:
            models = self.client.list()
            return [model['name'] for model in models['models']]
        except Exception as e:
            logger.error(f"Error getting models: {e}")
            return [Settings.DEFAULT_MODEL]

    def switch_model(self, model_name: str) -> bool:
        """
        Switch to a different model
        """
        try:
            # Switching only records the name; actually loading the new weights
            # would require constructing a new handler.
            self.model_name = model_name
            logger.info(f"Switched to model: {model_name}")
            return True
        except Exception as e:
            logger.error(f"Error switching to model {model_name}: {e}")
            return False

    def generate_embedding(self, text: str) -> List[float]:
        """
        Generate embeddings for text using Ollama
        """
        # Note: also depends on an Ollama client; returns [] when none is attached.
        try:
            response = self.client.embeddings(
                model=Settings.EMBEDDING_MODEL,
                prompt=text
            )
            return response['embedding']
        except Exception as e:
            logger.error(f"Error generating embedding: {e}")
            return []
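

# Minimal usage sketch (assumes OPENROUTER_API_KEY is set in the environment or
# in a .env file; the prompt below is only an example):
if __name__ == "__main__":
    handler = OpenRouterLLMHandler()
    question = "Summarize what this module does in one sentence."
    reply = handler.generate_response(question)
    print(reply)
    handler.add_to_history(question, reply)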