""" LLM Service - Chat completions via HuggingFace. """ import logging from typing import Dict, List, Optional, Any from dataclasses import dataclass from huggingface_hub import InferenceClient from config.settings import Settings logger = logging.getLogger(__name__) @dataclass class LLMConfig: """LLM configuration.""" api_key: str model_name: str temperature: float = 0.01 max_tokens: int = 512 class LLMService: """ LLM service using HuggingFace InferenceClient. """ def __init__( self, api_key: Optional[str] = None, model_name: Optional[str] = None, ): """ Initialize LLM service. Args: api_key: HuggingFace API key model_name: Model name/ID """ settings = Settings() key = api_key or settings.hf_token name = model_name or settings.effective_model_name self.config = LLMConfig( api_key=key, model_name=name, temperature=settings.hf_temperature, max_tokens=settings.hf_max_new_tokens, ) self.client = InferenceClient(token=self.config.api_key) logger.info(f"LLMService initialized with model: {self.config.model_name}") async def get_chat_completion( self, messages: List[Dict[str, str]], temperature: Optional[float] = None, max_tokens: Optional[int] = None, ) -> str: """ Get chat completion from the model. Args: messages: List of message dicts with 'role' and 'content' temperature: Override temperature max_tokens: Override max tokens Returns: Assistant response text """ logger.debug(f"Chat completion request with model: {self.config.model_name}") try: response = self.client.chat_completion( messages=messages, model=self.config.model_name, max_tokens=max_tokens or self.config.max_tokens, temperature=temperature or self.config.temperature ) content = response.choices[0].message.content logger.debug(f"Chat completion response: {content[:200]}...") return content except Exception as e: logger.error(f"Chat completion error: {str(e)}") raise Exception(f"LLM completion error: {str(e)}") async def get_streaming_completion( self, messages: List[Dict[str, str]], temperature: Optional[float] = None, max_tokens: Optional[int] = None, ): """ Get streaming chat completion. Yields: Text chunks as they're generated """ logger.debug(f"Streaming completion request with model: {self.config.model_name}") try: stream = self.client.chat_completion( messages=messages, model=self.config.model_name, max_tokens=max_tokens or self.config.max_tokens, temperature=temperature or self.config.temperature, stream=True ) for chunk in stream: if chunk.choices and chunk.choices[0].delta.content: yield chunk.choices[0].delta.content except Exception as e: logger.error(f"Streaming completion error: {str(e)}") raise Exception(f"LLM streaming error: {str(e)}") def build_messages_with_tools( self, system_prompt: str, user_input: str, tools_description: str = "", conversation_history: Optional[List[Dict[str, str]]] = None, tool_results: Optional[str] = None, ) -> List[Dict[str, str]]: """ Build messages array with tools and context. 
Args: system_prompt: System instruction user_input: User's message tools_description: Available tools description conversation_history: Previous messages tool_results: Results from tool execution Returns: Messages array for chat completion """ messages = [{"role": "system", "content": system_prompt}] if tools_description: messages.append({ "role": "system", "content": f"Available tools:\n{tools_description}" }) # Add conversation history if conversation_history: for msg in conversation_history[-10:]: # Last 10 messages if msg.get("role") in ["user", "assistant"]: messages.append(msg) # Add current user input messages.append({"role": "user", "content": user_input}) # Add tool results if present if tool_results: messages.append({"role": "assistant", "content": tool_results}) return messages
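

if __name__ == "__main__":
    # Illustrative usage sketch, not part of the service API above: it assumes
    # Settings resolves a valid HuggingFace token and model name, and the
    # prompt text below is hypothetical.
    import asyncio

    async def _demo() -> None:
        service = LLMService()
        messages = service.build_messages_with_tools(
            system_prompt="You are a helpful assistant.",
            user_input="Give a one-sentence summary of what you can do.",
        )
        reply = await service.get_chat_completion(messages)
        print(reply)

    asyncio.run(_demo())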