"""
LLM Service - Chat completions via HuggingFace.
"""

import logging
from typing import Dict, List, Optional
from dataclasses import dataclass

from huggingface_hub import InferenceClient

from config.settings import Settings

logger = logging.getLogger(__name__)


@dataclass
class LLMConfig:
    """LLM configuration."""
    api_key: str
    model_name: str
    temperature: float = 0.01
    max_tokens: int = 512


class LLMService:
    """
    LLM service using HuggingFace InferenceClient.
    """
    
    def __init__(
        self,
        api_key: Optional[str] = None,
        model_name: Optional[str] = None,
    ):
        """
        Initialize LLM service.
        
        Args:
            api_key: HuggingFace API key
            model_name: Model name/ID
        """
        settings = Settings()
        
        key = api_key or settings.hf_token
        name = model_name or settings.effective_model_name
        
        self.config = LLMConfig(
            api_key=key,
            model_name=name,
            temperature=settings.hf_temperature,
            max_tokens=settings.hf_max_new_tokens,
        )
        
        self.client = InferenceClient(token=self.config.api_key)
        
        logger.info(f"LLMService initialized with model: {self.config.model_name}")

    async def get_chat_completion(
        self,
        messages: List[Dict[str, str]],
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
    ) -> str:
        """
        Get chat completion from the model.
        
        Args:
            messages: List of message dicts with 'role' and 'content'
            temperature: Override temperature
            max_tokens: Override max tokens
            
        Returns:
            Assistant response text
        """
        logger.debug(f"Chat completion request with model: {self.config.model_name}")
        
        try:
            response = self.client.chat_completion(
                messages=messages,
                model=self.config.model_name,
                max_tokens=max_tokens if max_tokens is not None else self.config.max_tokens,
                temperature=temperature if temperature is not None else self.config.temperature,
            )
            
            content = response.choices[0].message.content or ""
            logger.debug(f"Chat completion response: {content[:200]}...")
            
            return content
            
        except Exception as e:
            logger.error(f"Chat completion error: {e}")
            raise RuntimeError(f"LLM completion error: {e}") from e

    async def get_streaming_completion(
        self,
        messages: List[Dict[str, str]],
        temperature: Optional[float] = None,
        max_tokens: Optional[int] = None,
    ):
        """
        Get streaming chat completion.
        
        Yields:
            Text chunks as they're generated
        """
        logger.debug(f"Streaming completion request with model: {self.config.model_name}")
        
        try:
            stream = self.client.chat_completion(
                messages=messages,
                model=self.config.model_name,
                max_tokens=max_tokens if max_tokens is not None else self.config.max_tokens,
                temperature=temperature if temperature is not None else self.config.temperature,
                stream=True
            )
            
            for chunk in stream:
                if chunk.choices and chunk.choices[0].delta.content:
                    yield chunk.choices[0].delta.content
                    
        except Exception as e:
            logger.error(f"Streaming completion error: {e}")
            raise RuntimeError(f"LLM streaming error: {e}") from e

    def build_messages_with_tools(
        self,
        system_prompt: str,
        user_input: str,
        tools_description: str = "",
        conversation_history: Optional[List[Dict[str, str]]] = None,
        tool_results: Optional[str] = None,
    ) -> List[Dict[str, str]]:
        """
        Build messages array with tools and context.
        
        Args:
            system_prompt: System instruction
            user_input: User's message
            tools_description: Available tools description
            conversation_history: Previous messages
            tool_results: Results from tool execution
            
        Returns:
            Messages array for chat completion
        """
        messages = [{"role": "system", "content": system_prompt}]
        
        if tools_description:
            messages.append({
                "role": "system",
                "content": f"Available tools:\n{tools_description}"
            })
        
        # Add conversation history
        if conversation_history:
            for msg in conversation_history[-10:]:  # Last 10 messages
                if msg.get("role") in ["user", "assistant"]:
                    messages.append(msg)
        
        # Add current user input
        messages.append({"role": "user", "content": user_input})
        
        # Add tool results if present
        if tool_results:
            messages.append({"role": "assistant", "content": tool_results})
        
        return messages
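

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the service). It assumes a
# valid HuggingFace token is available; the HF_TOKEN environment variable and
# the model name "meta-llama/Llama-3.1-8B-Instruct" below are placeholders
# chosen for the example, not values taken from config.settings.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import asyncio
    import os

    async def _demo() -> None:
        # Falls back to Settings() defaults when api_key/model_name are None.
        service = LLMService(
            api_key=os.environ.get("HF_TOKEN"),
            model_name="meta-llama/Llama-3.1-8B-Instruct",  # placeholder model
        )

        messages = service.build_messages_with_tools(
            system_prompt="You are a concise assistant.",
            user_input="What is the capital of France?",
        )

        # Non-streaming completion: returns the full assistant reply.
        reply = await service.get_chat_completion(messages)
        print(reply)

        # Streaming completion: text chunks are yielded as they arrive.
        async for chunk in service.get_streaming_completion(messages):
            print(chunk, end="", flush=True)
        print()

    asyncio.run(_demo())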