import logging
import os
import sys
import time
from dataclasses import asdict

import torch  # Used for CUDA availability and GPU detection

from flask import Blueprint, jsonify, request
from flask_pydantic import validate

from lpm_kernel.api.common.responses import APIResponse
from lpm_kernel.api.domains.kernel2.dto.chat_dto import (
    ChatRequest,
)
from lpm_kernel.api.domains.kernel2.services.chat_service import chat_service
from lpm_kernel.api.domains.kernel2.services.prompt_builder import (
    BasePromptStrategy,
    RoleBasedStrategy,
    KnowledgeEnhancedStrategy,
)
from lpm_kernel.api.domains.loads.services import LoadService
from lpm_kernel.api.services.local_llm_service import local_llm_service

from ...common.script_executor import ScriptExecutor
from ....configs.config import Config

logger = logging.getLogger(__name__)

kernel2_bp = Blueprint("kernel2", __name__, url_prefix="/api/kernel2")

# Create script executor instance
script_executor = ScriptExecutor()


@kernel2_bp.route("/health", methods=["GET"])
def health_check():
    """Health check endpoint"""
    config = Config.from_env()
    app_name = config.app_name or "Service"  # Fall back to a default so app_name is never None

    status = local_llm_service.get_server_status()
    if status.is_running and status.process_info:
        return jsonify(
            APIResponse.success(
                data={
                    "status": "running",
                    "pid": status.process_info.pid,
                    "cpu_percent": status.process_info.cpu_percent,
                    "memory_percent": status.process_info.memory_percent,
                    "uptime": time.time() - status.process_info.create_time,
                }
            )
        )
    else:
        return jsonify(APIResponse.success(data={"status": "stopped"}))


@kernel2_bp.route("/username", methods=["GET"])
def username():
    return jsonify(APIResponse.success(data={"username": LoadService.get_current_upload_name()}))

# Report whether the service is running inside Docker (reads IN_DOCKER_ENV)
@kernel2_bp.route("/docker/env", methods=["GET"])
def docker_env():
    return jsonify(APIResponse.success(data={"in_docker_env": os.getenv("IN_DOCKER_ENV")}))

@kernel2_bp.route("/llama/start", methods=["POST"])
def start_llama_server():
    """Start llama-server service"""
    try:
        # Get request parameters
        data = request.get_json()
        if not data or "model_name" not in data:
            return jsonify(APIResponse.error(message="Missing required parameter: model_name", code=400))

        model_name = data["model_name"]
        # Get optional use_gpu parameter with default value of True
        use_gpu = data.get("use_gpu", True)
        base_dir = os.getcwd()
        model_dir = os.path.join(base_dir, "resources/model/output/gguf", model_name)
        gguf_path = os.path.join(model_dir, "model.gguf")

        server_path = os.path.join(os.getcwd(), "llama.cpp/build/bin")
        if os.path.exists(os.path.join(os.getcwd(), "llama.cpp/build/bin/Release")):
            server_path = os.path.join(os.getcwd(), "llama.cpp/build/bin/Release")
            
        # Determine the executable name based on platform (.exe for Windows)
        if sys.platform.startswith("win"):
            server_executable = "llama-server.exe"
        else:
            server_executable = "llama-server"
        server_path = os.path.join(server_path, server_executable)

        # Check if model file exists
        if not os.path.exists(gguf_path):
            return jsonify(APIResponse.error(
                message=f"Model '{model_name}' GGUF file does not exist, please convert model first",
                code=400
            ))

        # Start the server using the LocalLLMService with GPU acceleration if requested
        success = local_llm_service.start_server(gguf_path, use_gpu=use_gpu)
        
        if not success:
            return jsonify(APIResponse.error(message="Failed to start llama-server", code=500))
            
        # Get updated service status
        status = local_llm_service.get_server_status()
        
        # Return success response with GPU info
        gpu_info = "with GPU acceleration" if use_gpu and torch.cuda.is_available() else "with CPU only"
        return jsonify(
            APIResponse.success(
                data={
                    "model_name": model_name,
                    "gguf_path": gguf_path,
                    "status": "running" if status.is_running else "starting",
                    "use_gpu": use_gpu and torch.cuda.is_available(),
                    "gpu_info": gpu_info
                },
                message=f"llama-server service started {gpu_info}"
            )
        )

    except Exception as e:
        error_msg = f"Failed to start llama-server: {str(e)}"
        logger.error(error_msg)
        return jsonify(APIResponse.error(message=error_msg, code=500))


# Flag to track if service is stopping
_stopping_server = False

@kernel2_bp.route("/llama/stop", methods=["POST"])
def stop_llama_server():
    """Stop llama-server service - Force immediate termination of the process"""
    global _stopping_server

    try:
        # If service is already stopping, return notification
        if _stopping_server:
            return jsonify(APIResponse.success(message="llama-server service is stopping"))

        _stopping_server = True  # Set stopping flag

        try:
            # Stop all llama-server processes via local_llm_service.stop_server()
            status = local_llm_service.stop_server()

            # Check whether any process is still running
            if status.is_running and status.process_info:
                pid = status.process_info.pid
                logger.warning(f"llama-server process still running: {pid}")
                return jsonify(APIResponse.success(
                    message="llama-server service could not be fully stopped. Please try again.",
                    data={"running_pid": pid}
                ))
            else:
                return jsonify(APIResponse.success(message="llama-server service has been stopped successfully"))

        except Exception as e:
            logger.error(f"Error while stopping llama-server: {str(e)}")
            return jsonify(APIResponse.error(message=f"Error stopping llama-server: {str(e)}", code=500))
        finally:
            _stopping_server = False

    except Exception as e:
        _stopping_server = False
        logger.error(f"Failed to stop llama-server: {str(e)}")
        return jsonify(APIResponse.error(message=f"Failed to stop llama-server: {str(e)}", code=500))


@kernel2_bp.route("/llama/status", methods=["GET"])
@validate()
def get_llama_server_status():
    """Get llama-server service status"""
    try:
        status = local_llm_service.get_server_status()
        return jsonify(APIResponse.success(data=asdict(status)))

    except Exception as e:
        logger.error(f"Error getting llama-server status: {str(e)}", exc_info=True)
        return jsonify(APIResponse.error(message=f"Error getting llama-server status: {str(e)}", code=500))

@kernel2_bp.route("/chat", methods=["POST"])
@validate()
def chat(body: ChatRequest):
    """
    Chat interface - Stream response (OpenAI API compatible)

    Request parameters: Compatible with OpenAI Chat Completions API format
    - messages: List[Dict[str, str]], standard OpenAI message list with format:
        [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Hello, who are you?"},
            {"role": "assistant", "content": "I am a helpful assistant."},
            {"role": "user", "content": "What can you do for me?"}  
        ]
    - metadata: Dict[str, Any], additional parameters for request processing (optional):
        {
            "enable_l0_retrieval": true,  // whether to enable knowledge retrieval
            "enable_l1_retrieval": false, // whether to enable advanced knowledge retrieval
            "role_id": "uuid-string"      // optional role UUID for system customization
        }
    - stream: bool, whether to stream the response (default: True)
    - model: str, model identifier (optional, default uses configured model)
    - temperature: float, controls randomness (default: 0.1)
    - max_tokens: int, maximum tokens to generate (default: 2000)

    Response: Standard OpenAI Chat Completions API format
    For stream=true (Server-Sent Events):
    - id: str, response unique identifier
    - object: "chat.completion.chunk"
    - created: int, timestamp
    - model: str, model identifier
    - system_fingerprint: str, system fingerprint
    - choices: [
        {
          "index": 0,
          "delta": {"content": str},
          "finish_reason": null or "stop"
        }
      ]
    
    The last event will be: data: [DONE]
    
    For stream=false:
    - Complete response object with full message content
    """
    try:
        logger.info(f"Starting chat request: {body}")
        # 1. Check service status
        status = local_llm_service.get_server_status()
        if not status.is_running:
            # Format error response in OpenAI-compatible format
            error_msg = "LLama server is not running"
            logger.error(error_msg)
            error_response = {
                "error": {
                    "message": error_msg,
                    "type": "server_error",
                    "code": "service_unavailable"
                }
            }
            # Return as regular JSON response for non-stream or stream-compatible error
            if not body.stream:
                return APIResponse.error(message="Service temporarily unavailable", code=503), 503
            return local_llm_service.handle_stream_response(iter([error_response]))

        try:
            # Use chat_service to process request with OpenAI-compatible format
            response = chat_service.chat(
                request=body,
                stream=body.stream,  # Respect the stream parameter from request
                json_response=False,
                strategy_chain=[BasePromptStrategy, RoleBasedStrategy, KnowledgeEnhancedStrategy]
            )
            
            # Handle streaming or non-streaming response appropriately
            if body.stream:
                return local_llm_service.handle_stream_response(response)
            else:
                # For non-streaming, return the complete response as JSON
                return jsonify(response)

        except ValueError as e:
            error_msg = str(e)
            logger.error(f"Value error: {error_msg}")
            error_response = {
                "error": {
                    "message": error_msg,
                    "type": "invalid_request_error",
                    "code": "bad_request"
                }
            }
            if not body.stream:
                return jsonify(error_response), 400
            return local_llm_service.handle_stream_response(iter([error_response]))

    except Exception as e:
        error_msg = f"Request processing failed: {str(e)}"
        logger.error(error_msg, exc_info=True)
        error_response = {
            "error": {
                "message": error_msg,
                "type": "server_error",
                "code": "internal_server_error"
            }
        }
        if not getattr(body, 'stream', True):  # Default to stream if attribute missing
            return jsonify(error_response), 500
        return local_llm_service.handle_stream_response(iter([error_response]))


@kernel2_bp.route("/cuda/available", methods=["GET"])
def check_cuda_available():
    """Check if CUDA is available for model training/inference"""
    try:
        cuda_available = torch.cuda.is_available()
        cuda_info = {}
        
        if cuda_available:
            cuda_info = {
                "device_count": torch.cuda.device_count(),
                "current_device": torch.cuda.current_device(),
                "device_name": torch.cuda.get_device_name(0)
            }
        
        return jsonify(APIResponse.success(
            data={
                "cuda_available": cuda_available,
                "cuda_info": cuda_info
            },
            message="CUDA availability check completed"
        ))
    except Exception as e:
        error_msg = f"Error checking CUDA availability: {str(e)}"
        logger.error(error_msg)
        return jsonify(APIResponse.error(message=error_msg, code=500))