import os
import sys
import subprocess
from typing import List, Dict, Optional

# --- Configuration ---
MODEL_NAME = "Qwen3-0.6B-IQ4_XS.gguf"
MODEL_REPO = "unsloth/Qwen3-0.6B-GGUF"
AMD_SPACE_ID = "amd/gpt-oss-120b-chatbot" # Gradio Space ID for remote inference

# --- 0. Dynamic Module Installation ---
# WARNING: This may fail in many hosted environments due to permission issues.
# A `requirements.txt` is generally recommended for production.
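#
# For reference, a requirements.txt covering the same packages might look like
# this (a sketch; version pins and the FORCE_CMAKE/CMAKE_ARGS build environment
# below would still need to be supplied separately):
#
#   fastapi
#   uvicorn
#   pydantic
#   huggingface-hub
#   gradio_client
#   git+https://github.com/abetlen/llama-cpp-python.git@main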

def install_required_modules():
    """
    Installs necessary Python modules at runtime using pip,
    forcing compilation with AVX-512 flags for llama-cpp-python.
    """
    required_packages = [
        "fastapi", "uvicorn", "pydantic", "huggingface-hub",
        "git+https://github.com/abetlen/llama-cpp-python.git@main", "gradio_client"
    ]
    
    # ----------------------------------------------------
    # **Core Modification: Llama.cpp Compile Options**
    # ----------------------------------------------------
    compile_env = os.environ.copy()
    compile_env["FORCE_CMAKE"] = "1"
    # Note: if the host CPU does not support AVX-512, binaries built with these flags will crash at runtime ("Illegal instruction").
    compile_env["CMAKE_ARGS"] = "-DGGML_AVX512=ON -DGGML_AVX512_VNNI=ON -DGGML_AVX512_VBMI=ON -DLLAMA_CURL=OFF"
    # ----------------------------------------------------

    print("--- Attempting Dynamic Installation/Upgrade (AVX-512 Compilation) ---")
    
    try:
        subprocess.check_call(
            [
                sys.executable, "-m", "pip", "install", 
                *required_packages, 
                "--upgrade", "--no-cache-dir", "--force-reinstall" # Ensure recompile
            ], 
            env=compile_env
        )
        print("All modules successfully installed/updated. llama-cpp-python compiled with AVX-512.")
    except subprocess.CalledProcessError as e:
        print(f"**FATAL ERROR**: Module installation failed. Error: {e}")
        print("Check if your CPU supports AVX-512 or try removing the CMAKE_ARGS environment variable.")
        sys.exit(1)
    except Exception as e:
        print(f"**FATAL ERROR**: An unknown error occurred. Error: {e}")
        sys.exit(1)
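
# Optional pre-flight check (illustrative sketch, not wired into the install
# flow above; assumes Linux, where CPU flags are exposed via /proc/cpuinfo):
def cpu_supports_avx512() -> bool:
    """Best-effort check for the avx512f CPU flag on Linux."""
    try:
        with open("/proc/cpuinfo") as f:
            return "avx512f" in f.read()
    except OSError:
        return False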

install_required_modules()


# --- 1. Module Imports (Must be after installation) ---
try:
    import uvicorn
    from pydantic import BaseModel, Field
    from fastapi import FastAPI, HTTPException
    from fastapi.responses import JSONResponse, HTMLResponse
    from fastapi.middleware.cors import CORSMiddleware
    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama, llama_print_system_info
    from gradio_client import Client
except ImportError as e:
    print(f"**FATAL ERROR**: Failed to import modules. Error: {e}")
    sys.exit(1)

# --- 2. Global State ---
LLAMA_INSTANCE: Optional[Llama] = None

def initialize_llm():
    """Downloads the model and initializes the global Llama instance."""
    global LLAMA_INSTANCE
    
    if LLAMA_INSTANCE is not None:
        return

    # Check AVX-512 status
    print("--- Llama.cpp System Info ---")
    # llama_print_system_info() returns raw bytes from the C API; decode for readable output.
    print(llama_print_system_info().decode("utf-8", errors="replace"))
    print("-----------------------------")

    print(f"--- 1. Starting model download: {MODEL_NAME} ---")
    try:
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
    except Exception as e:
        raise RuntimeError(f"Failed to download model: {e}")

    print("--- 2. Initializing Llama.cpp instance ---")
    try:
        # Use half of the logical CPU count for threads (os.cpu_count() may return None), minimum 1
        n_threads = max(1, (os.cpu_count() or 2) // 2)
        LLAMA_INSTANCE = Llama(
            model_path=model_path,
            n_ctx=4096,
            n_batch=512,
            n_threads=n_threads,
            n_gpu_layers=0,
            verbose=False
        )
        print("Llama.cpp model successfully loaded.")
    except Exception as e:
        raise RuntimeError(f"Llama instance initialization failed: {e}")


# --- 3. FastAPI Setup and Middleware ---
app = FastAPI(
    title="LLM Inference API (Llama.cpp)",
    description="API service for direct inference using Llama.cpp."
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
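
# Note: per the CORS spec, browsers reject credentialed requests when the
# allowed origin is the wildcard "*"; restrict allow_origins if cookies or
# auth headers must cross origins.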

# --- 4. Pydantic Request Model ---
class InferenceRequestMinimal(BaseModel):
    """Data structure for a minimal inference request, accepting only a question."""
    question: str = Field(..., description="The user's input question or prompt.")
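
# Example request body accepted by the endpoints below (illustrative):
#   {"question": "What is AVX-512?"}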


# --- 5. Core Inference Function (Non-Streaming) ---
def get_inference_response(
    messages: List[Dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    """Calls the Llama.cpp instance and returns a single text response."""
    if LLAMA_INSTANCE is None:
        raise HTTPException(status_code=503, detail="LLM Service not initialized.")
    
    # Prepend the system message to the conversation history
    full_messages = [{"role": "system", "content": system_message}]
    full_messages.extend(messages)
    
    try:
        response = LLAMA_INSTANCE.create_chat_completion(
            messages=full_messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        
        # Safely extract the content
        content = response.get('choices', [{}])[0].get('message', {}).get('content')
        
        if content:
            return content
        
        return "⚠️ LLM service returned empty content."

    except Exception as e:
        print(f"[Error] LLM Inference failed: {e}")
        raise HTTPException(
            status_code=503,
            detail=f"LLM Server Response Error: {e}"
        )


# --- 6. FastAPI Routes ---
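
# Note: @app.on_event("startup") is deprecated in recent FastAPI releases in
# favor of lifespan handlers; it is kept here for brevity.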

@app.on_event("startup")
async def startup_event():
    """Execute model initialization when FastAPI starts up."""
    try:
        initialize_llm()
    except Exception as e:
        print(f"Application startup failed: {e}")
        # If initialization fails, LLAMA_INSTANCE stays None and inference endpoints will return 503.

@app.get("/", summary="Home/Health Check")
async def root():
    status = "running" if LLAMA_INSTANCE else "starting/failed (LLM unavailable)"
    return HTMLResponse(content=f"<html><body><h1>LLM API Status: {status}</h1></body></html>", status_code=200)


@app.post("/local/qwen-0-6b", summary="Execute Local LLM Inference (Minimal Input)")
async def infer_local_endpoint(request: InferenceRequestMinimal):
    """
    Executes inference using the local Llama.cpp instance.
    Returns a JSON with the 'response' field.
    """
    FIXED_SYSTEM_MESSAGE = "You are a friendly and concise assistant."
    FIXED_MAX_TOKENS = 4096

    try:
        messages = [{"role": "user", "content": request.question}]

        content = get_inference_response(
            messages=messages,
            system_message=FIXED_SYSTEM_MESSAGE,
            max_tokens=FIXED_MAX_TOKENS,
        )
        
        return JSONResponse(content={"response": content})

    except HTTPException:
        raise
    except Exception as e:
        print(f"[Fatal Error] During local API call: {e}")
        raise HTTPException(status_code=500, detail="Internal Server Error.")
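
# Example call (sketch; assumes the service is running on the default port set below):
#   curl -X POST http://localhost:7860/local/qwen-0-6b \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Hello"}'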
        

@app.post("/remote/amd", summary="Call External AMD LLM Space via Gradio Client")
async def infer_amd_endpoint(request: InferenceRequestMinimal):
    """
    Uses gradio_client to call the /chat API of the AMD_SPACE_ID.
    Input/output format is consistent with the local endpoint.
    """
    try:
        # Initialize Gradio Client using the global AMD_SPACE_ID
        client = Client(AMD_SPACE_ID)

        # Call the Space API
        result = client.predict(
            message=request.question,
            system_prompt="You are a helpful assistant.",
            temperature=0.7,
            api_name="/chat"
        )
        
        # Process and return result in the required format
        if isinstance(result, str):
            return JSONResponse(content={"response": result})
        else:
            raise ValueError("External API returned unexpected non-string format.")

    except Exception as e:
        print(f"[Fatal Error] Gradio Client API call failed: {e}")
        # Return 503 Service Unavailable for external API errors
        raise HTTPException(
            status_code=503,
            detail=f"External AMD LLM Service Error: {e}"
        )
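
# Example call (sketch; same request/response shape as the local endpoint):
#   curl -X POST http://localhost:7860/remote/amd \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Hello"}'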

        
# --- 7. Application Startup ---
if __name__ == "__main__":
    print("FastAPI service is starting...")
    # "app:app" tells uvicorn to import the module 'app' (this file, which must be
    # saved as app.py) and serve the FastAPI object named 'app' defined in it.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)