import os
import sys
import subprocess
from typing import List, Dict, Optional

# --- Configuration ---
MODEL_NAME = "Qwen3-0.6B-IQ4_XS.gguf"
MODEL_REPO = "unsloth/Qwen3-0.6B-GGUF"
AMD_SPACE_ID = "amd/gpt-oss-120b-chatbot" # Gradio Space ID for remote inference

# --- 0. Dynamic Module Installation ---
# WARNING: This may fail in many hosted environments due to permission issues.
# A `requirements.txt` is generally recommended for production.
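#
# For reference, a requirements.txt covering the same packages might look like
# this (a sketch; version pins and the FORCE_CMAKE/CMAKE_ARGS build environment
# below would still need to be supplied separately):
#
#   fastapi
#   uvicorn
#   pydantic
#   huggingface-hub
#   gradio_client
#   git+https://github.com/abetlen/llama-cpp-python.git@main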

def install_required_modules():
    """
    Installs necessary Python modules at runtime using pip,
    forcing compilation with AVX-512 flags for llama-cpp-python.
    """
    required_packages = [
        "fastapi", "uvicorn", "pydantic", "huggingface-hub",
        "git+https://github.com/abetlen/llama-cpp-python.git@main", "gradio_client"
    ]
    
    # ----------------------------------------------------
    # **Core Modification: Llama.cpp Compile Options**
    # ----------------------------------------------------
    compile_env = os.environ.copy()
    compile_env["FORCE_CMAKE"] = "1"
    # Note: if the host CPU does not support AVX-512, binaries built with these flags will crash at runtime ("Illegal instruction").
    compile_env["CMAKE_ARGS"] = "-DGGML_AVX512=ON -DGGML_AVX512_VNNI=ON -DGGML_AVX512_VBMI=ON -DLLAMA_CURL=OFF"
    # ----------------------------------------------------

    print("--- Attempting Dynamic Installation/Upgrade (AVX-512 Compilation) ---")
    
    try:
        subprocess.check_call(
            [
                sys.executable, "-m", "pip", "install", 
                *required_packages, 
                "--upgrade", "--no-cache-dir", "--force-reinstall" # Ensure recompile
            ], 
            env=compile_env
        )
        print("All modules successfully installed/updated. llama-cpp-python compiled with AVX-512.")
    except subprocess.CalledProcessError as e:
        print(f"**FATAL ERROR**: Module installation failed. Error: {e}")
        print("Check if your CPU supports AVX-512 or try removing the CMAKE_ARGS environment variable.")
        sys.exit(1)
    except Exception as e:
        print(f"**FATAL ERROR**: An unknown error occurred. Error: {e}")
        sys.exit(1)
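
# Optional pre-flight check (illustrative sketch, not wired into the install
# flow above; assumes Linux, where CPU flags are exposed via /proc/cpuinfo):
def cpu_supports_avx512() -> bool:
    """Best-effort check for the avx512f CPU flag on Linux."""
    try:
        with open("/proc/cpuinfo") as f:
            return "avx512f" in f.read()
    except OSError:
        return False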

install_required_modules()


# --- 1. Module Imports (Must be after installation) ---
try:
    import uvicorn
    from pydantic import BaseModel, Field
    from fastapi import FastAPI, HTTPException
    from fastapi.responses import JSONResponse, HTMLResponse
    from fastapi.middleware.cors import CORSMiddleware
    from huggingface_hub import hf_hub_download
    from llama_cpp import Llama, llama_print_system_info
    from gradio_client import Client
except ImportError as e:
    print(f"**FATAL ERROR**: Failed to import modules. Error: {e}")
    sys.exit(1)

# --- 2. Global State ---
LLAMA_INSTANCE: Optional[Llama] = None

def initialize_llm():
    """Downloads the model and initializes the global Llama instance."""
    global LLAMA_INSTANCE
    
    if LLAMA_INSTANCE is not None:
        return

    # Check AVX-512 status
    print("--- Llama.cpp System Info ---")
    # llama_print_system_info() returns raw bytes from the C API; decode for readable output.
    print(llama_print_system_info().decode("utf-8", errors="replace"))
    print("-----------------------------")

    print(f"--- 1. Starting model download: {MODEL_NAME} ---")
    try:
        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_NAME)
    except Exception as e:
        raise RuntimeError(f"Failed to download model: {e}")

    print("--- 2. Initializing Llama.cpp instance ---")
    try:
        # Use half of the logical CPU count for threads (os.cpu_count() may return None), minimum 1
        n_threads = max(1, (os.cpu_count() or 2) // 2)
        LLAMA_INSTANCE = Llama(
            model_path=model_path,
            n_ctx=4096,
            n_batch=512,
            n_threads=n_threads,
            n_gpu_layers=0,
            verbose=False
        )
        print("Llama.cpp model successfully loaded.")
    except Exception as e:
        raise RuntimeError(f"Llama instance initialization failed: {e}")


# --- 3. FastAPI Setup and Middleware ---
app = FastAPI(
    title="LLM Inference API (Llama.cpp)",
    description="API service for direct inference using Llama.cpp."
)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
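
# Note: per the CORS spec, browsers reject credentialed requests when the
# allowed origin is the wildcard "*"; restrict allow_origins if cookies or
# auth headers must cross origins.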

# --- 4. Pydantic Request Model ---
class InferenceRequestMinimal(BaseModel):
    """Data structure for a minimal inference request, accepting only a question."""
    question: str = Field(..., description="The user's input question or prompt.")
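
# Example request body accepted by the endpoints below (illustrative):
#   {"question": "What is AVX-512?"}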


# --- 5. Core Inference Function (Non-Streaming) ---
def get_inference_response(
    messages: List[Dict[str, str]],
    system_message: str,
    max_tokens: int,
    temperature: float = 0.7,
    top_p: float = 0.95,
) -> str:
    """Calls the Llama.cpp instance and returns a single text response."""
    if LLAMA_INSTANCE is None:
        raise HTTPException(status_code=503, detail="LLM Service not initialized.")
    
    # Prepend the system message to the conversation history
    full_messages = [{"role": "system", "content": system_message}]
    full_messages.extend(messages)
    
    try:
        response = LLAMA_INSTANCE.create_chat_completion(
            messages=full_messages,
            max_tokens=max_tokens,
            temperature=temperature,
            top_p=top_p,
        )
        
        # Safely extract the content
        content = response.get('choices', [{}])[0].get('message', {}).get('content')
        
        if content:
            return content
        
        return "⚠️ LLM service returned empty content."

    except Exception as e:
        print(f"[Error] LLM Inference failed: {e}")
        raise HTTPException(
            status_code=503,
            detail=f"LLM Server Response Error: {e}"
        )


# --- 6. FastAPI Routes ---
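
# Note: @app.on_event("startup") is deprecated in recent FastAPI releases in
# favor of lifespan handlers; it is kept here for brevity.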

@app.on_event("startup")
async def startup_event():
    """Execute model initialization when FastAPI starts up."""
    try:
        initialize_llm()
    except Exception as e:
        print(f"Application startup failed: {e}")
        # If initialization fails, LLAMA_INSTANCE stays None and inference endpoints will return 503.

@app.get("/", summary="Home/Health Check")
async def root():
    status = "running" if LLAMA_INSTANCE else "starting/failed (LLM unavailable)"
    return HTMLResponse(content=f"<html><body><h1>LLM API Status: {status}</h1></body></html>", status_code=200)


@app.post("/local/qwen-0-6b", summary="Execute Local LLM Inference (Minimal Input)")
async def infer_local_endpoint(request: InferenceRequestMinimal):
    """
    Executes inference using the local Llama.cpp instance.
    Returns a JSON with the 'response' field.
    """
    FIXED_SYSTEM_MESSAGE = "You are a friendly and concise assistant."
    FIXED_MAX_TOKENS = 4096

    try:
        messages = [{"role": "user", "content": request.question}]

        content = get_inference_response(
            messages=messages,
            system_message=FIXED_SYSTEM_MESSAGE,
            max_tokens=FIXED_MAX_TOKENS,
        )
        
        return JSONResponse(content={"response": content})

    except HTTPException:
        raise
    except Exception as e:
        print(f"[Fatal Error] During local API call: {e}")
        raise HTTPException(status_code=500, detail="Internal Server Error.")
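
# Example call (sketch; assumes the service is running on the default port set below):
#   curl -X POST http://localhost:7860/local/qwen-0-6b \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Hello"}'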
        

@app.post("/remote/amd", summary="Call External AMD LLM Space via Gradio Client")
async def infer_amd_endpoint(request: InferenceRequestMinimal):
    """
    Uses gradio_client to call the /chat API of the AMD_SPACE_ID.
    Input/output format is consistent with the local endpoint.
    """
    try:
        # Initialize Gradio Client using the global AMD_SPACE_ID
        client = Client(AMD_SPACE_ID)

        # Call the Space API
        result = client.predict(
            message=request.question,
            system_prompt="You are a helpful assistant.",
            temperature=0.7,
            api_name="/chat"
        )
        
        # Process and return result in the required format
        if isinstance(result, str):
            return JSONResponse(content={"response": result})
        else:
            raise ValueError("External API returned unexpected non-string format.")

    except Exception as e:
        print(f"[Fatal Error] Gradio Client API call failed: {e}")
        # Return 503 Service Unavailable for external API errors
        raise HTTPException(
            status_code=503,
            detail=f"External AMD LLM Service Error: {e}"
        )
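
# Example call (sketch; same request/response shape as the local endpoint):
#   curl -X POST http://localhost:7860/remote/amd \
#        -H "Content-Type: application/json" \
#        -d '{"question": "Hello"}'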

        
# --- 7. Application Startup ---
if __name__ == "__main__":
    print("FastAPI service is starting...")
    # "app:app" tells uvicorn to import the module 'app' (this file, which must be
    # saved as app.py) and serve the FastAPI object named 'app' defined in it.
    uvicorn.run("app:app", host="0.0.0.0", port=7860, reload=False)