| import os |
| import gc |
| from fastapi import FastAPI, HTTPException |
| from fastapi.middleware.cors import CORSMiddleware |
| from pydantic import BaseModel |
| from llama_cpp import Llama |
| from huggingface_hub import hf_hub_download |
|
|
# ASGI application object; the route decorators in this module register on it.
app = FastAPI()

# CORS: accept cross-origin requests from any origin, with any method and
# any headers. Credentials are deliberately disabled: a wildcard origin must
# not be combined with credentialed requests (the CORS spec forbids it and
# the middleware would otherwise reflect whatever origin the browser sends).
# This API uses no cookies or auth headers, so nothing depends on them.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
# Process-wide model state, read and rebound by the /chat handler via
# `global`.
#
# Key ("<repo_id>/<filename>") of the most recently loaded model; starts empty.
current_id = ""
# The loaded model instance, or None until a request loads one.
model = None
|
|
class ChatRequest(BaseModel):
    """JSON body accepted by ``POST /chat``."""

    # Hub repository and the file inside it; both are handed to
    # hf_hub_download, and together they form the model's cache key.
    repo_id: str
    filename: str

    # The user's message, and the system text placed before it in the prompt.
    prompt: str
    system_prompt: str = "You are a helpful assistant."

    # Generation controls forwarded unchanged to create_completion.
    max_tokens: int = 512
    temperature: float = 0.7
|
|
| |
@app.get("/")
async def health():
    """Return a static payload indicating the API is up."""
    payload = {
        "status": "online",
        "message": "API is running. Use POST /chat to interact.",
    }
    return payload
|
|
@app.post("/chat")
async def chat(request: ChatRequest):
    """Generate a completion for ``request.prompt`` with the requested model.

    The model identified by ``repo_id``/``filename`` is downloaded and loaded
    on first use, then kept in the module-level ``model`` global. A request
    naming a different model replaces the loaded one.

    Args:
        request: Model coordinates, prompts and generation settings.

    Returns:
        A dict with ``"response"`` (the stripped completion text) and
        ``"model"`` (the ``"<repo_id>/<filename>"`` key of the model used).

    Raises:
        HTTPException: Status 500 with the error message as detail if the
            download, the model load or the generation fails.
    """
    global model, current_id
    new_id = f"{request.repo_id}/{request.filename}"

    # NOTE: the download, load and generation calls below are synchronous, so
    # they run on the event loop and other requests wait until they finish.
    try:
        if model is None or current_id != new_id:
            # Resolve the file first: a bad repo_id/filename then fails
            # before the currently loaded model has been evicted.
            path = hf_hub_download(repo_id=request.repo_id, filename=request.filename)

            if model is not None:
                # Rebind instead of `del model`: deleting the global leaves
                # the name undefined, so a failed load below would make every
                # later request die with NameError on the `model is None`
                # check. Dropping the reference releases the old model
                # before the new one is allocated.
                model = None
                current_id = ""
                gc.collect()

            model = Llama(
                model_path=path,
                n_ctx=2048,
                n_threads=os.cpu_count() or 4,
                n_gpu_layers=0,
                verbose=False,
            )
            current_id = new_id

        full_prompt = f"System: {request.system_prompt}\nUser: {request.prompt}\nAssistant:"
        output = model.create_completion(
            prompt=full_prompt,
            max_tokens=request.max_tokens,
            temperature=request.temperature,
            # Stop when the model starts writing the next turn itself.
            stop=["User:", "System:", "</s>"],
        )

        return {
            "response": output["choices"][0]["text"].strip(),
            "model": current_id,
        }
    except Exception as e:
        # Endpoint boundary: report any failure as a 500 and keep the
        # original exception chained for server-side tracebacks.
        raise HTTPException(status_code=500, detail=str(e)) from e
|
|
if __name__ == "__main__":
    # Script entry point: serve the app on all interfaces, port 7860.
    from uvicorn import run as serve

    serve(app, host="0.0.0.0", port=7860)