import os
import threading

from fastapi import FastAPI, HTTPException
from fastapi.concurrency import run_in_threadpool
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from pydantic import BaseModel
|
|
# ASGI application object that the route decorators in this module attach to.
app = FastAPI()

# Hugging Face Hub repository and file name of the GGUF weights to serve.
MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"

# NOTE: the statements below run at import time, so importing this module
# fetches the weights via hf_hub_download and loads them before any request
# can be served.
print("Downloading Qwen2.5-3B model (Faster & Smarter)...")
model_path = hf_hub_download(filename=MODEL_FILE, repo_id=MODEL_REPO)

print("Loading model for fast CPU inference...")
# Constructor options for the single Llama instance shared by all requests.
_LLAMA_SETTINGS = {
    "n_ctx": 2048,
    "n_threads": 2,
    "n_batch": 512,
    "verbose": False,
}
llm = Llama(model_path=model_path, **_LLAMA_SETTINGS)
|
|
class PromptRequest(BaseModel):
    """Request body accepted by ``POST /api``."""

    # User text; the /api handler inserts it into the user turn of the prompt.
    prompt: str
|
|
@app.get("/")
def read_root():
    """Return a short banner pointing clients at the ``POST /api`` endpoint."""
    banner = "High-Speed Qwen2.5-3B API is running. Use POST /api."
    return {"message": banner}
|
|
@app.get("/health")
def health_check():
    """Report a fixed ``alive`` status; no model call is made."""
    return dict(status="alive")
|
|
# One module-level ``llm`` instance is shared by every request. Calls are
# serialised through this lock because overlapping generations on a single
# model object are not known to be safe -- TODO confirm against the
# llama-cpp-python version in use.
_llm_lock = threading.Lock()


def _run_completion(formatted_prompt):
    """Run one blocking completion on the shared model and return its result.

    Meant to be called from a worker thread. Holds ``_llm_lock`` for the whole
    call so that only one generation runs at a time.
    """
    with _llm_lock:
        return llm(
            formatted_prompt,
            max_tokens=1024,
            stop=["<|im_end|>"],
            echo=False,
        )


@app.post("/api")
async def generate_response(request: PromptRequest):
    """Generate an assistant reply for ``request.prompt``.

    The prompt is wrapped in a ChatML-style template with a fixed system
    message. Inference is blocking, so it is pushed to a worker thread; running
    it directly in this coroutine would freeze the event loop (and every other
    request served by it) until generation finished.

    Args:
        request: Parsed request body carrying the user's prompt.

    Returns:
        ``{"status": "success", "text": <reply>}`` where the reply is the text
        of the first returned choice with surrounding whitespace stripped.

    Raises:
        HTTPException: Status 500, with the error message as ``detail``, if
            generation or result extraction fails.
    """
    # NOTE(review): the prompt is interpolated verbatim, so a client can embed
    # "<|im_start|>" / "<|im_end|>" markers and forge extra turns. Left
    # unchanged to preserve behaviour; consider rejecting or stripping them.
    formatted_prompt = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
        f"<|im_start|>user\n{request.prompt}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    try:
        output = await run_in_threadpool(_run_completion, formatted_prompt)
        response_text = output["choices"][0]["text"].strip()
    except Exception as e:
        print(f"Error: {e}")
        # Chain the cause so the original traceback is kept for debugging.
        raise HTTPException(status_code=500, detail=str(e)) from e

    return {"status": "success", "text": response_text}
|
|
if __name__ == "__main__":
    # Imported here so uvicorn is only loaded when the file is run as a script.
    import uvicorn

    # Address and port the development server listens on.
    bind = {"host": "0.0.0.0", "port": 7860}
    uvicorn.run(app, **bind)
|
|