from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download


app = FastAPI()

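# Quantized GGUF build of Gemma-2-2B-IT; Q4_K_M is a 4-bit "medium"
# quant that trades a little quality for a small, CPU-friendly footprint.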
MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"

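# hf_hub_download caches the file in the standard Hugging Face cache dir,
# so restarts reuse the existing download instead of re-fetching.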
| print("Downloading Gemma-2-2B model...") |
| model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE) |
|
|
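# n_ctx caps the combined prompt + completion length in tokens;
# n_threads should typically match the CPU cores available to the host.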
| print("Loading Gemma for high-quality CPU inference...") |
| llm = Llama( |
| model_path=model_path, |
| n_ctx=2048, |
| n_threads=2, |
| verbose=False |
| ) |
|
|
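# Request body schema: {"prompt": "<user text>"}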
class PromptRequest(BaseModel):
    prompt: str

| @app.get("/") |
| def read_root(): |
| return {"message": "Gemma-2-2B-IT API is running."} |
|
|
| @app.get("/health") |
| def health_check(): |
| return {"status": "alive"} |
|
|
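# Declared as a plain `def` (not `async def`) so FastAPI runs it in a
# worker thread: llama-cpp-python inference is blocking and would stall
# the event loop inside a coroutine.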
| @app.post("/api") |
| async def generate_response(request: PromptRequest): |
| try: |
| |
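        # Wrap the raw prompt in Gemma's chat template; the model is
        # instruction-tuned on these <start_of_turn>/<end_of_turn> markers.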
        formatted_prompt = f"<start_of_turn>user\n{request.prompt}<end_of_turn>\n<start_of_turn>model\n"

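        # Stop on <end_of_turn> so generation ends with the model's reply
        # instead of running on into a fabricated next user turn.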
        output = llm(
            formatted_prompt,
            max_tokens=1024,
            stop=["<end_of_turn>"],
            echo=False
        )

        response_text = output['choices'][0]['text'].strip()

        return {
            "status": "success",
            "text": response_text
        }
    except Exception as e:
        print(f"Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))

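# Example request once the server is running (port 7860, as configured below):
#   curl -X POST http://localhost:7860/api \
#        -H "Content-Type: application/json" \
#        -d '{"prompt": "Explain GGUF quantization in one sentence."}'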
if __name__ == "__main__":
    import uvicorn
    uvicorn.run(app, host="0.0.0.0", port=7860)