import os from fastapi import FastAPI, HTTPException from pydantic import BaseModel from llama_cpp import Llama from huggingface_hub import hf_hub_download app = FastAPI() # Switching to Gemma-2-2B-Instruct (High Quality & Good Speed) MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF" MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf" print("Downloading Gemma-2-2B model...") model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE) print("Loading Gemma for high-quality CPU inference...") llm = Llama( model_path=model_path, n_ctx=2048, n_threads=2, verbose=False ) class PromptRequest(BaseModel): prompt: str @app.get("/") def read_root(): return {"message": "Gemma-2-2B-IT API is running."} @app.get("/health") def health_check(): return {"status": "alive"} @app.post("/api") async def generate_response(request: PromptRequest): try: # Gemma 2 Prompt Format formatted_prompt = f"user\n{request.prompt}\nmodel\n" output = llm( formatted_prompt, max_tokens=1024, stop=[""], echo=False ) response_text = output['choices'][0]['text'].strip() return { "status": "success", "text": response_text } except Exception as e: print(f"Error: {e}") raise HTTPException(status_code=500, detail=str(e)) if __name__ == "__main__": import uvicorn uvicorn.run(app, host="0.0.0.0", port=7860)