from contextlib import asynccontextmanager

from fastapi import FastAPI, Request
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Populated once at startup by the lifespan handler below.
llm = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Download and load the model once, before the app starts serving."""
    global llm
    print("📥 Downloading Gemma-3 from the Hub...")
    # hf_hub_download fetches the GGUF file into the local HF cache and
    # returns its path; later restarts reuse the cached copy.
    model_path = hf_hub_download(
        repo_id="mradermacher/gemma-3-4b-it-GGUF",
        filename="gemma-3-4b-it.Q4_K_M.gguf",
    )
    print("🚀 Loading model...")
    llm = Llama(
        model_path=model_path,
        n_ctx=2048,    # context window in tokens
        n_threads=2,   # Hugging Face free tier usually has 2 vCPUs
    )
    print("✅ Ready!")
    yield
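
# FastAPI runs the startup half of the lifespan handler to completion before
# accepting traffic, so no request can ever reach a half-loaded model.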
app = FastAPI(lifespan=lifespan)

@app.post("/completion")
async def completion(request: Request):
    data = await request.json()
    prompt = data.get("prompt", "")

    # Gemma-3 chat template: each turn is wrapped in <start_of_turn> /
    # <end_of_turn> markers, and the reply turn is opened with the "model"
    # role (llama.cpp adds the BOS token itself during tokenization).
    formatted_prompt = (
        f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
    )

    output = llm(
        formatted_prompt,
        max_tokens=512,
        stop=["<end_of_turn>"],
    )
    return {"content": output["choices"][0]["text"]}

@app.get("/")
def home():
    return {"message": "Gemma-3 API is running on Hugging Face"}

if __name__ == "__main__":
    import uvicorn

    # 7860 is the port Hugging Face Spaces expects the app to bind to.
    uvicorn.run(app, host="0.0.0.0", port=7860)