"""Minimal FastAPI server exposing an OpenAI-style /v1/chat/completions endpoint
backed by a local GGUF build of Qwen2.5-3B-Instruct, served with llama-cpp-python."""

import os
from typing import List, Optional

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse
from llama_cpp import Llama
from pydantic import BaseModel, Field

# Quantized GGUF model file; fail fast at startup if it is missing.
MODEL_PATH = "/app/models/Qwen2.5-3B-Instruct-Q4_K_M.gguf"
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"Model not found at {MODEL_PATH}")

llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=4096,            # context window size (tokens)
    n_threads=4,           # CPU threads used for generation
    chat_format="chatml",  # Qwen2.5-Instruct uses the ChatML prompt template
    hf_pretrained_model_name_or_path="Qwen/Qwen2.5-3B-Instruct",
    verbose=False,
)
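
# A quick manual smoke test of the loaded model (illustrative only, not part of
# the app; `create_chat_completion` returns an OpenAI-style response dict):
#
#     out = llm.create_chat_completion(
#         messages=[{"role": "user", "content": "Say hello in one sentence."}],
#         max_tokens=32,
#     )
#     print(out["choices"][0]["message"]["content"])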


app = FastAPI(title="Qwen2.5-3B API", version="0.1.0")


class Message(BaseModel):
    role: str = Field(..., description="Role: 'system', 'user', or 'assistant'")
    content: str = Field(..., description="Message content")


class ChatRequest(BaseModel):
    model: str = Field(..., description="Model identifier (ignored; this server hosts a single model)")
    messages: List[Message] = Field(..., description="Conversation messages, in order")
    max_tokens: Optional[int] = Field(None, description="Maximum number of tokens to generate")
    stream: Optional[bool] = Field(False, description="Streaming is not supported; must be false")


@app.post("/v1/chat/completions")
async def chat_completion(req: ChatRequest):
    """OpenAI-style chat completion endpoint (non-streaming only)."""
    if req.stream:
        # Streaming is declared unsupported in ChatRequest; reject it explicitly
        # rather than returning a generator that JSONResponse cannot serialize.
        raise HTTPException(status_code=400, detail="Streaming is not supported")
    try:
        result = llm.create_chat_completion(
            messages=[{"role": m.role, "content": m.content} for m in req.messages],
            max_tokens=req.max_tokens,
        )
        return JSONResponse(content=result)
    except Exception as e:
        # Inference failures are server-side errors, not client mistakes.
        raise HTTPException(status_code=500, detail=str(e))


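# Example request against a locally running instance (hypothetical host/port,
# assuming the default PORT of 7860; the "model" value can be any string since
# the field is ignored):
#
#     curl -s http://localhost:7860/v1/chat/completions \
#       -H "Content-Type: application/json" \
#       -d '{"model": "qwen2.5-3b", "messages": [{"role": "user", "content": "Hello"}]}'

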
@app.get("/healthz")
async def healthz():
    """Simple liveness check."""
    return {"status": "ok"}
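
# Liveness check example (hypothetical local run):
#
#     curl http://localhost:7860/healthz
#     {"status": "ok"}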


if __name__ == "__main__":
    import uvicorn

    # Bind to all interfaces; the port can be overridden via the PORT env var.
    port = int(os.getenv("PORT", "7860"))
    uvicorn.run(app, host="0.0.0.0", port=port)
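
# To run without the __main__ block (assuming this module is saved as app.py):
#
#     uvicorn app:app --host 0.0.0.0 --port 7860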