Spaces:

aiqknow
/

Qwen2-api

Sleeping

App Files Files Community

Qwen2-api / app.py

aiqknow

Upload 4 files

063c231 verified 19 days ago

raw

history blame contribute delete

1.81 kB

	import os
	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel
	from llama_cpp import Llama
	from huggingface_hub import hf_hub_download

	app = FastAPI()

	# Optimized Model Configuration (Qwen2.5-3B is faster than Phi-3)
	MODEL_REPO = "bartowski/Qwen2.5-3B-Instruct-GGUF"
	MODEL_FILE = "Qwen2.5-3B-Instruct-Q4_K_M.gguf"

	print("Downloading Qwen2.5-3B model (Faster & Smarter)...")
	model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)

	print("Loading model for fast CPU inference...")
	llm = Llama(
	model_path=model_path,
	n_ctx=2048, # Context window
	n_threads=2, # HF Free tier has 2 vCPUs
	n_batch=512, # Process 512 tokens at once for speed
	verbose=False
	)

	class PromptRequest(BaseModel):
	prompt: str

	@app.get("/")
	def read_root():
	return {"message": "High-Speed Qwen2.5-3B API is running. Use POST /api."}

	@app.get("/health")
	def health_check():
	return {"status": "alive"}

	@app.post("/api")
	async def generate_response(request: PromptRequest):
	try:
	# Qwen2.5 uses ChatML format for best results
	formatted_prompt = f"<\|im_start\|>system\nYou are a helpful assistant.<\|im_end\|>\n<\|im_start\|>user\n{request.prompt}<\|im_end\|>\n<\|im_start\|>assistant\n"

	output = llm(
	formatted_prompt,
	max_tokens=1024, # Increased limit
	stop=["<\|im_end\|>"],
	echo=False
	)

	response_text = output['choices'][0]['text'].strip()

	return {
	"status": "success",
	"text": response_text
	}
	except Exception as e:
	print(f"Error: {e}")
	raise HTTPException(status_code=500, detail=str(e))

	if __name__ == "__main__":
	import uvicorn
	uvicorn.run(app, host="0.0.0.0", port=7860)