```python
from fastapi import FastAPI
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
import os

# GGUF model configuration
REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"

app = FastAPI()

# Download and cache the GGUF model
print(f"Downloading {FILENAME} from {REPO_ID}...")
model_path = hf_hub_download(
    repo_id=REPO_ID,
    filename=FILENAME,
    cache_dir=os.getenv("HF_HOME", "./models"),
)
print(f"Model downloaded to: {model_path}")

# Load the model with llama-cpp-python
print("Loading model into memory...")
llm = Llama(
    model_path=model_path,
    n_ctx=2048,       # Context window
    n_threads=4,      # CPU threads
    n_gpu_layers=0,   # CPU only (set >0 to offload layers if a GPU is available)
    verbose=False,
)
print("Model loaded successfully!")

@app.post("/chat")
def chat(req: dict):
    messages = req.get("messages", [])
    max_tokens = req.get("max_tokens", 256)
    temperature = req.get("temperature", 0.7)

    # Use llama-cpp-python's built-in chat completion
    response = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        stop=["</s>", "User:", "###"],
    )
    return {
        "choices": [{
            "message": {
                "role": "assistant",
                "content": response["choices"][0]["message"]["content"],
            }
        }]
    }

@app.get("/")
def root():
    return {"status": "DeepSeek API is online (GGUF)"}
```