Spaces:

himed001
/

ollama-coder

Running

root

Context 4096

f491e0d 9 days ago

2.97 kB

	import asyncio
	import json
	import threading
	import time
	from contextlib import asynccontextmanager
	from typing import Optional, Union

	from fastapi import FastAPI, HTTPException
	from pydantic import BaseModel

	# Process-wide model handle. Stays None until lifespan() assigns the loaded model.
	llm = None

	@asynccontextmanager
	async def lifespan(app: FastAPI):
	    """Load the model into the global ``llm`` on startup; release it on shutdown.

	    Everything before ``yield`` runs as the application starts: the GGUF
	    model is obtained through ``Llama.from_pretrained`` (per the startup
	    message this can take several minutes) and stored in the module-level
	    ``llm``. After the application stops, the reference is cleared again.

	    Args:
	        app: The FastAPI application this lifespan is attached to (unused).
	    """
	    global llm
	    # Imported here rather than at module level, so llama_cpp is only
	    # loaded when the application actually starts.
	    from llama_cpp import Llama

	    print("Descargando modelo Qwen2.5-Coder-7B... puede tardar varios minutos.")
	    # perf_counter() is monotonic, so the reported duration cannot be skewed
	    # by wall-clock adjustments that happen during a long load.
	    start = time.perf_counter()

	    llm = Llama.from_pretrained(
	        repo_id="bartowski/Qwen2.5-Coder-7B-Instruct-GGUF",
	        filename="Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf",
	        n_ctx=4096,
	        n_threads=2,
	        n_batch=256,
	        verbose=False,
	    )

	    elapsed = time.perf_counter() - start
	    print(f"Modelo listo en {elapsed:.0f} segundos")
	    try:
	        yield
	    finally:
	        # Drop the reference on shutdown so the model can be reclaimed and
	        # the module no longer reports it as loaded.
	        llm = None

	# FastAPI application object; model loading is delegated to the `lifespan` context manager.
	app = FastAPI(title="Code Agent LLM", lifespan=lifespan)

	class Message(BaseModel):
	    """One chat turn as supplied in a request's ``messages`` list."""

	    # Speaker of the turn; any string is accepted and forwarded as-is.
	    role: str
	    # Text of the turn.
	    content: str

	class Tool(BaseModel):
	    """A tool description that a client may attach to a chat request."""

	    # Kind of tool; defaults to "function" when the client omits it.
	    type: str = "function"
	    # Free-form description of the tool; its inner structure is not validated.
	    function: dict

	class ChatRequest(BaseModel):
	    """Request body accepted by ``POST /v1/chat/completions``."""

	    # Model name requested by the client.
	    model: str = "qwen2.5-coder-7b"
	    # Conversation so far, in order.
	    messages: list[Message]
	    # Declared Optional so an explicit `"tools": null` validates instead of
	    # being rejected; the default is unchanged.
	    tools: Optional[list[Tool]] = None
	    # Either a mode string or an object; accepting both keeps clients that
	    # send the object form from failing validation.
	    tool_choice: Union[str, dict] = "auto"
	    # Sampling temperature forwarded to the model.
	    temperature: float = 0.7
	    # Upper bound on generated tokens forwarded to the model.
	    max_tokens: int = 2048
	    # Accepted for client compatibility.
	    stream: bool = False

	def build_system_with_tools(tools):
	    """Compose the system prompt that advertises ``tools`` to the model.

	    Args:
	        tools: Iterable of tool descriptions. Items exposing ``model_dump``
	            are converted through it; anything else is serialized as given.

	    Returns:
	        The instruction text followed by the tools rendered as JSON with a
	        two-space indent.
	    """
	    serializable = []
	    for tool in tools:
	        if hasattr(tool, "model_dump"):
	            serializable.append(tool.model_dump())
	        else:
	            serializable.append(tool)
	    tools_str = json.dumps(serializable, indent=2)

	    instructions = "".join(
	        [
	            "You are a helpful coding assistant with access to tools.\n",
	            "When a tool can help, call it by responding ONLY with a JSON block:\n",
	            "```tool_call\n",
	            '{"name": "<tool_name>", "arguments": {<args>}}\n',
	            "```\n",
	            "Do NOT include any other text when making a tool call.\n",
	            "When answering directly, respond normally in markdown.\n\n",
	        ]
	    )
	    return instructions + f"Available tools:\n{tools_str}"

	@app.get("/v1/chat/completions")
	async def chat_completion_get():
	    """Answer GET requests on the chat endpoint with a fixed status payload."""
	    probe_reply = {
	        "status": "ok",
	        "model": "qwen2.5-coder-7b",
	    }
	    return probe_reply

	# Only one generation runs at a time on the shared model: concurrent use of
	# a single Llama instance from several threads is not assumed to be safe.
	_llm_lock = threading.Lock()


	def _run_chat_completion(model, messages, temperature, max_tokens):
	    """Run one blocking chat completion on ``model`` while holding the lock."""
	    with _llm_lock:
	        return model.create_chat_completion(
	            messages=messages,
	            temperature=temperature,
	            max_tokens=max_tokens,
	        )


	@app.post("/v1/chat/completions")
	async def chat_completion(request: ChatRequest):
	    """Generate a chat completion for ``request`` with the loaded model.

	    When tools are supplied (and not disabled via ``tool_choice == "none"``),
	    a system message describing them is placed ahead of the caller's
	    messages. ``request.stream`` is not forwarded to the model call.

	    Args:
	        request: Validated request body.

	    Returns:
	        Whatever ``create_chat_completion`` returns, passed through unchanged.

	    Raises:
	        HTTPException: 503 while the global model is still ``None``; 500
	            (with the error text as detail) if building the prompt or
	            running the model raises.
	    """
	    model = llm
	    if model is None:
	        raise HTTPException(status_code=503, detail="Model still loading")
	    try:
	        messages = []
	        # "none" means the caller forbids tool calls, so the tool prompt
	        # must not be injected even if tool definitions were sent.
	        if request.tools and request.tool_choice != "none":
	            messages.append({
	                "role": "system",
	                "content": build_system_with_tools(request.tools),
	            })
	        messages.extend(
	            {"role": m.role, "content": m.content} for m in request.messages
	        )
	        # Generation is a long blocking call; running it in a worker thread
	        # keeps the event loop free to serve other requests meanwhile.
	        return await asyncio.to_thread(
	            _run_chat_completion,
	            model,
	            messages,
	            request.temperature,
	            request.max_tokens,
	        )
	    except Exception as e:
	        raise HTTPException(status_code=500, detail=str(e)) from e

	@app.get("/v1/models")
	async def list_models():
	    """Return the single model served here, in an OpenAI-style list envelope.

	    Returns:
	        A dict with ``"object": "list"`` and a one-element ``"data"`` list.
	        ``created`` is the current time at the moment of the call.
	    """
	    return {
	        # Top-level marker of the OpenAI list shape; added alongside the
	        # existing "data" key so current consumers are unaffected.
	        "object": "list",
	        "data": [{
	            "id": "qwen2.5-coder-7b",
	            "object": "model",
	            "created": int(time.time()),
	            "owned_by": "local",
	        }],
	    }

	@app.get("/health")
	async def health():
	    """Report liveness and whether the global model has been assigned."""
	    model_loaded = llm is not None
	    return {"status": "ok", "model_loaded": model_loaded}