# ollama-coder / app.py
# root
# Context 4096
# f491e0d
# Raw
# History Blame Contribute Delete
# 2.97 kB
import asyncio
import json
import threading
import time
from contextlib import asynccontextmanager
from typing import Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# Shared model handle. Stays None until the lifespan handler assigns the
# loaded Llama instance; request handlers read it to decide whether they
# can serve.
llm = None
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the GGUF model when the application starts and release it on exit.

    Everything before ``yield`` runs at startup: the model is fetched via
    ``Llama.from_pretrained`` and stored in the module-level ``llm``.
    Everything after ``yield`` runs at shutdown and clears that reference.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global llm
    # Imported here rather than at module top, so importing this module does
    # not require llama_cpp until the application actually starts.
    from llama_cpp import Llama

    print("Descargando modelo Qwen2.5-Coder-7B... puede tardar varios minutos.")
    # Monotonic clock: the reported duration cannot be skewed by wall-clock
    # adjustments that happen while the model is loading.
    start = time.monotonic()
    llm = Llama.from_pretrained(
        repo_id="bartowski/Qwen2.5-Coder-7B-Instruct-GGUF",
        filename="Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf",
        n_ctx=4096,
        n_threads=2,
        n_batch=256,
        verbose=False,
    )
    elapsed = time.monotonic() - start
    print(f"Modelo listo en {elapsed:.0f} segundos")
    try:
        yield
    finally:
        # Drop the module-level reference at shutdown so the model can be
        # reclaimed and no stale handle survives an application restart
        # within the same process.
        llm = None
# Application object; the lifespan handler defined above is registered so the
# model load happens as part of application startup.
app = FastAPI(title="Code Agent LLM", lifespan=lifespan)
class Message(BaseModel):
    """A single chat message supplied by the client."""

    role: str  # forwarded unchanged as the "role" of the message given to the model
    content: str  # message text, forwarded unchanged as "content"
class Tool(BaseModel):
    """A tool definition the client makes available to the model."""

    type: str = "function"  # defaults to "function" when the client omits it
    function: dict  # unconstrained mapping; dumped as JSON into the system prompt
class ChatRequest(BaseModel):
    """Request body accepted by the POST chat completion endpoint.

    Only ``messages``, ``tools``, ``temperature`` and ``max_tokens`` are read
    by the chat handler in this file; ``model``, ``tool_choice`` and
    ``stream`` are accepted for client compatibility but not consulted there.
    """

    model: str = "qwen2.5-coder-7b"
    messages: list[Message]
    # Declared Optional so that an omitted field and an explicit JSON null are
    # both accepted; a bare ``list[Tool]`` with a None default rejects null
    # during validation even though None is the default value.
    tools: Optional[list[Tool]] = None
    tool_choice: str = "auto"
    temperature: float = 0.7
    max_tokens: int = 2048
    stream: bool = False
def build_system_with_tools(tools):
    """Build the system prompt that advertises *tools* to the model.

    Args:
        tools: Iterable of tool definitions. Items exposing ``model_dump``
            are converted through it; anything else is serialized as-is.

    Returns:
        The instruction text followed by the tool list rendered as
        2-space-indented JSON.
    """
    serializable = []
    for tool in tools:
        if hasattr(tool, "model_dump"):
            serializable.append(tool.model_dump())
        else:
            serializable.append(tool)
    catalog = json.dumps(serializable, indent=2)

    # One prompt line per entry; the empty entry yields the blank line that
    # separates the instructions from the tool catalog.
    prompt_lines = [
        "You are a helpful coding assistant with access to tools.",
        "When a tool can help, call it by responding ONLY with a JSON block:",
        "```tool_call",
        '{"name": "<tool_name>", "arguments": {<args>}}',
        "```",
        "Do NOT include any other text when making a tool call.",
        "When answering directly, respond normally in markdown.",
        "",
        "Available tools:",
        catalog,
    ]
    return "\n".join(prompt_lines)
@app.get("/v1/chat/completions")
async def chat_completion_get():
    """Answer GET requests on the chat completions path with a static status payload."""
    served_model = "qwen2.5-coder-7b"
    return dict(status="ok", model=served_model)
# A single Llama instance is shared by every request. Generation calls are
# serialized through this lock because running them concurrently on one
# instance is not assumed to be safe.
# NOTE(review): confirm against the llama-cpp-python version in use.
_LLM_LOCK = threading.Lock()


def _run_chat_completion(messages, temperature, max_tokens):
    """Run one blocking completion on the shared model while holding the lock."""
    with _LLM_LOCK:
        return llm.create_chat_completion(
            messages=messages,
            temperature=temperature,
            max_tokens=max_tokens,
        )


@app.post("/v1/chat/completions")
async def chat_completion(request: ChatRequest):
    """Generate a chat completion for the supplied conversation.

    When the request carries tools, a system message describing them is
    placed ahead of the client's messages. ``request.stream`` and
    ``request.tool_choice`` are not consulted here.

    Args:
        request: Validated request body.

    Returns:
        Whatever ``llm.create_chat_completion`` returns, unmodified.

    Raises:
        HTTPException: 503 if the model handle is still ``None``; 500 with
            the error text if building the prompt or running inference fails.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model still loading")
    try:
        # Prompt construction stays inside the try block so that any failure
        # here is reported through the same 500 response as before.
        messages = []
        if request.tools:
            messages.append({
                "role": "system",
                "content": build_system_with_tools(request.tools),
            })
        messages.extend(
            {"role": m.role, "content": m.content} for m in request.messages
        )
        # Inference is a long blocking call; running it in a worker thread
        # keeps the event loop free to serve other endpoints meanwhile.
        return await asyncio.to_thread(
            _run_chat_completion,
            messages,
            request.temperature,
            request.max_tokens,
        )
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs.
        raise HTTPException(status_code=500, detail=str(e)) from e
@app.get("/v1/models")
async def list_models():
    """List the single model served by this application.

    Returns:
        A list envelope (``"object": "list"``) whose ``data`` holds one model
        entry. ``created`` is the current Unix time at the moment of the call.
    """
    model_entry = {
        "id": "qwen2.5-coder-7b",
        "object": "model",
        "created": int(time.time()),
        "owned_by": "local",
    }
    # The top-level "object" marker completes the list envelope; existing
    # consumers that only read "data" are unaffected.
    return {"object": "list", "data": [model_entry]}
@app.get("/health")
async def health():
    """Report service status and whether the model handle has been populated."""
    model_ready = llm is not None
    return dict(status="ok", model_loaded=model_ready)