Spaces:
Running
Running
import json
import time
from contextlib import asynccontextmanager
from typing import Optional

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
# Module-wide model handle. It stays None until lifespan() has finished
# loading the model; the request handlers check it before use.
llm = None


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Load the GGUF model before the application starts serving.

    Everything before the ``yield`` runs at startup; nothing runs after it,
    so no explicit teardown is performed on shutdown.

    Args:
        app: The FastAPI application this lifespan is attached to (unused).
    """
    global llm
    # Imported here rather than at module top, so llama_cpp is only required
    # once the application actually starts.
    from llama_cpp import Llama

    print("Descargando modelo Qwen2.5-Coder-7B... puede tardar varios minutos.")
    # perf_counter is monotonic, so the reported duration cannot be skewed by
    # wall-clock adjustments during a long load.
    start = time.perf_counter()
    llm = Llama.from_pretrained(
        repo_id="bartowski/Qwen2.5-Coder-7B-Instruct-GGUF",
        filename="Qwen2.5-Coder-7B-Instruct-Q4_K_M.gguf",
        n_ctx=4096,
        n_threads=2,
        n_batch=256,
        verbose=False,
    )
    elapsed = time.perf_counter() - start
    print(f"Modelo listo en {elapsed:.0f} segundos")
    yield


app = FastAPI(title="Code Agent LLM", lifespan=lifespan)
class Message(BaseModel):
    """A single chat message supplied by the client."""

    # Speaker role; forwarded unchanged to the model's message list.
    role: str
    # Text of the message; forwarded unchanged to the model's message list.
    content: str
class Tool(BaseModel):
    """A tool description supplied by the client."""

    # Kind of tool; defaults to "function" when the client omits it.
    type: str = "function"
    # Free-form description of the function; only checked to be a dict.
    function: dict
class ChatRequest(BaseModel):
    """Request body accepted by the chat-completion handler.

    Only ``messages`` is required; every other field has a default.
    """

    model: str = "qwen2.5-coder-7b"
    messages: list[Message]
    # Optional so that an explicit JSON null is accepted, not only omission;
    # the previous `list[Tool] = None` annotation rejected null on validation.
    tools: Optional[list[Tool]] = None
    tool_choice: str = "auto"
    temperature: float = 0.7
    max_tokens: int = 2048
    stream: bool = False
def build_system_with_tools(tools):
    """Build the system prompt that advertises the available tools.

    Args:
        tools: Iterable of tool descriptions. Items exposing ``model_dump``
            are converted through it; anything else is serialized as given.

    Returns:
        The instruction text followed by the tools rendered as indented JSON.
    """
    serializable = []
    for tool in tools:
        if hasattr(tool, "model_dump"):
            serializable.append(tool.model_dump())
        else:
            serializable.append(tool)
    catalog = json.dumps(serializable, indent=2)

    # Fixed instructions describing the tool-call reply format.
    instructions = "".join(
        [
            "You are a helpful coding assistant with access to tools.\n",
            "When a tool can help, call it by responding ONLY with a JSON block:\n",
            "```tool_call\n",
            '{"name": "<tool_name>", "arguments": {<args>}}\n',
            "```\n",
            "Do NOT include any other text when making a tool call.\n",
            "When answering directly, respond normally in markdown.\n\n",
        ]
    )
    return instructions + f"Available tools:\n{catalog}"
# NOTE(review): no route decorator is visible for this handler in this view;
# confirm that it is registered on `app`.
async def chat_completion_get():
    """Return a static status payload naming the served model."""
    return dict(status="ok", model="qwen2.5-coder-7b")
# NOTE(review): no route decorator is visible for this handler in this view;
# confirm that it is registered on `app`.
async def chat_completion(request: ChatRequest):
    """Run one chat completion against the loaded model.

    When the request carries tools and ``tool_choice`` is not ``"none"``, a
    system message describing the tools is placed ahead of the client's
    messages.

    Args:
        request: Validated request body.

    Returns:
        Whatever ``llm.create_chat_completion`` returns, unmodified.

    Raises:
        HTTPException: 503 while the model is not loaded yet; 500 (with the
            original exception chained) when the model call fails.
    """
    if llm is None:
        raise HTTPException(status_code=503, detail="Model still loading")

    messages = []
    # tool_choice="none" asks for a plain answer, so the tool-calling prompt
    # is left out even when tools are supplied.
    if request.tools and request.tool_choice != "none":
        messages.append({
            "role": "system",
            "content": build_system_with_tools(request.tools),
        })
    messages.extend(
        {"role": m.role, "content": m.content} for m in request.messages
    )

    # NOTE(review): this is a synchronous call made directly inside the
    # coroutine, so other requests presumably wait until generation finishes.
    try:
        return llm.create_chat_completion(
            messages=messages,
            temperature=request.temperature,
            max_tokens=request.max_tokens,
        )
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e)) from e
# NOTE(review): no route decorator is visible for this handler in this view;
# confirm that it is registered on `app`.
async def list_models():
    """Return the model listing, which contains the single local model.

    The ``created`` field is the current Unix time, recomputed on every call.
    """
    now = int(time.time())
    entry = {
        "id": "qwen2.5-coder-7b",
        "object": "model",
        "created": now,
        "owned_by": "local",
    }
    return {"data": [entry]}
# NOTE(review): no route decorator is visible for this handler in this view;
# confirm that it is registered on `app`.
async def health():
    """Report liveness and whether the module-level model handle is set."""
    model_loaded = llm is not None
    return {"status": "ok", "model_loaded": model_loaded}