Madras1 committed
Commit 5a91f27 · verified · Parent(s): b1c1e2c

Create app.py

Files changed (1): app.py (+110 −0)
app.py ADDED
@@ -0,0 +1,110 @@
+import os
+import torch
+from fastapi import FastAPI, HTTPException
+from pydantic import BaseModel
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread
+import spaces
+
+# --- Configuration ---
+# Start with a powerful model that fits comfortably on the H200
+MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct"
+# Or, for something lighter: "meta-llama/Llama-3.1-8B-Instruct"
+
+app = FastAPI(title="APIDOST - Gabriel's Router")
+
+print(f"🔄 Loading model: {MODEL_ID}...")
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    device_map="auto"
+)
+print("✅ Model loaded and ready for battle!")
+
+# --- Data structures (OpenAI-like schema) ---
+class Message(BaseModel):
+    role: str
+    content: str
+
+class ChatCompletionRequest(BaseModel):
+    model: str = "default-model"
+    messages: list[Message]
+    max_tokens: int = 1024
+    temperature: float = 0.7
+    stream: bool = False
+
+# --- The ZeroGPU magic ---
+# The @spaces.GPU decorator ensures this function runs on the H200
+@spaces.GPU
+def generate_response(messages, max_tokens, temperature):
+    # Format the prompt (chat template)
+    text_prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+
+    inputs = tokenizer(text_prompt, return_tensors="pt").to(model.device)
+
+    # Generation settings
+    generate_kwargs = dict(
+        inputs,
+        max_new_tokens=max_tokens,
+        temperature=temperature,
+        do_sample=True,
+        top_p=0.9,
+    )
+
+    # Generate, then decode only the newly generated tokens
+    output = model.generate(**generate_kwargs)
+    response_text = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
+
+    return response_text
+
+# --- Endpoints ---
+
+@app.get("/")
+def read_root():
+    return {"status": "APIDOST is online", "hardware": "Nvidia H200 (ZeroGPU)"}
+
+@app.post("/v1/chat/completions")
+async def chat_completions(request: ChatCompletionRequest):
+    """
+    Simplified OpenAI-compatible endpoint.
+    """
+    try:
+        # Convert the Pydantic messages to a list of dicts
+        msgs = [{"role": m.role, "content": m.content} for m in request.messages]
+
+        # Call the GPU
+        response_content = generate_response(msgs, request.max_tokens, request.temperature)
+
+        # Format the response OpenAI-style
+        return {
+            "id": "chatcmpl-apidost",
+            "object": "chat.completion",
+            "created": 1234567890,
+            "model": request.model,
+            "choices": [{
+                "index": 0,
+                "message": {
+                    "role": "assistant",
+                    "content": response_content
+                },
+                "finish_reason": "stop"
+            }],
+            "usage": {
+                "prompt_tokens": 0,  # Implement real token counting if desired
+                "completion_tokens": 0,
+                "total_tokens": 0
+            }
+        }
+
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
+
+# To run locally or on Spaces via Docker
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=7860)
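
For reference, a minimal client sketch for exercising the new /v1/chat/completions route. This is an illustration, not part of the commit: the base URL assumes a local run on port 7860 (matching the uvicorn.run call above; substitute your Space's URL when deployed), and it uses the third-party requests library. The payload fields mirror the ChatCompletionRequest schema defined in app.py.

import requests

# Assumed base URL: local uvicorn run on port 7860 (see the __main__ block above).
BASE_URL = "http://localhost:7860"

payload = {
    "model": "default-model",
    "messages": [
        {"role": "user", "content": "Write a Python function that reverses a string."}
    ],
    "max_tokens": 256,
    "temperature": 0.7,
}

# POST to the OpenAI-style route and print the assistant's reply.
# Generous timeout: a 32B model on ZeroGPU can take a while per request.
resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()
print(resp.json()["choices"][0]["message"]["content"])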