Madras1 committed
Commit 4e16522 · verified · 1 Parent(s): 5a91f27

Update app.py

Files changed (1): app.py +33 -42
app.py CHANGED
@@ -2,27 +2,19 @@ import os
 import torch
 from fastapi import FastAPI, HTTPException
 from pydantic import BaseModel
-from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
-from threading import Thread
+from transformers import AutoModelForCausalLM, AutoTokenizer
 import spaces
 
 # --- Configuration ---
-# Let's start with a powerful model that fits comfortably on the H200
-MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct"
-# Or, if you want something lighter: "meta-llama/Llama-3.1-8B-Instruct"
+MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct"
 
-app = FastAPI(title="APIDOST - Gabriel's Router")
+# Global variables initialized as None (empty)
+model = None
+tokenizer = None
 
-print(f"🔄 Loading model: {MODEL_ID}...")
-tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-model = AutoModelForCausalLM.from_pretrained(
-    MODEL_ID,
-    torch_dtype=torch.bfloat16,
-    device_map="auto"
-)
-print("✅ Model loaded and ready for battle!")
+app = FastAPI(title="APIDOST - Gabriel's Router")
 
-# --- Data Structures (OpenAI-like schema) ---
+# --- Data Structures ---
 class Message(BaseModel):
     role: str
     content: str
@@ -32,13 +24,25 @@ class ChatCompletionRequest(BaseModel):
     messages: list[Message]
     max_tokens: int = 1024
     temperature: float = 0.7
-    stream: bool = False
 
-# --- The ZeroGPU Magic ---
-# The @spaces.GPU decorator guarantees this function runs on the H200
-@spaces.GPU
+# --- The ZeroGPU Magic, with Lazy Loading ---
+# duration=120 guarantees 2 minutes of GPU, enough time for load + inference
+@spaces.GPU(duration=120)
 def generate_response(messages, max_tokens, temperature):
-    # Format the prompt (chat template)
+    global model, tokenizer
+
+    # THE KEY TRICK: only load if it is not already in memory
+    if model is None:
+        print(f"⏳ Cold start: loading {MODEL_ID} into VRAM...")
+        tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+        model = AutoModelForCausalLM.from_pretrained(
+            MODEL_ID,
+            torch_dtype=torch.bfloat16,
+            device_map="cuda"  # Force the GPU allocated by spaces
+        )
+        print("✅ Model loaded successfully!")
+
+    # Prepare the prompt
     text_prompt = tokenizer.apply_chat_template(
         messages,
         tokenize=False,
@@ -47,40 +51,32 @@ def generate_response(messages, max_tokens, temperature):
 
     inputs = tokenizer(text_prompt, return_tensors="pt").to(model.device)
 
-    # Generation settings
-    generate_kwargs = dict(
-        inputs,
+    # Generate
+    output = model.generate(
+        **inputs,
         max_new_tokens=max_tokens,
         temperature=temperature,
         do_sample=True,
         top_p=0.9,
     )
-
-    # Generate the response
-    output = model.generate(**generate_kwargs)
-    response_text = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
 
+    response_text = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
     return response_text
 
 # --- Endpoints ---
-
 @app.get("/")
 def read_root():
-    return {"status": "APIDOST is online", "hardware": "Nvidia H200 (ZeroGPU)"}
+    return {"status": "APIDOST is online", "mode": "Lazy Loading Active"}
 
 @app.post("/v1/chat/completions")
 async def chat_completions(request: ChatCompletionRequest):
-    """
-    Simplified OpenAI-compatible endpoint.
-    """
     try:
-        # Convert the Pydantic messages into a list of dicts
+        # Convert messages
        msgs = [{"role": m.role, "content": m.content} for m in request.messages]
 
-        # Call the GPU
+        # Call the function protected by @spaces.GPU
        response_content = generate_response(msgs, request.max_tokens, request.temperature)
 
-        # Format the OpenAI-style response
        return {
            "id": "chatcmpl-apidost",
            "object": "chat.completion",
@@ -93,18 +89,13 @@ async def chat_completions(request: ChatCompletionRequest):
                    "content": response_content
                },
                "finish_reason": "stop"
-            }],
-            "usage": {
-                "prompt_tokens": 0,  # Implement real counting if you want
-                "completion_tokens": 0,
-                "total_tokens": 0
-            }
+            }]
        }
 
     except Exception as e:
+        print(f"❌ Error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
 
-# To run locally or on Spaces via Docker
 if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=7860)
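
For reference, calling the updated endpoint from a client is a plain OpenAI-style POST. A minimal sketch, assuming the Space is reachable at a placeholder `BASE_URL` (`http://localhost:7860` for a local run, matching the `uvicorn.run` line above); the path and JSON fields come from `ChatCompletionRequest` in app.py, everything else is illustrative:

```python
import requests

# Placeholder: substitute your Space's public URL, or localhost for a local run
BASE_URL = "http://localhost:7860"

payload = {
    # Fields defined by ChatCompletionRequest in app.py
    "messages": [
        {"role": "user", "content": "Write a Python one-liner that reverses a string."}
    ],
    "max_tokens": 256,
    "temperature": 0.7,
}

resp = requests.post(f"{BASE_URL}/v1/chat/completions", json=payload, timeout=300)
resp.raise_for_status()

# The endpoint returns an OpenAI-style body; the text is in choices[0].message.content
print(resp.json()["choices"][0]["message"]["content"])
```

The first call pays the cold-start cost of pulling the 32B model into VRAM; subsequent calls in the same session should hit the `model is None` check and skip the reload.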
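Separately, the commit drops the hard-coded `usage` block whose old comment said "implement real counting if you want". If that field ever comes back, real counts can be derived from tensors that already exist inside `generate_response`; a minimal sketch under that assumption (the helper name `count_usage` is hypothetical):

```python
def count_usage(inputs, output) -> dict:
    # `inputs` is the BatchEncoding from tokenizer(...); `output` is the tensor
    # returned by model.generate() -- both already exist in generate_response.
    prompt_tokens = inputs.input_ids.shape[1]         # tokens in the prompt
    total_tokens = output.shape[1]                    # prompt + generated
    completion_tokens = total_tokens - prompt_tokens  # newly generated only
    return {
        "prompt_tokens": prompt_tokens,
        "completion_tokens": completion_tokens,
        "total_tokens": total_tokens,
    }
```

Since `inputs` and `output` only live inside `generate_response`, the function would also need to return these counts alongside the text for the endpoint to include them in the response.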