Madras1 committed
Commit 99622cd · verified · 1 parent: 79e5322

Update app.py

Files changed (1)
  1. app.py +51 -75
app.py CHANGED
@@ -1,101 +1,77 @@
-import os
-import torch
-from fastapi import FastAPI, HTTPException
-from pydantic import BaseModel
-from transformers import AutoModelForCausalLM, AutoTokenizer
+import gradio as gr
 import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+from threading import Thread

 # --- Configuration ---
 MODEL_ID = "Qwen/Qwen2.5-Coder-32B-Instruct"

-# Global variables initialized as None (empty)
+print(f"⏳ Starting lazy loading for {MODEL_ID}...")
+
+# Global variables for caching the model
 model = None
 tokenizer = None

-app = FastAPI(title="APIDOST - Gabriel's Router")
-
-# --- Data structures ---
-class Message(BaseModel):
-    role: str
-    content: str
-
-class ChatCompletionRequest(BaseModel):
-    model: str = "default-model"
-    messages: list[Message]
-    max_tokens: int = 1024
-    temperature: float = 0.7
-
-# --- The ZeroGPU magic, with lazy loading ---
-# duration=120 guarantees 2 minutes of GPU, enough time for load + inference
-@spaces.GPU(duration=120)
-def generate_response(messages, max_tokens, temperature):
+def load_model():
     global model, tokenizer
-
-    # The key trick: only load if it is not already in memory
     if model is None:
-        print(f"Cold Start: loading {MODEL_ID} into VRAM...")
+        print("🚀 Loading model into VRAM (Cold Start)...")
         tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
         model = AutoModelForCausalLM.from_pretrained(
             MODEL_ID,
             torch_dtype=torch.bfloat16,
-            device_map="cuda"  # Force use of the GPU allocated by spaces
+            device_map="cuda"  # @spaces.GPU guarantees 'cuda' is the H200
         )
-        print("✅ Model loaded successfully!")
+        print("✅ Model loaded!")
+    return model, tokenizer
+
+# --- The ZeroGPU magic function ---
+@spaces.GPU(duration=120)
+def respond(message, history):
+    # Load the model only once the GPU has been allocated
+    model, tokenizer = load_model()
+
+    # Format the history into Qwen's chat structure
+    messages = []
+    for user_msg, bot_msg in history:
+        if user_msg: messages.append({"role": "user", "content": user_msg})
+        if bot_msg: messages.append({"role": "assistant", "content": bot_msg})
+
+    messages.append({"role": "user", "content": message})

     # Prepare the prompt
-    text_prompt = tokenizer.apply_chat_template(
-        messages,
-        tokenize=False,
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
         add_generation_prompt=True
     )

-    inputs = tokenizer(text_prompt, return_tensors="pt").to(model.device)
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

-    # Generate
-    output = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        temperature=temperature,
-        do_sample=True,
-        top_p=0.9,
+    # Generation settings
+    generated_ids = model.generate(
+        **model_inputs,
+        max_new_tokens=1024,
+        temperature=0.7,
+        do_sample=True
     )

-    response_text = tokenizer.decode(output[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
-    return response_text
-
-# --- Endpoints ---
-@app.get("/")
-def read_root():
-    return {"status": "APIDOST is online", "mode": "Lazy Loading Active"}
-
-@app.post("/v1/chat/completions")
-async def chat_completions(request: ChatCompletionRequest):
-    try:
-        # Convert the messages
-        msgs = [{"role": m.role, "content": m.content} for m in request.messages]
-
-        # Call the function protected by @spaces.GPU
-        response_content = generate_response(msgs, request.max_tokens, request.temperature)
-
-        return {
-            "id": "chatcmpl-apidost",
-            "object": "chat.completion",
-            "created": 1234567890,
-            "model": request.model,
-            "choices": [{
-                "index": 0,
-                "message": {
-                    "role": "assistant",
-                    "content": response_content
-                },
-                "finish_reason": "stop"
-            }]
-        }

-    except Exception as e:
-        print(f"❌ Error: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
+# --- Gradio interface ---
+demo = gr.ChatInterface(
+    respond,
+    title="APIDOST - Qwen 2.5 Coder (H200 Powered)",
+    description="Running on Hugging Face ZeroGPU. Use via API or Chat.",
+    examples=["Write a Python script for a snake game.", "Explain the theory of relativity."],
+)

 if __name__ == "__main__":
-    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=7860)
+    demo.launch()
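Note: the new file imports TextIteratorStreamer and Thread but never uses them, which hints at a planned streaming variant of respond(). A minimal sketch of what that could look like, reusing load_model() from this commit; the function name respond_streaming and the single-turn prompt handling are illustrative, not part of the committed code:

from threading import Thread

import spaces
from transformers import TextIteratorStreamer

@spaces.GPU(duration=120)
def respond_streaming(message, history):
    # Same cached-load pattern as respond()
    model, tokenizer = load_model()

    # History formatting elided; a single-turn prompt keeps the sketch short
    messages = [{"role": "user", "content": message}]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

    # The streamer yields decoded text as generate() produces tokens;
    # skip_prompt=True keeps the input prompt out of the output stream
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    # generate() blocks, so run it in a background thread and consume the stream here
    Thread(target=model.generate, kwargs=dict(
        **model_inputs,
        streamer=streamer,
        max_new_tokens=1024,
        temperature=0.7,
        do_sample=True,
    )).start()

    # gr.ChatInterface treats a generator function as a streaming response:
    # each yield replaces the partial message shown in the chat window
    partial = ""
    for chunk in streamer:
        partial += chunk
        yield partial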
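And since the description advertises "Use via API or Chat", callers can reach the endpoint that gr.ChatInterface exposes through gradio_client. A minimal sketch; the Space id below is a placeholder, since the commit does not reveal it:

from gradio_client import Client

# Placeholder Space id; substitute the actual "username/space-name"
client = Client("Madras1/your-space-name")

# gr.ChatInterface registers its endpoint under api_name="/chat"
result = client.predict(
    "Write a Python script for a snake game.",
    api_name="/chat",
)
print(result)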