GTee2 committed
Commit 40bdbb6 · verified · 1 Parent(s): a909106

Update app.py

Files changed (1)
  1. app.py +28 -25
app.py CHANGED
@@ -1,29 +1,37 @@
  from fastapi import FastAPI, Request
  from fastapi.responses import JSONResponse, StreamingResponse
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
  from threading import Thread
  from collections import defaultdict
- import torch

- app = FastAPI(title="Mariza Koller 1.5B - API with Memory 😈")
+ app = FastAPI(title="Mariza Koller 1.5B - CPU Free 4-bit 🔥")

- print("🔥 Loading Qwen2-1.5B-Instruct in int8 on the CPU... (hang in there, 2-3 min the first time)")
+ print("🔥 Loading Qwen2-1.5B in 4-bit on the CPU (NOW it actually works on the HF free tier!)")
+
+ # 4-bit configuration that runs on the Spaces CPU
+ quantization_config = BitsAndBytesConfig(
+     load_in_4bit=True,
+     bnb_4bit_quant_type="nf4",
+     bnb_4bit_compute_dtype="float16",
+     bnb_4bit_use_double_quant=True,
+ )
+
  tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct", trust_remote_code=True)
  model = AutoModelForCausalLM.from_pretrained(
      "Qwen/Qwen2-1.5B-Instruct",
      device_map="cpu",
-     load_in_8bit=True,
-     torch_dtype=torch.float16,
-     trust_remote_code=True
+     quantization_config=quantization_config,
+     trust_remote_code=True,
+     low_cpu_mem_usage=True
  )

- # In-memory conversation cache: {user_id: list of messages}
+ # Conversation cache
  history_db = defaultdict(list)
  MAX_CONTEXT_TOKENS = 3500

  @app.get("/")
  async def root():
-     return {"message": "Mariza 1.5B alive and hot on the CPU, boss! 😏 send a POST to /chat"}
+     return {"message": "Mariza 1.5B 4-bit running hot on the free CPU, boss! 😈"}

  @app.post("/chat")
  async def chat(request: Request):
@@ -35,9 +43,9 @@ async def chat(request: Request):
      stream = data.get("stream", False)

      if not prompt:
-         return JSONResponse({"error": "empty prompt, you rascal"})
+         return JSONResponse({"error": "send a proper prompt, you rascal"})

-     # Build the history in Qwen2 format
+     # Build the history
      messages = history_db[user_id]
      full_prompt = ""
      for role, content in messages:
@@ -60,16 +68,10 @@ async def chat(request: Request):
          }
          thread = Thread(target=model.generate, kwargs=generation_kwargs)
          thread.start()
-
-         def generate():
-             for new_text in streamer:
-                 yield new_text
-         return StreamingResponse(generate(), media_type="text/event-stream")
-
+         return StreamingResponse(streamer, media_type="text/event-stream")
      else:
          outputs = model.generate(
-             input_ids=inputs.input_ids,
-             attention_mask=inputs.attention_mask,
+             **inputs,
              max_new_tokens=max_tokens,
              temperature=temperature,
              do_sample=True,
@@ -79,12 +81,13 @@ async def chat(request: Request):
          resposta = tokenizer.decode(outputs[0], skip_special_tokens=True)
          resposta = resposta.split("<|im_start|>assistant")[-1].strip()

-         # Save to the history
          messages.append(("user", prompt))
          messages.append(("assistant", resposta))

-         # Drop old history once it exceeds the limit
-         while sum(len(tokenizer.encode(m[1])) for m in messages) > MAX_CONTEXT_TOKENS:
+         # Trim the history if it gets too big
+         while sum(len(tokenizer.encode(c[1])) for c in messages) > MAX_CONTEXT_TOKENS:
              messages.pop(0)

-         return JSONResponse({"response": resposta, "user_id": user_id})
+         return JSONResponse({"response": resposta})
+
+ print("✅ Model loaded! Mariza is ready to take over WhatsApp 😏")