GTee2 committed on
Commit
7efd208
·
verified ·
1 Parent(s): 40bdbb6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +19 -12
app.py CHANGED
@@ -6,9 +6,9 @@ from collections import defaultdict
6
 
7
  app = FastAPI(title="Mariza Koller 1.5B - CPU Free 4bit 🔥")
8
 
9
- print("🔥 Carregando Qwen2-1.5B em 4-bit na CPU (agora SIM funciona no HF free!)")
10
 
11
- # Configuração 4-bit que roda na CPU do Spaces
12
  quantization_config = BitsAndBytesConfig(
13
  load_in_4bit=True,
14
  bnb_4bit_quant_type="nf4",
@@ -16,22 +16,26 @@ quantization_config = BitsAndBytesConfig(
16
  bnb_4bit_use_double_quant=True,
17
  )
18
 
19
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct", trust Tom's_remote_code=True)
 
 
 
 
20
  model = AutoModelForCausalLM.from_pretrained(
21
- "Qwen/Qwen2-1.5B-Instruct2",
22
  device_map="cpu",
23
  quantization_config=quantization_config,
24
  trust_remote_code=True,
25
  low_cpu_mem_usage=True
26
  )
27
 
28
- # Cache de conversa
29
  history_db = defaultdict(list)
30
  MAX_CONTEXT_TOKENS = 3500
31
 
32
  @app.get("/")
33
  async def root():
34
- return {"message": "Mariza 1.5B 4-bit rodando quente na CPU free, chefe! 😈"}
35
 
36
  @app.post("/chat")
37
  async def chat(request: Request):
@@ -43,9 +47,9 @@ async def chat(request: Request):
43
  stream = data.get("stream", False)
44
 
45
  if not prompt:
46
- return JSONResponse({"error": "manda prompt direito, safado"})
47
 
48
- # Monta histórico
49
  messages = history_db[user_id]
50
  full_prompt = ""
51
  for role, content in messages:
@@ -69,25 +73,28 @@ async def chat(request: Request):
69
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
70
  thread.start()
71
  return StreamingResponse(streamer, media_type="text/event-stream")
 
72
  else:
73
  outputs = model.generate(
74
- **inputs,
 
75
  max_new_tokens=max_tokens,
76
  temperature=temperature,
77
  do_sample=True,
78
- top_p=0.9,
79
  repetition_penalty=1.1
80
  )
81
  resposta = tokenizer.decode(outputs[0], skip_special_tokens=True)
82
  resposta = resposta.split("<|im_start|>assistant")[-1].strip()
83
 
 
84
  messages.append(("user", prompt))
85
  messages.append(("assistant", resposta))
86
 
87
- # Limpa histórico se ficar grande
88
  while sum(len(tokenizer.encode(c[1])) for c in messages) > MAX_CONTEXT_TOKENS:
89
  messages.pop(0)
90
 
91
  return JSONResponse({"response": resposta})
92
 
93
- print("✅ Modelo carregado! Mariza tá pronta pra dominar o WhatsApp 😏")
 
6
 
7
  app = FastAPI(title="Mariza Koller 1.5B - CPU Free 4bit 🔥")
8
 
9
+ print("🔥 Carregando Qwen2-1.5B em 4-bit na CPU... (vai levar 3-5 min na primeira vez)")
10
 
11
+ # Config 4-bit que funciona na CPU do HF Spaces free
12
  quantization_config = BitsAndBytesConfig(
13
  load_in_4bit=True,
14
  bnb_4bit_quant_type="nf4",
 
16
  bnb_4bit_use_double_quant=True,
17
  )
18
 
19
+ tokenizer = AutoTokenizer.from_pretrained(
20
+ "Qwen/Qwen2-1.5B-Instruct",
21
+ trust_remote_code=True
22
+ )
23
+
24
  model = AutoModelForCausalLM.from_pretrained(
25
+ "Qwen/Qwen2-1.5B-Instruct",
26
  device_map="cpu",
27
  quantization_config=quantization_config,
28
  trust_remote_code=True,
29
  low_cpu_mem_usage=True
30
  )
31
 
32
+ # Cache de conversa por usuário
33
  history_db = defaultdict(list)
34
  MAX_CONTEXT_TOKENS = 3500
35
 
36
  @app.get("/")
37
  async def root():
38
+ return {"message": "Mariza 1.5B 4-bit viva e quente na CPU free, chefe! 😈"}
39
 
40
  @app.post("/chat")
41
  async def chat(request: Request):
 
47
  stream = data.get("stream", False)
48
 
49
  if not prompt:
50
+ return JSONResponse({"error": "prompt vazio, safado"})
51
 
52
+ # Monta histórico no formato Qwen2
53
  messages = history_db[user_id]
54
  full_prompt = ""
55
  for role, content in messages:
 
73
  thread = Thread(target=model.generate, kwargs=generation_kwargs)
74
  thread.start()
75
  return StreamingResponse(streamer, media_type="text/event-stream")
76
+
77
  else:
78
  outputs = model.generate(
79
+ input_ids=inputs.input_ids,
80
+ attention_mask=inputs.attention_mask,
81
  max_new_tokens=max_tokens,
82
  temperature=temperature,
83
  do_sample=True,
84
+ top_p": 0.9,
85
  repetition_penalty=1.1
86
  )
87
  resposta = tokenizer.decode(outputs[0], skip_special_tokens=True)
88
  resposta = resposta.split("<|im_start|>assistant")[-1].strip()
89
 
90
+ # Salva histórico
91
  messages.append(("user", prompt))
92
  messages.append(("assistant", resposta))
93
 
94
+ # Limpa se ficar grande demais
95
  while sum(len(tokenizer.encode(c[1])) for c in messages) > MAX_CONTEXT_TOKENS:
96
  messages.pop(0)
97
 
98
  return JSONResponse({"response": resposta})
99
 
100
+ print("✅ Qwen2-1.5B carregado com sucesso! Mariza tá pronta pra dominar o zap 😏")