GTee2 committed
Commit 9e4e121 · verified · Parent: 7efd208

Update app.py

Files changed (1): app.py (+6 -15)
app.py CHANGED
@@ -4,11 +4,10 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 from collections import defaultdict
 
-app = FastAPI(title="Mariza Koller 1.5B - CPU Free 4bit 🔥")
+app = FastAPI(title="Mariza Koller 1.5B - CPU Free 4bit")
 
-print("🔥 Loading Qwen2-1.5B in 4-bit on CPU... (takes 3-5 min the first time)")
+print("Loading Qwen2-1.5B in 4-bit on CPU... (3-6 min the first time)")
 
-# 4-bit config that works on the free HF Spaces CPU
 quantization_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
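Note: the hunk above shows only part of the BitsAndBytesConfig; one field between the quant type and double quantization is elided by the diff. Below is a minimal sketch of such a config, assuming the hidden line sets bnb_4bit_compute_dtype (a common companion setting); be aware that bitsandbytes' 4-bit kernels have historically required a CUDA GPU, so whether this actually loads on a CPU-only free Space depends on the installed bitsandbytes build.

```python
# Sketch only: bnb_4bit_compute_dtype is an ASSUMPTION for the elided line;
# torch.float32 is a guess for a CPU target, not taken from the commit.
import torch
from transformers import BitsAndBytesConfig

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="nf4",              # NormalFloat4 quantization
    bnb_4bit_compute_dtype=torch.float32,   # ASSUMED: dtype used for matmuls
    bnb_4bit_use_double_quant=True,         # also quantize the scaling constants
)
```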
@@ -16,10 +15,7 @@ quantization_config = BitsAndBytesConfig(
     bnb_4bit_use_double_quant=True,
 )
 
-tokenizer = AutoTokenizer.from_pretrained(
-    "Qwen/Qwen2-1.5B-Instruct",
-    trust_remote_code=True
-)
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct", trust_remote_code=True)
 
 model = AutoModelForCausalLM.from_pretrained(
     "Qwen/Qwen2-1.5B-Instruct",
@@ -29,7 +25,6 @@ model = AutoModelForCausalLM.from_pretrained(
     low_cpu_mem_usage=True
 )
 
-# Per-user conversation cache
 history_db = defaultdict(list)
 MAX_CONTEXT_TOKENS = 3500
 
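The per-user store is a plain in-process defaultdict: each new user_id gets an empty history on first access, but everything is lost on restart and nothing bounds the number of users. A tiny sketch of the behavior:

```python
from collections import defaultdict

history_db = defaultdict(list)
history_db["user-42"].append(("user", "oi"))  # first access auto-creates the list
print(len(history_db["user-42"]))   # 1
print(len(history_db["new-user"]))  # 0 -- note this lookup *also* creates an entry
```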
 
@@ -49,7 +44,6 @@ async def chat(request: Request):
     if not prompt:
         return JSONResponse({"error": "empty prompt, you rascal"})
 
-    # Build the history in Qwen2 format
     messages = history_db[user_id]
     full_prompt = ""
     for role, content in messages:
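The deleted comment said this loop assembles the history in "Qwen2 format", i.e. ChatML, but the loop body falls outside the hunk. For reference, a sketch of what such assembly typically looks like, plus the tokenizer's built-in chat template, which avoids hand-rolling the markers (both are illustrations, not the commit's actual code; `tokenizer`, `messages`, and `prompt` come from the surrounding app):

```python
# Hand-rolled ChatML, matching what the loop presumably does (ASSUMED):
full_prompt = ""
for role, content in messages:
    full_prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
full_prompt += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"

# Equivalent via the tokenizer's chat template:
chat = [{"role": r, "content": c} for r, c in messages]
chat.append({"role": "user", "content": prompt})
full_prompt = tokenizer.apply_chat_template(chat, tokenize=False, add_generation_prompt=True)
```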
@@ -73,7 +67,6 @@ async def chat(request: Request):
         thread = Thread(target=model.generate, kwargs=generation_kwargs)
         thread.start()
         return StreamingResponse(streamer, media_type="text/event-stream")
-
     else:
         outputs = model.generate(
             input_ids=inputs.input_ids,
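The streaming branch follows the standard transformers pattern: generate() runs in a background thread while TextIteratorStreamer yields decoded text as tokens are produced. A self-contained sketch of that pattern (the contents of generation_kwargs are assumed; the commit does not show them):

```python
from threading import Thread
from transformers import TextIteratorStreamer

streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generation_kwargs = dict(
    input_ids=inputs.input_ids,  # ASSUMED: same inputs as the non-streaming branch
    max_new_tokens=max_tokens,
    streamer=streamer,
)
Thread(target=model.generate, kwargs=generation_kwargs).start()
for chunk in streamer:           # yields text pieces as generate() produces them
    print(chunk, end="", flush=True)
```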
@@ -81,20 +74,18 @@ async def chat(request: Request):
             max_new_tokens=max_tokens,
             temperature=temperature,
             do_sample=True,
-            top_p": 0.9,
+            top_p=0.9,
             repetition_penalty=1.1
         )
         resposta = tokenizer.decode(outputs[0], skip_special_tokens=True)
-        resposta = resposta.split("<|im_start|>assistant")[-1].strip()
+        resposta = resposta.split("<|im_start|>assistant\n")[-1].strip()
 
-        # Save the history
         messages.append(("user", prompt))
         messages.append(("assistant", resposta))
 
-        # Trim if it gets too big
         while sum(len(tokenizer.encode(c[1])) for c in messages) > MAX_CONTEXT_TOKENS:
             messages.pop(0)
 
         return JSONResponse({"response": resposta})
 
-print("Qwen2-1.5B loaded successfully! Mariza is ready to dominate WhatsApp 😏")
+print("Qwen2-1.5B loaded successfully! Mariza is ready to take over all of WhatsApp 😈")
 