GTee2 committed
Commit dd95682 · verified · 1 Parent(s): 2284eab

Update app.py

Files changed (1)
  1. app.py +11 -17
app.py CHANGED
@@ -1,36 +1,29 @@
  from fastapi import FastAPI, Request
  from fastapi.responses import JSONResponse, StreamingResponse
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
+ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
  from threading import Thread
  from collections import defaultdict

- app = FastAPI(title="Mariza Koller 1.5B - CPU Free 4bit")
+ app = FastAPI(title="Mariza + Qwen3-0.6B CPU Free")

- print("Carregando Qwen2-1.5B em 4-bit na CPU... (3-6 min na primeira vez)")
+ print("Carregando Qwen3-0.6B em fp16 puro na CPU... (2-4 min na primeira vez)")

- quantization_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_compute_dtype="float16",
-     bnb_4bit_use_double_quant=True,
- )
-
- tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-1.5B-Instruct", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)

  model = AutoModelForCausalLM.from_pretrained(
-     "Qwen/Qwen2-1.5B-Instruct",
+     "Qwen/Qwen3-0.6B",
+     torch_dtype="auto",  # let transformers pick fp16/float16
      device_map="cpu",
-     quantization_config=quantization_config,
      trust_remote_code=True,
      low_cpu_mem_usage=True
  )

  history_db = defaultdict(list)
- MAX_CONTEXT_TOKENS = 3500
+ MAX_CONTEXT_TOKENS = 3800

  @app.get("/")
  async def root():
-     return {"message": "Mariza 1.5B 4-bit viva e quente na CPU free, chefe! 😈"}
+     return {"message": "Qwen3-0.6B vivo e quente na CPU free, chefe! Sem quantização, sem dor de cabeça 😈"}

  @app.post("/chat")
  async def chat(request: Request):
@@ -42,8 +35,9 @@ async def chat(request: Request):
      stream = data.get("stream", False)

      if not prompt:
-         return JSONResponse({"error": "prompt vazio, safado"})
+         return JSONResponse({"error": "prompt vazio, seu safado"})

+     # Build the conversation history
      messages = history_db[user_id]
      full_prompt = ""
      for role, content in messages:
@@ -88,4 +82,4 @@ async def chat(request: Request):

      return JSONResponse({"response": resposta})

- print("Qwen2-1.5B carregado com sucesso! Mariza tá pronta pra foder o WhatsApp inteiro 😈")
+ print("Qwen3-0.6B carregado! Pode mandar o zap que Mariza tá pronta pra responder 24/7 😏")