GTee2 committed
Commit 8366339 · verified · 1 Parent(s): 299acab

Update app.py

Files changed (1)
  1. app.py +15 -30
app.py CHANGED
@@ -4,23 +4,18 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 from collections import defaultdict
 
-app = FastAPI(title="Mariza + Qwen3-0.6B CPU Free")
+app = FastAPI(title="Mariza + Qwen3-0.6B CPU Free FINAL")
 
-print("Loading Qwen3-0.6B (forcing the slow tokenizer to work around the tokenizers bug)")
+print("Loading Qwen3-0.6B with the updated transformers...")
 
-# <<< THE TWO MAGIC LINES >>>
-tokenizer = AutoTokenizer.from_pretrained(
-    "Qwen/Qwen3-0.6B",
-    trust_remote_code=True,
-    use_fast=False  # <- THIS LINE SAVES EVERYTHING
-)
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
 
 model = AutoModelForCausalLM.from_pretrained(
     "Qwen/Qwen3-0.6B",
     torch_dtype="auto",
     device_map="cpu",
     trust_remote_code=True,
     low_cpu_mem_usage=True
 )
 
 history_db = defaultdict(list)
@@ -28,7 +23,7 @@ MAX_CONTEXT_TOKENS = 3800
 
 @app.get("/")
 async def root():
-    return {"message": "Qwen3-0.6B alive and killing it on the free CPU, boss! (no tokenizers bug) 😈"}
+    return {"message": "Qwen3-0.6B running smooth on the free CPU with the new transformers, boss! 😈"}
 
 @app.post("/chat")
 async def chat(request: Request):
@@ -43,38 +38,28 @@ async def chat(request: Request):
         return JSONResponse({"error": "empty prompt, you rascal"})
 
     messages = history_db[user_id]
-    full_prompt = ""
-    for role, content in messages:
-        full_prompt += f"<|im_start|>{role}\n{content}<|im_end|>\n"
+    full_prompt = "".join([f"<|im_start|>{role}\n{content}<|im_end|>\n" for role, content in messages])
     full_prompt += f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n"
 
     inputs = tokenizer(full_prompt, return_tensors="pt", truncation=True, max_length=4096)
 
     if stream:
         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = {
-            "input_ids": inputs.input_ids,
-            "attention_mask": inputs.attention_mask,
-            "max_new_tokens": max_tokens,
-            "temperature": temperature,
-            "do_sample": True,
-            "top_p": 0.9,
-            "repetition_penalty": 1.1,
-            "streamer": streamer
-        }
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        return StreamingResponse(streamer, media_type="text/event-stream")
-    else:
-        outputs = model.generate(
+        generation_kwargs = dict(
             input_ids=inputs.input_ids,
             attention_mask=inputs.attention_mask,
             max_new_tokens=max_tokens,
             temperature=temperature,
             do_sample=True,
             top_p=0.9,
-            repetition_penalty=1.1
+            repetition_penalty=1.1,
+            streamer=streamer
         )
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        return StreamingResponse(streamer, media_type="text/event-stream")
+    else:
+        outputs = model.generate(**inputs, max_new_tokens=max_tokens, temperature=temperature, do_sample=True, top_p=0.9, repetition_penalty=1.1)
     resposta = tokenizer.decode(outputs[0], skip_special_tokens=True)
     resposta = resposta.split("<|im_start|>assistant\n")[-1].strip()
 
@@ -86,4 +71,4 @@ async def chat(request: Request):
 
     return JSONResponse({"response": resposta})
 
-print("Qwen3-0.6B loaded and ready to dominate WhatsApp 24/7 for free, boss! 🔥")
+print("Qwen3-0.6B loaded and ready to crush WhatsApp 24 hours a day for free! 🔥")
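A note on the prompt assembly above: with the stock fast tokenizer loading cleanly again, the hand-built ChatML strings could be replaced by the tokenizer's bundled chat template. A minimal sketch, assuming the history stays as (role, content) tuples the way history_db stores them; the sample history here is illustrative, not part of the commit:

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)

# history_db stores (role, content) tuples; the chat template expects dicts.
history = [("user", "hi"), ("assistant", "hello, boss!")]
messages = [{"role": role, "content": content} for role, content in history]
messages.append({"role": "user", "content": "how are you?"})

# tokenize=False returns the rendered ChatML string; add_generation_prompt=True
# appends the trailing "<|im_start|>assistant\n" that app.py adds by hand.
full_prompt = tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

This keeps the prompt format in lockstep with whatever template ships with the model checkpoint instead of hard-coding the tags.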
 
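On the streaming branch: it follows the standard transformers recipe, with generate() on a worker thread and TextIteratorStreamer consumed as a plain iterator. The same pattern, stripped of the FastAPI wiring, as a standalone sketch (the prompt and generation settings here are placeholders):

from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen3-0.6B", torch_dtype="auto", device_map="cpu"
)

inputs = tokenizer("<|im_start|>user\nhi<|im_end|>\n<|im_start|>assistant\n", return_tensors="pt")
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# generate() blocks until finished, so it runs on a worker thread while the
# main thread pulls decoded text chunks off the streamer as they arrive.
Thread(target=model.generate, kwargs=dict(**inputs, max_new_tokens=64, streamer=streamer)).start()
for chunk in streamer:
    print(chunk, end="", flush=True)

Handing the streamer straight to StreamingResponse works because it only needs an iterable of strings; each chunk becomes one piece of the event stream.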
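One caveat that survives this commit: the non-streaming branch recovers the reply by splitting the decoded text on "<|im_start|>assistant\n", but decode() is called with skip_special_tokens=True, and in the Qwen chat models <|im_start|> is registered as a special token, so the tag likely never appears in the decoded text and the split silently returns the whole transcript, prompt included. Slicing the prompt tokens off before decoding avoids depending on the tag; a sketch, assuming inputs and outputs as in the handler:

# outputs[0] is [prompt tokens | new tokens]; drop the prompt by length
# rather than searching the decoded string for a tag that may be stripped.
prompt_len = inputs.input_ids.shape[1]
resposta = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True).strip()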
 
 
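Finally, a quick smoke test for the endpoint. The JSON field names are assumed from the variables the handler unpacks (that part of app.py sits outside these hunks), and the URL is a hypothetical local address for the Space; adjust both as needed:

import requests

resp = requests.post(
    "http://localhost:7860/chat",  # hypothetical; use the Space's public URL when deployed
    json={
        "user_id": "demo",   # keys assumed from the handler's variable names
        "prompt": "hi, Mariza!",
        "max_tokens": 128,
        "temperature": 0.7,
        "stream": False,
    },
)
print(resp.json()["response"])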