GTee2 committed
Commit 8a4701f · verified · 1 Parent(s): 8366339

Update app.py

Files changed (1)
app.py +20 -12
app.py CHANGED
@@ -4,7 +4,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 from collections import defaultdict
 
-app = FastAPI(title="Mariza + Qwen3-0.6B CPU Free FINAL")
+app = FastAPI(title="Mariza + Qwen3-0.6B CPU Free")
 
 print("Loading Qwen3-0.6B with updated transformers...")
 
@@ -15,7 +15,7 @@ model = AutoModelForCausalLM.from_pretrained(
     torch_dtype="auto",
     device_map="cpu",
     trust_remote_code=True,
-    low_cpu_mem_usage=True=True
+    low_cpu_mem_usage=True
 )
 
 history_db = defaultdict(list)
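
Note on the hunk above: "low_cpu_mem_usage=True=True" is not just a bad flag, it is a SyntaxError (a keyword argument cannot be assigned twice), so the old file failed before the server even started. A minimal sketch of the corrected load as a standalone script, assuming the model id "Qwen/Qwen3-0.6B" (the actual id sits outside the lines shown in this hunk):

    # Sketch only: the corrected from_pretrained call from this commit,
    # lifted out of app.py. "Qwen/Qwen3-0.6B" is an assumption; the real
    # model id is outside the diff context.
    from transformers import AutoModelForCausalLM, AutoTokenizer

    MODEL_ID = "Qwen/Qwen3-0.6B"

    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",       # let the checkpoint config pick the dtype
        device_map="cpu",         # the free Space has no GPU
        trust_remote_code=True,
        low_cpu_mem_usage=True,   # the fixed flag; "True=True" was a SyntaxError
    )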
@@ -23,7 +23,7 @@ MAX_CONTEXT_TOKENS = 3800
 
 @app.get("/")
 async def root():
-    return {"message": "Qwen3-0.6B running smooth on the free CPU with the new transformers, boss! 😈"}
+    return {"message": "Qwen3-0.6B running perfectly on the free CPU, boss! 😈"}
 
 @app.post("/chat")
 async def chat(request: Request):
@@ -45,21 +45,29 @@ async def chat(request: Request):
 
     if stream:
         streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
-        generation_kwargs = dict(
-            input_ids=inputs.input_ids,
-            attention_mask=inputs.attention_mask,
-            max_new_tokens=max_tokens,
-            temperature=temperature,
-            do_sample=True,
-            top_p=0.9,
-            repetition_penalty=1.1,
-            streamer=streamer
-        )
-        thread = Thread(target=model.generate, kwargs=generation_kwargs)
-        thread.start()
-        return StreamingResponse(streamer, media_type="text/event-stream")
-    else:
-        outputs = model.generate(**inputs, max_new_tokens=max_tokens, temperature=temperature, do_sample=True, top_p=0.9, repetition_penalty=1.1)
+        generation_kwargs = {
+            "input_ids": inputs.input_ids,
+            "attention_mask": inputs.attention_mask,
+            "max_new_tokens": max_tokens,
+            "temperature": temperature,
+            "do_sample": True,
+            "top_p": 0.9,
+            "repetition_penalty": 1.1,
+            "streamer": streamer
+        }
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        return StreamingResponse(streamer, media_type="text/event-stream")
+    else:
+        outputs = model.generate(
+            input_ids=inputs.input_ids,
+            attention_mask=inputs.attention_mask,
+            max_new_tokens=max_tokens,
+            temperature=temperature,
+            do_sample=True,
+            top_p=0.9,
+            repetition_penalty=1.1
+        )
     resposta = tokenizer.decode(outputs[0], skip_special_tokens=True)
    resposta = resposta.split("<|im_start|>assistant\n")[-1].strip()
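
This hunk is the heart of the commit: model.generate() blocks until generation finishes, so the streaming branch runs it on a worker thread and hands the TextIteratorStreamer, an ordinary Python iterator, to StreamingResponse. A self-contained sketch of that pattern outside FastAPI (model id assumed as in the previous sketch, prompt invented for illustration):

    # Sketch of the thread-plus-streamer pattern used above: generate()
    # runs on a worker thread while TextIteratorStreamer yields text
    # chunks on this one.
    from threading import Thread
    from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")
    model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-0.6B", device_map="cpu")

    inputs = tokenizer("Hello!", return_tensors="pt")
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    thread = Thread(target=model.generate, kwargs={
        "input_ids": inputs.input_ids,
        "attention_mask": inputs.attention_mask,
        "max_new_tokens": 64,
        "streamer": streamer,
    })
    thread.start()

    # The streamer is a plain iterator, which is exactly why the endpoint
    # can pass it straight to StreamingResponse.
    for chunk in streamer:
        print(chunk, end="", flush=True)
    thread.join()

Because StreamingResponse iterates the streamer, the client starts receiving text as soon as the first tokens are decoded instead of waiting for the full completion.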
 
@@ -71,4 +79,4 @@ async def chat(request: Request):
 
     return JSONResponse({"response": resposta})
 
-print("Qwen3-0.6B loaded and ready to wreck WhatsApp 24h a day for free! 🔥")
+print("Qwen3-0.6B loaded and ready to dominate WhatsApp for free 24/7! 🔥")
 