Commit 693f9ea (verified) by yuhueng · Parent(s): 9c16eb5

Updated parameters to model/tokenizer

Files changed (1): app.py (+11 −6)
app.py CHANGED
@@ -1,6 +1,6 @@
 import spaces
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer
 import torch
 
 MODEL_ID = "yuhueng/qwen3-4b-singlish-base" # replace with your model
@@ -11,16 +11,21 @@ model = AutoModelForCausalLM.from_pretrained(
     torch_dtype=torch.float16,
 )
 
+text = tokenizer.apply_chat_template(
+    messages,
+    tokenize = False,
+    add_generation_prompt = True, # Must add for generation
+)
+
 @spaces.GPU(duration=120)
 def inference(prompt: str, max_tokens: int = 256) -> str:
     model.to("cuda")  # Move to GPU inside decorated function
     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
     outputs = model.generate(
-        **inputs,
-        max_new_tokens=max_tokens,
-        do_sample=True,
-        temperature=0.8,
-        pad_token_id=tokenizer.eos_token_id
+        **tokenizer(text, return_tensors = "pt").to("cuda"),
+        max_new_tokens = 100, # Increase for longer outputs!
+        temperature = 0.7, top_p = 0.8, top_k = 20, # For non thinking
+        streamer = TextStreamer(tokenizer, skip_prompt = True),
     )
     return tokenizer.decode(outputs[0], skip_special_tokens=True)
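
As committed, this version has two loose ends visible in the diff itself: the module-level `apply_chat_template` call references `messages`, which is not defined in any hunk shown here, and `inference` no longer uses its `prompt` or `max_tokens` arguments (the old `inputs = tokenizer(prompt, ...)` line becomes dead code, since `generate` re-tokenizes the module-level `text`). Below is a minimal sketch of how the chat-template step could be folded into the GPU-decorated function instead. The single-turn `messages` structure, the restored `do_sample=True` and `pad_token_id`, and the Gradio wiring at the bottom are assumptions on my part, not part of this commit; the sampling values are the ones committed here.

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

MODEL_ID = "yuhueng/qwen3-4b-singlish-base"  # replace with your model

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
)

@spaces.GPU(duration=120)
def inference(prompt: str, max_tokens: int = 256) -> str:
    model.to("cuda")  # Move to GPU inside the decorated function
    # Assumed: build a single-turn chat from the user's prompt, then render
    # it with the model's chat template before tokenizing.
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,  # Must add for generation
    )
    inputs = tokenizer(text, return_tensors="pt").to("cuda")
    outputs = model.generate(
        **inputs,
        max_new_tokens=max_tokens,  # honor the caller's limit instead of a hard-coded 100
        do_sample=True,  # assumed: re-enable sampling so the settings below apply
        temperature=0.7, top_p=0.8, top_k=20,  # sampling values from this commit
        streamer=TextStreamer(tokenizer, skip_prompt=True),
        pad_token_id=tokenizer.eos_token_id,  # assumed: carried over from the old version
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Hypothetical wiring; the Space's actual UI code is not shown in this diff.
demo = gr.Interface(fn=inference, inputs="text", outputs="text")
demo.launch()

One design note: unless the model's generation_config already enables sampling, `do_sample=True` is needed for `temperature`, `top_p`, and `top_k` to take effect; otherwise transformers falls back to greedy decoding and warns that the sampling flags are being ignored.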