DrDavis committed
Commit 04c0cc5 · verified · 1 Parent(s): 2875eed

Update app.py

Files changed (1)
  1. app.py +55 -20
app.py CHANGED
@@ -1,34 +1,69 @@
 import gradio as gr
-from transformers import pipeline
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
 
-MODEL_ID = "google/flan-t5-base" # swap to flan-t5-base if you have more CPU
-pipe = pipeline("text2text-generation", model=MODEL_ID)
+# Tiny, modern instruct model that can (patiently) run on CPU
+MODEL_ID = "Qwen/Qwen2.5-0.5B-Instruct"
 
-def infer(prompt, max_new_tokens, num_beams, length_penalty):
-    if not prompt.strip():
+# Load tokenizer + model
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.float32,   # CPU-safe; on GPU you could use torch.float16/bfloat16
+    low_cpu_mem_usage=True       # helps reduce peak RAM on load
+)
+
+# Make sure a pad token exists (avoids warnings on generation)
+if tokenizer.pad_token_id is None and tokenizer.eos_token_id is not None:
+    tokenizer.pad_token = tokenizer.eos_token
+
+# Wrap with a text-generation pipeline
+pipe = pipeline(
+    task="text-generation",
+    model=model,
+    tokenizer=tokenizer
+)
+
+def infer(prompt, max_new_tokens=128, temperature=0.7, top_p=0.9):
+    """Single-turn chat-style inference with Qwen 0.5B Instruct."""
+    if not prompt or not prompt.strip():
         return "Please type something to generate."
-    # Beam search is deterministic and tends to avoid loops on T5.
-    out = pipe(
-        prompt,
-        max_new_tokens=150,#int(max_new_tokens),
-        num_beams=5,#int(num_beams),
-        early_stopping=True,
-        length_penalty=1,#float(length_penalty), # <1.0 favors shorter, reduces rambling
-        no_repeat_ngram_size=3 # prevents 3-gram loops like "blue sky with"
+
+    # Use Qwen's chat template for better instruct-style behavior
+    messages = [
+        {"role": "system", "content": "You are a helpful, concise assistant for beginners learning about LLMs."},
+        {"role": "user", "content": prompt.strip()}
+    ]
+    chat_prompt = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True   # appends assistant prefix as the generation start
+    )
+
+    # Generation with light anti-repetition guards
+    outputs = pipe(
+        chat_prompt,
+        max_new_tokens=int(max_new_tokens),
+        do_sample=True,
+        temperature=float(temperature),
+        top_p=float(top_p),
+        no_repeat_ngram_size=3,      # prevents short n-gram loops
+        repetition_penalty=1.1,      # gentle nudge against repeating phrases
+        return_full_text=False       # only return the assistant's new text
     )
-    return out[0]["generated_text"]
+
+    return outputs[0]["generated_text"]
 
 demo = gr.Interface(
     fn=infer,
     inputs=[
-        gr.Textbox(lines=3, label="Instruction",
-                   placeholder="Explain in one paragraph: Why is the sky blue?"),
-        gr.Slider(32, 192, 96, step=8, label="Max new tokens"),
-        gr.Slider(2, 8, 4, step=1, label="Num beams"),
-        gr.Slider(0.6, 1.3, 0.9, step=0.05, label="Length penalty")
+        gr.Textbox(lines=3, label="Instruction", placeholder="Explain in one paragraph: Why is the sky blue?"),
+        gr.Slider(16, 256, 128, step=8, label="Max new tokens"),
+        gr.Slider(0.0, 1.5, 0.7, step=0.05, label="Temperature"),
+        gr.Slider(0.1, 1.0, 0.9, step=0.05, label="Top-p"),
     ],
     outputs=gr.Textbox(lines=10, label="Output"),
-    title="FLAN-T5 (Deterministic) — Mini LLM"
+    title="Mini LLM (Local) — Qwen 2.5 (0.5B) Instruct"
 )
 
 if __name__ == "__main__":
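
A minimal sketch of what the new apply_chat_template step builds before generation, useful as a local sanity check for this change. It reuses the model ID and messages from the diff above; the ChatML-style markers noted in the comments are an assumption about Qwen 2.5's default chat template, not something shown in this commit.

# Sketch only, not part of the commit: print the prompt string that the new
# infer() constructs via the tokenizer's chat template.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct")

messages = [
    {"role": "system", "content": "You are a helpful, concise assistant for beginners learning about LLMs."},
    {"role": "user", "content": "Explain in one paragraph: Why is the sky blue?"},
]

chat_prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,  # ends with the assistant turn opener so generation continues from it
)
print(chat_prompt)
# Expected shape (roughly, assuming Qwen 2.5's default ChatML-style template):
#   <|im_start|>system ... <|im_end|>
#   <|im_start|>user ... <|im_end|>
#   <|im_start|>assistant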