ulduldp committed on
Commit
d0d995c
·
verified ·
1 Parent(s): 1f5570d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -78
app.py CHANGED
@@ -1,107 +1,86 @@
1
  import os
 
2
  import gradio as gr
3
  from transformers import AutoTokenizer, AutoModelForCausalLM
4
- import torch
5
 
6
- MODEL_ID = "google/gemma-3-1b-it"
 
7
 
8
- import os
9
- from huggingface_hub import login
10
 
11
- HF_TOKEN = os.getenv("HF_TOKEN")
12
-
13
- login(token=HF_TOKEN)
 
14
 
15
  print("Loading tokenizer...")
16
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=HF_TOKEN)
 
 
 
 
 
 
 
17
 
18
  print("Loading model...")
19
  model = AutoModelForCausalLM.from_pretrained(
20
  MODEL_ID,
21
- torch_dtype=torch.float32,
22
  device_map="cpu",
23
- token=HF_TOKEN
 
24
  )
25
 
26
- print("Model loaded!")
27
-
28
- SYSTEM_PROMPT = """
29
- You are an expert documentary writer and cinematic image prompt engineer.
30
-
31
- Tasks:
32
- 1. Explain facts in engaging documentary style
33
- 2. Generate cinematic AI image prompts
34
- 3. Create social-media-ready narration
35
-
36
- Always:
37
- - Be descriptive
38
- - Use vivid imagery
39
- - Keep responses high quality
40
- """
41
 
 
 
42
 
43
- def generate(prompt, max_new_tokens, temperature):
44
- full_prompt = f"""
45
- {SYSTEM_PROMPT}
 
 
46
 
47
- User: {prompt}
48
- Assistant:
49
- """
50
 
51
- inputs = tokenizer(
52
- full_prompt,
53
- return_tensors="pt"
 
54
  )
55
 
56
- with torch.no_grad():
 
 
57
  outputs = model.generate(
58
  **inputs,
59
- max_new_tokens=max_new_tokens,
60
- temperature=temperature,
61
  do_sample=True,
62
- top_p=0.95,
63
- repetition_penalty=1.1
 
 
 
64
  )
65
 
66
- response = tokenizer.decode(
67
- outputs[0],
68
- skip_special_tokens=True
69
- )
70
-
71
- response = response.split("Assistant:")[-1].strip()
72
-
73
- return response
74
-
75
-
76
- demo = gr.Interface(
77
- fn=generate,
78
- inputs=[
79
- gr.Textbox(
80
- lines=8,
81
- label="Prompt",
82
- placeholder="Enter your fact or image prompt request..."
83
- ),
84
- gr.Slider(
85
- minimum=64,
86
- maximum=1024,
87
- value=256,
88
- step=32,
89
- label="Max New Tokens"
90
- ),
91
- gr.Slider(
92
- minimum=0.1,
93
- maximum=1.5,
94
- value=0.7,
95
- step=0.1,
96
- label="Temperature"
97
- )
98
  ],
99
- outputs=gr.Textbox(
100
- lines=20,
101
- label="Response"
102
- ),
103
- title="Gemma 3 4B CPU Demo",
104
- description="Running fully on CPU using Hugging Face Spaces"
105
  )
106
 
107
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
 
1
  import os
2
+ import torch
3
  import gradio as gr
4
  from transformers import AutoTokenizer, AutoModelForCausalLM
 
5
 
6
+ MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
7
+ HF_TOKEN = os.getenv("HF_TOKEN", None)
8
 
9
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
10
+ torch.set_num_threads(max(1, (os.cpu_count() or 4) - 1))
11
 
12
+ SYSTEM_PROMPT = (
13
+ "You are a helpful assistant. Answer clearly and concisely. "
14
+ "If the user asks for tool JSON, return only valid JSON."
15
+ )
16
 
17
  print("Loading tokenizer...")
18
+ tokenizer = AutoTokenizer.from_pretrained(
19
+ MODEL_ID,
20
+ token=HF_TOKEN,
21
+ use_fast=True,
22
+ )
23
+
24
+ if tokenizer.pad_token is None:
25
+ tokenizer.pad_token = tokenizer.eos_token
26
 
27
  print("Loading model...")
28
  model = AutoModelForCausalLM.from_pretrained(
29
  MODEL_ID,
30
+ token=HF_TOKEN,
31
  device_map="cpu",
32
+ torch_dtype=torch.float32,
33
+ low_cpu_mem_usage=True,
34
  )
35
 
36
+ model.eval()
37
+ print("Model loaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
+ def respond(message, history):
40
+ messages = [{"role": "system", "content": SYSTEM_PROMPT}]
41
 
42
+ for user_msg, assistant_msg in history:
43
+ if user_msg:
44
+ messages.append({"role": "user", "content": user_msg})
45
+ if assistant_msg:
46
+ messages.append({"role": "assistant", "content": assistant_msg})
47
 
48
+ messages.append({"role": "user", "content": message})
 
 
49
 
50
+ prompt = tokenizer.apply_chat_template(
51
+ messages,
52
+ tokenize=False,
53
+ add_generation_prompt=True,
54
  )
55
 
56
+ inputs = tokenizer(prompt, return_tensors="pt")
57
+
58
+ with torch.inference_mode():
59
  outputs = model.generate(
60
  **inputs,
61
+ max_new_tokens=256,
 
62
  do_sample=True,
63
+ temperature=0.7,
64
+ top_p=0.9,
65
+ repetition_penalty=1.05,
66
+ pad_token_id=tokenizer.eos_token_id,
67
+ eos_token_id=tokenizer.eos_token_id,
68
  )
69
 
70
+ new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
71
+ reply = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
72
+ return reply
73
+
74
+ demo = gr.ChatInterface(
75
+ fn=respond,
76
+ title="Qwen2.5-1.5B CPU Chat",
77
+ description="Directly loads the model from Hugging Face Hub. No custom model upload needed.",
78
+ examples=[
79
+ "Explain black holes in simple words.",
80
+ "Write a cinematic image prompt for a medieval knight in a storm.",
81
+ "Set a timer for 10 minutes because pizza is baking.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  ],
 
 
 
 
 
 
83
  )
84
 
85
+ if __name__ == "__main__":
86
+ demo.queue().launch(server_name="0.0.0.0", server_port=7860)