simonper committed
Commit 83cbea2 · verified · 1 Parent(s): 14d11ec

Update app.py

Files changed (1):
  1. app.py +76 -86
app.py CHANGED
@@ -1,107 +1,92 @@
  import gradio as gr
- import torch
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
- from threading import Thread
-
- # --- 1. SETUP MODEL & TOKENIZER ---
- # User requested the BASE (Untrained) version, not Instruct.
- MODEL_ID = "meta-llama/Llama-3.2-1B"
-
- # Check for a GPU, otherwise fall back to CPU
- device = "cuda" if torch.cuda.is_available() else "cpu"
- print(f"Loading base model on: {device}")
-
- try:
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
-     model = AutoModelForCausalLM.from_pretrained(
-         MODEL_ID,
-         torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
-         device_map="auto"
-     )
-
-     # CRITICAL FIX FOR BASE MODELS:
-     # Base models often do not have a 'chat_template' defined in their config
-     # because they aren't meant for chat. We must manually assign the Llama 3
-     # template so the code doesn't crash when using apply_chat_template.
-     if tokenizer.chat_template is None:
-         print("Base model detected: Assigning default Llama 3 chat template...")
-         tokenizer.chat_template = (
-             "{% set loop_messages = messages %}"
-             "{% for message in loop_messages %}"
-             "{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' + message['content'] | trim + '<|eot_id|>' %}"
-             "{% if loop.index0 == 0 %}"
-             "{% set content = '<|begin_of_text|>' + content %}"
-             "{% endif %}"
-             "{{ content }}"
-             "{% endfor %}"
-             "{% if add_generation_prompt %}"
-             "{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}"
-             "{% endif %}"
          )
-     # Reuse EOS as the pad token so generation has a valid pad_token_id
-     tokenizer.pad_token_id = tokenizer.eos_token_id
-
- except Exception as e:
-     print(f"Error loading model. Ensure you have a valid HF_TOKEN and access to the gated repo. Error: {e}")
-     raise e

- # --- 2. GENERATION FUNCTION ---
  def respond(
      message,
      history: list[dict],
-     system_message_dummy,
      max_tokens,
      temperature,
      top_p,
      repetition_penalty,
      style_mode,
  ):
-     # Base models mostly ignore system prompts, but we include one for structure
-     system_prompt = "You are an AI assistant."
-     if style_mode == "Shakespeare":
-         system_prompt = "You are William Shakespeare. Speak in Early Modern English."
-     elif style_mode == "Funny/Ironic":
-         system_prompt = "You are a sarcastic comedian."
-
-     # Context Window Management
-     if len(history) > 10:
-         history = history[-10:]
-
-     # Build messages
-     messages = [{"role": "system", "content": system_prompt}]
-     for turn in history:
          messages.append({"role": turn['role'], "content": turn['content']})
      messages.append({"role": "user", "content": message})

-     # Apply Template
-     input_ids = tokenizer.apply_chat_template(
-         messages,
-         add_generation_prompt=True,
-         return_tensors="pt"
-     ).to(model.device)
-
-     terminators = [
-         tokenizer.eos_token_id,
-         tokenizer.convert_tokens_to_ids("<|eot_id|>")
-     ]
-
-     # Generate
-     outputs = model.generate(
-         input_ids,
-         max_new_tokens=int(max_tokens),
-         eos_token_id=terminators,
          temperature=float(temperature),
          top_p=float(top_p),
-         repetition_penalty=float(repetition_penalty),
-         do_sample=True,
      )

-     response = outputs[0][input_ids.shape[-1]:]
-     decoded_response = tokenizer.decode(response, skip_special_tokens=True)
-     return decoded_response

- # --- 3. GUI SETUP ---
- # (Kept identical to previous, just updated title)
  chatbot = gr.ChatInterface(
      respond,
      type="messages",
@@ -111,13 +96,18 @@ chatbot = gr.ChatInterface(
          gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
          gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty"),
-         gr.Dropdown(choices=["Normal", "Professional", "Shakespeare", "Funny/Ironic"], value="Normal", label="Style"),
      ],
  )

  with gr.Blocks() as demo:
-     gr.Markdown("# Chat with Llama 3.2 1B (Base/Untrained)")
-     gr.Markdown("> **Warning:** You are running the base model. It will likely hallucinate or autocomplete text rather than chatting normally.")
      chatbot.render()

  if __name__ == "__main__":
 
  import gradio as gr
+ from llama_cpp import Llama
+ from transformers import AutoTokenizer
+
+
+ MODEL_REPO = "simonper/Llama-3.2-1B-bnb-4bit_untrained_gguf_4bit"
+ MODEL_FILE = "Llama-3.2-1B.Q4_K_M.gguf"
+
+
+ TOKENIZER_ID = "chthees/lora_model_full_finetome-tokenizer"
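+ # The tokenizer is used only to build the prompt string and to supply stop
+ # tokens in respond(); llama.cpp does the actual tokenization from the GGUF file.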
+
+ print("Loading Tokenizer...")
+ tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_ID)
+
+ print("Loading Model...")
+ llm = Llama.from_pretrained(
+     repo_id=MODEL_REPO,
+     filename=MODEL_FILE,
+     n_ctx=2048,
+     n_threads=2,
+     verbose=False
+ )
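+ # n_ctx caps prompt + completion at 2048 tokens; n_threads=2 is a conservative
+ # setting for a small CPU host and can be raised when more cores are available.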
+
+ # --- SYSTEM PROMPT LOGIC ---
+ def get_system_prompt(style_mode):
+     base_instruction = "You are a helpful and intelligent AI assistant."

+     prompts = {
+         "Normal": f"{base_instruction} Answer clearly and concisely.",
+         "Professional": (
+             f"{base_instruction} You are a senior corporate executive. "
+             "Your tone is strictly professional, polite, and business-oriented."
+         ),
+         "Shakespeare": (
+             f"{base_instruction} You are William Shakespeare. "
+             "Speak only in Early Modern English (thee, thou, hath). Be poetic and dramatic."
+         ),
+         "Funny/Ironic": (
+             f"{base_instruction} You are a sarcastic comedian. "
+             "Wrap your answers in dry humor, irony, and witty remarks."
          )
+     }
+     return prompts.get(style_mode, prompts["Normal"])
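+ # Unknown style values fall back to the "Normal" persona via dict.get(),
+ # so a new dropdown choice cannot crash respond() with a KeyError.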

+ # --- CORE RESPONSE FUNCTION ---
  def respond(
      message,
      history: list[dict],
+     system_message_dummy,
      max_tokens,
      temperature,
      top_p,
      repetition_penalty,
      style_mode,
  ):
+     messages = []
+
+     # Add System Persona
+     system_prompt = get_system_prompt(style_mode)
+     messages.append({"role": "system", "content": system_prompt})
+
+     # Add Conversation History
+     # We slice to the last 10 messages (5 exchanges) to keep the context window manageable
+     for turn in history[-10:]:
          messages.append({"role": turn['role'], "content": turn['content']})
+
+     # Add Current User Message
      messages.append({"role": "user", "content": message})

+     prompt_str = tokenizer.apply_chat_template(
+         messages,
+         tokenize=False,
+         add_generation_prompt=True
+     )
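+     # tokenize=False returns the formatted prompt as plain text; llama-cpp-python
+     # re-tokenizes it internally, so no tensors are needed here.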
+
+     # Generate Response
+     output = llm(
+         prompt_str,
+         max_tokens=int(max_tokens),
          temperature=float(temperature),
          top_p=float(top_p),
+         repeat_penalty=float(repetition_penalty),
+         stop=[tokenizer.eos_token, "<|eot_id|>"],
+         echo=False
      )
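+     # llm() returns an OpenAI-style completion dict; the generated text lives in
+     # output["choices"][0]["text"]. Generation halts at tokenizer.eos_token or at
+     # Llama 3's <|eot_id|> end-of-turn marker, whichever appears first.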

+     return output["choices"][0]["text"].strip()

+ # --- GUI SETUP ---
  chatbot = gr.ChatInterface(
      respond,
      type="messages",
 
          gr.Slider(minimum=0.1, maximum=2.0, value=0.7, label="Temperature"),
          gr.Slider(minimum=0.1, maximum=1.0, value=0.9, label="Top-p"),
          gr.Slider(minimum=1.0, maximum=2.0, value=1.1, step=0.05, label="Repetition Penalty"),
+         gr.Dropdown(
+             choices=["Normal", "Professional", "Shakespeare", "Funny/Ironic"],
+             value="Normal",
+             label="Choose the Style / Tone"
+         )
      ],
  )
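+ # additional_inputs map positionally onto respond()'s parameters after
+ # (message, history): system_message_dummy, max_tokens, temperature, top_p,
+ # repetition_penalty, style_mode; the inputs before the sliders sit outside
+ # this hunk.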

  with gr.Blocks() as demo:
+     gr.Markdown("# Styled Chat Bot")
+     with gr.Sidebar():
+         gr.LoginButton()
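+         # LoginButton renders a Hugging Face sign-in control; it is intended
+         # for Spaces with OAuth enabled and is not consumed elsewhere in this app.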
      chatbot.render()

  if __name__ == "__main__":