fix model handling + more part 2

#4
by Scoopala7 - opened
Files changed (1) hide show
  1. app.py +45 -101
app.py CHANGED
@@ -1,106 +1,55 @@
1
  import os
2
- from threading import Thread
3
-
4
  import gradio as gr
5
- import torch
6
- from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
7
-
8
 
9
  MODEL_ID = os.getenv("MODEL_ID", "GenueAI/Inelly-4.5-Blaze")
10
-
11
- tokenizer = None
12
- model = None
13
-
14
 
15
  def load_model():
16
- global tokenizer, model
17
-
18
- if tokenizer is not None and model is not None:
19
- return tokenizer, model
20
-
21
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
22
- if tokenizer.pad_token_id is None:
23
- tokenizer.pad_token = tokenizer.eos_token
24
-
25
- kwargs = {
26
- "trust_remote_code": True,
27
- "torch_dtype": torch.float16 if torch.cuda.is_available() else torch.float32,
28
- "low_cpu_mem_usage": True,
29
- }
30
-
31
- if torch.cuda.is_available():
32
- kwargs["device_map"] = "auto"
33
-
34
- model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **kwargs)
35
- if not torch.cuda.is_available():
36
- model = model.to("cpu")
37
-
38
- model.eval()
39
- return tokenizer, model
40
-
41
-
42
- def build_prompt(message, history, system_prompt):
43
  messages = []
44
  if system_prompt.strip():
45
  messages.append({"role": "system", "content": system_prompt.strip()})
46
 
47
- for user_message, assistant_message in history:
48
- if user_message:
49
- messages.append({"role": "user", "content": user_message})
50
- if assistant_message:
51
- messages.append({"role": "assistant", "content": assistant_message})
52
-
53
- messages.append({"role": "user", "content": message})
54
-
55
- tok, _ = load_model()
56
- if hasattr(tok, "apply_chat_template") and tok.chat_template:
57
- return tok.apply_chat_template(
58
- messages,
59
- tokenize=False,
60
- add_generation_prompt=True,
61
- )
62
-
63
- prompt = ""
64
- for item in messages:
65
- role = item["role"].capitalize()
66
- prompt += f"{role}: {item['content']}\n"
67
- return prompt + "Assistant:"
68
-
69
-
70
- def chat(message, history, system_prompt, max_new_tokens, temperature, top_p, repetition_penalty):
71
- if not message.strip():
72
- yield ""
73
- return
74
-
75
- tok, mdl = load_model()
76
- prompt = build_prompt(message, history, system_prompt)
77
- inputs = tok(prompt, return_tensors="pt").to(mdl.device)
78
-
79
- streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
80
- generation_kwargs = {
81
- **inputs,
82
- "streamer": streamer,
83
- "max_new_tokens": int(max_new_tokens),
84
- "temperature": float(temperature),
85
- "top_p": float(top_p),
86
- "repetition_penalty": float(repetition_penalty),
87
- "do_sample": temperature > 0,
88
- "pad_token_id": tok.pad_token_id,
89
- "eos_token_id": tok.eos_token_id,
90
- }
91
-
92
- thread = Thread(target=mdl.generate, kwargs=generation_kwargs)
93
- thread.start()
94
-
95
- response = ""
96
- for token in streamer:
97
- response += token
98
- yield response
99
-
100
-
101
- with gr.Blocks(title="Matrix Prime 8B Chat") as demo:
102
- gr.Markdown("# Matrix Prime 8B Chat")
103
- gr.Markdown(f"Chat with `{MODEL_ID}` from Hugging Face.")
104
 
105
  with gr.Row():
106
  with gr.Column(scale=4):
@@ -112,21 +61,16 @@ with gr.Blocks(title="Matrix Prime 8B Chat") as demo:
112
  value="You are a helpful assistant.",
113
  lines=3,
114
  ),
115
- gr.Slider(64, 4096, value=512, step=32, label="Max new tokens"),
116
- gr.Slider(0.0, 2.0, value=0.7, step=0.05, label="Temperature"),
117
- gr.Slider(0.05, 1.0, value=0.9, step=0.05, label="Top-p"),
118
- gr.Slider(1.0, 2.0, value=1.1, step=0.05, label="Repetition penalty"),
119
  ],
120
  textbox=gr.Textbox(
121
- placeholder="Ask Matrix Prime 8B anything...",
 
122
  container=False,
123
  scale=7,
124
  ),
125
- submit_btn="Send",
126
- stop_btn="Stop"
127
  )
128
 
129
-
130
  if __name__ == "__main__":
 
131
  demo.queue()
132
  demo.launch()
 
1
  import os
 
 
2
  import gradio as gr
3
+ from transformers import pipeline
 
 
4
 
5
  MODEL_ID = os.getenv("MODEL_ID", "GenueAI/Inelly-4.5-Blaze")
6
+ MODEL_NAME = os.getenv("MODEL_NAME", "Inelly 4.5 Blaze")
7
+ pipe = None
 
 
8
 
def load_model():
    """Build the shared text-generation pipeline once and return it.

    The pipeline is stored in the module-level ``pipe`` global so that the
    chat handler can reuse it. Calling this function again after a
    successful load is a cheap no-op that returns the existing pipeline
    instead of loading the checkpoint a second time.

    Returns:
        The pipeline object held in ``pipe``.
    """
    global pipe

    # Loading model weights is the expensive step; never repeat it.
    if pipe is not None:
        return pipe

    pipe = pipeline(
        "text-generation",
        model=MODEL_ID,
        torch_dtype="auto",
        model_kwargs={
            "low_cpu_mem_usage": True,
            "device_map": "sequential",
        },
    )
    return pipe
20
+
def build_prompt(message_text, history, system_prompt):
    """Assemble the chat-format message list that is handed to the pipeline.

    Args:
        message_text: The new user message, already stripped.
        history: Previous turns. Each entry is either a dict with ``"role"``
            and ``"content"`` keys (any extra keys are dropped) or a
            ``(user_message, assistant_message)`` pair. ``None`` is treated
            as an empty history.
        system_prompt: Optional system instruction; ``None`` or a
            whitespace-only string adds no system message.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts ordered as
        system prompt (if any), history, then the new user message.

    Raises:
        KeyError: If a dict history entry lacks ``"role"`` or ``"content"``.
    """
    messages = []

    system_text = (system_prompt or "").strip()
    if system_text:
        messages.append({"role": "system", "content": system_text})

    for entry in history or []:
        if isinstance(entry, dict):
            # Copy only the two keys the chat format needs.
            messages.append({"role": entry["role"], "content": entry["content"]})
            continue

        # Pair-style history: either half of a turn may be empty.
        user_text, assistant_text = entry
        if user_text:
            messages.append({"role": "user", "content": user_text})
        if assistant_text:
            messages.append({"role": "assistant", "content": assistant_text})

    messages.append({"role": "user", "content": message_text})
    return messages
31
+
def chat(message, history, system_prompt):
    """Generate the assistant reply for one chat turn.

    Args:
        message: The user's input, either a plain string or a dict whose
            ``"text"`` key holds the typed text.
        history: Earlier turns, forwarded unchanged to ``build_prompt``.
        system_prompt: Optional system instruction, forwarded unchanged to
            ``build_prompt``.

    Returns:
        The reply text, or ``""`` when the message is empty or only
        whitespace. If the pipeline output has an unexpected shape, its
        string representation is returned instead of raising.
    """
    if isinstance(message, dict):
        # The "text" key may be absent or explicitly None.
        message_text = str(message.get("text") or "").strip()
    else:
        message_text = str(message if message is not None else "").strip()

    if not message_text:
        return ""

    # The script entry point loads the model up front; load on demand so the
    # handler also works when this module is imported instead of executed.
    if pipe is None:
        load_model()

    prompt = build_prompt(message_text, history, system_prompt)
    outputs = pipe(prompt)
    return _extract_reply(outputs)


def _extract_reply(outputs):
    """Return the content of the last generated message, or ``str(outputs)``."""
    # Try the list-wrapped result first, then a bare result dict.
    for unwrap in (lambda result: result[0], lambda result: result):
        try:
            return unwrap(outputs)["generated_text"][-1]["content"]
        except (KeyError, IndexError, TypeError):
            continue
    return str(outputs)
49
+
50
+ with gr.Blocks(title="Genue Chat") as demo:
51
+ gr.Markdown("# Genue Chat")
52
+ gr.Markdown(f"Chat with {MODEL_NAME} from Hugging Face.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
 
54
  with gr.Row():
55
  with gr.Column(scale=4):
 
61
  value="You are a helpful assistant.",
62
  lines=3,
63
  ),
 
 
 
 
64
  ],
65
  textbox=gr.Textbox(
66
+ label="Prompt",
67
+ placeholder="Ask anything...",
68
  container=False,
69
  scale=7,
70
  ),
 
 
71
  )
72
 
 
73
  if __name__ == "__main__":
74
+ load_model()
75
  demo.queue()
76
  demo.launch()