Deva1211 committed
Commit b8dd0f7 · 1 Parent(s): fdac9a3

Switched to TheBloke/Mistral-7B-Instruct-v0.2-AWQ

Files changed (3):
  1. app.py +31 -12
  2. requirements.txt +3 -1
  3. test_model.py +28 -14
app.py CHANGED
@@ -4,9 +4,14 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import re
 
 # Load model and tokenizer
-print("Loading gemma...")
-tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b-it")
-model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
+print("Loading Mistral-7B-Instruct AWQ...")
+tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-AWQ", trust_remote_code=True)
+model = AutoModelForCausalLM.from_pretrained(
+    "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
+    device_map="auto",
+    trust_remote_code=True,
+    torch_dtype=torch.float16
+)
 
 # Add pad token if it doesn't exist
 if tokenizer.pad_token is None:
@@ -88,33 +93,47 @@ def respond(message, history, max_length=150, temperature=0.9, top_p=0.9, top_k=
     if check_crisis_keywords(message):
         return get_crisis_response()
 
-    # Build conversation history - SIMPLIFIED for DialoGPT
-    conversation = ""
+    # Build conversation history using Mistral chat template
+    messages = []
+
+    # Add system message for Aura personality
+    messages.append({"role": "system", "content": AURA_SYSTEM_PROMPT})
 
     # Only include last 2-3 exchanges to avoid overwhelming the model
     recent_history = history[-2:] if len(history) > 2 else history
 
     for user_msg, bot_msg in recent_history:
-        conversation += f"{user_msg}{tokenizer.eos_token}{bot_msg}{tokenizer.eos_token}"
+        messages.append({"role": "user", "content": user_msg})
+        if bot_msg:
+            messages.append({"role": "assistant", "content": bot_msg})
 
     # Add current message
-    conversation += f"{message}{tokenizer.eos_token}"
+    messages.append({"role": "user", "content": message})
+
+    # Apply chat template
+    conversation = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
 
     # Tokenize
     input_ids = tokenizer.encode(conversation, return_tensors="pt")
 
-    # Generate response with configurable parameters
+    # Generate response with configurable parameters optimized for Mistral
     with torch.no_grad():
         chat_history_ids = model.generate(
-            input_ids,
-            max_length=max_length,
+            input_ids.to(model.device),
+            max_new_tokens=min(max_length - input_ids.shape[-1], 512),  # Use max_new_tokens instead
             temperature=temperature,
             top_p=top_p,
             repetition_penalty=repetition_penalty,
             do_sample=True,
             top_k=top_k,
-            pad_token_id=tokenizer.eos_token_id,
-            no_repeat_ngram_size=3
+            pad_token_id=tokenizer.pad_token_id,
+            eos_token_id=tokenizer.eos_token_id,
+            no_repeat_ngram_size=2,
+            use_cache=True
         )
 
     # Decode only the new response
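
The hunk ends at the decode step, which this commit leaves untouched. For context, a minimal sketch of how decoding only the newly generated tokens usually looks with this setup; the variable names follow the diff, but the slicing below is an assumption, not the file's actual code:

    # Sketch only: drop the prompt tokens and decode just what the model generated
    new_tokens = chat_history_ids[0, input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()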
requirements.txt CHANGED
@@ -1,3 +1,5 @@
 torch>=2.0.0,<2.2.0
-transformers>=4.30.0,<4.40.0
+transformers>=4.35.0,<4.40.0
+autoawq>=0.1.8
+accelerate>=0.20.0
 gradio>=3.50.0,<4.0.0
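
The two new packages match the loading code above: transformers relies on autoawq for AWQ-quantized checkpoints, and accelerate backs device_map="auto". A quick post-install sanity check one might run (hypothetical snippet, not part of the commit):

    # Sketch only: confirm the pinned dependencies resolve in the environment
    from importlib.metadata import version
    for pkg in ("torch", "transformers", "autoawq", "accelerate", "gradio"):
        print(pkg, version(pkg))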
test_model.py CHANGED
@@ -1,17 +1,22 @@
 #!/usr/bin/env python3
 """
-Test script to validate DialoGPT model response generation
+Test script to validate Mistral-7B-Instruct AWQ model response generation
 """
 
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 def test_model():
-    print("Loading DialoGPT-medium for testing...")
+    print("Loading Mistral-7B-Instruct AWQ for testing...")
 
     # Load model and tokenizer
-    tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
-    model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")
+    tokenizer = AutoTokenizer.from_pretrained("TheBloke/Mistral-7B-Instruct-v0.2-AWQ", trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(
+        "TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
+        device_map="auto",
+        trust_remote_code=True,
+        torch_dtype=torch.float16
+    )
 
     if tokenizer.pad_token is None:
         tokenizer.pad_token = tokenizer.eos_token
@@ -28,24 +33,33 @@ def test_model():
     for i, message in enumerate(test_messages):
         print(f"\n--- Test {i+1}: '{message}' ---")
 
-        # Simple conversation format
-        conversation = f"{message}{tokenizer.eos_token}"
+        # Use Mistral chat template format
+        messages = [
+            {"role": "user", "content": message}
+        ]
+
+        # Apply chat template
+        conversation = tokenizer.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+
         input_ids = tokenizer.encode(conversation, return_tensors="pt")
 
-        # Generate response with conservative settings
+        # Generate response with settings optimized for Mistral AWQ
         with torch.no_grad():
             chat_history_ids = model.generate(
-                input_ids,
-                max_length=input_ids.shape[-1] + 50,
-                num_beams=5,
-                no_repeat_ngram_size=3,
+                input_ids.to(model.device),
+                max_new_tokens=100,
+                no_repeat_ngram_size=2,
                 do_sample=True,
-                early_stopping=True,
-                pad_token_id=tokenizer.eos_token_id,
+                pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=tokenizer.eos_token_id,
                 temperature=0.9,
                 top_k=50,
-                top_p=0.9
+                top_p=0.9,
+                use_cache=True
            )
 
         # Decode response
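
The script is still invoked directly with python test_model.py, but the switch from DialoGPT-medium changes the hardware requirement: AWQ kernels generally need a CUDA device, so the test will not run on a CPU-only machine. A hedged pre-flight check one might add before calling test_model() (hypothetical, not part of this commit):

    # Sketch only: fail fast when no GPU is available, since AWQ inference
    # generally requires CUDA
    import torch
    if not torch.cuda.is_available():
        raise SystemExit("No CUDA device found; the AWQ model cannot run on CPU.")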