atiwari751 committed
Commit 466d588 · 1 Parent(s): 4ea2486

ongoing struggles 12

Files changed (1)
  1. app.py +1 -26
app.py CHANGED
@@ -87,20 +87,16 @@ if not st.session_state.model_loaded:
         with st.spinner("Loading the fine-tuned model... This may take a minute."):
             # Check if CUDA is available, otherwise use CPU
             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            st.write(f"Using device: {device}")
 
             # Load tokenizer
-            st.write("Loading tokenizer...")
             tokenizer = AutoTokenizer.from_pretrained(
                 "microsoft/phi-2",
                 trust_remote_code=True
             )
             if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
                 tokenizer.pad_token = tokenizer.unk_token
-            st.write("Tokenizer loaded successfully")
 
             # Load base model with simpler configuration for CPU
-            st.write("Loading base model...")
             base_model = AutoModelForCausalLM.from_pretrained(
                 "microsoft/phi-2",
                 trust_remote_code=True,
@@ -108,24 +104,18 @@ if not st.session_state.model_loaded:
                 low_cpu_mem_usage=True
             )
             base_model = base_model.to(device)
-            st.write("Base model loaded successfully")
 
             # Check if model path exists
             model_path = "./final_model"
-            st.write(f"Checking model path: {model_path}")
             if os.path.exists(model_path):
-                st.write(f"Model path exists. Contents: {os.listdir(model_path)}")
-
                 try:
                     # Load the fine-tuned LoRA adapter
-                    st.write("Loading fine-tuned adapter...")
                     model = PeftModel.from_pretrained(
                         base_model,
                         model_path,
                         device_map=None # Don't use device_map on CPU
                     )
                     model = model.to(device)
-                    st.write("Fine-tuned adapter loaded successfully")
 
                     model.eval() # Set model to evaluation mode
 
@@ -133,21 +123,15 @@ if not st.session_state.model_loaded:
                     st.session_state.model = model
                     st.session_state.tokenizer = tokenizer
                     st.session_state.model_loaded = True
-                    st.success("Fine-tuned model loaded successfully!")
                 except Exception as e:
-                    st.error(f"Error loading fine-tuned adapter: {str(e)}")
-                    st.write(f"Error details: {type(e).__name__}")
                     # Fall back to base model
-                    st.warning("Falling back to base model")
                     model = base_model
                     model.eval()
                     st.session_state.model = model
                     st.session_state.tokenizer = tokenizer
                     st.session_state.model_loaded = True
             else:
-                st.error(f"Model path {model_path} does not exist!")
                 # Fall back to base model
-                st.warning("Falling back to base model")
                 model = base_model
                 model.eval()
                 st.session_state.model = model
@@ -155,14 +139,11 @@ if not st.session_state.model_loaded:
                 st.session_state.model_loaded = True
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
-        st.write(f"Error details: {type(e).__name__}")
 
-# Function to generate response - focused on early stopping
+# Function to generate response - clean version without debug output
 def generate_response(model, tokenizer, prompt):
-    st.write("Generating response...")
     try:
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-        st.write("Input tokenized")
 
         # Define stopping words and get their token IDs
         stop_words = ["Human:", "User:"]
@@ -178,8 +159,6 @@ def generate_response(model, tokenizer, prompt):
         all_stop_ids = stop_token_ids + [tokenizer.eos_token_id]
 
         with torch.no_grad():
-            st.write("Starting generation...")
-
             # Generate with combined stop tokens
             outputs = model.generate(
                 input_ids=inputs["input_ids"],
@@ -189,21 +168,17 @@ def generate_response(model, tokenizer, prompt):
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=all_stop_ids # Only specify once
             )
-            st.write("Generation completed")
 
             # Extract just the new tokens
             input_length = inputs["input_ids"].shape[1]
             response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
-            st.write(f"Raw response: '{response}'")
 
             # Minimal cleaning - just remove any "Assistant:" prefix
             if response.startswith("Assistant:"):
                 response = response[len("Assistant:"):].strip()
 
-            st.write(f"Cleaned response: '{response}'")
             return response
     except Exception as e:
-        st.write(f"Error in generate_response: {str(e)}")
         return f"Error generating response: {str(e)}"
 
 # Display chat messages
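Note: the loop that turns stop_words into stop_token_ids sits in lines the diff elides, so only its inputs (stop_words) and output (all_stop_ids) are visible above. The sketch below shows one common way to wire stop words into model.generate via eos_token_id, which accepts a list of token IDs in recent transformers releases. It is not the repository's actual code: the prompt, max_new_tokens value, and the single-token filter are assumptions for illustration.

# Hypothetical, self-contained sketch of the stop-token pattern used by generate_response
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2", trust_remote_code=True)
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    tokenizer.pad_token = tokenizer.unk_token

stop_words = ["Human:", "User:"]
stop_token_ids = []
for word in stop_words:
    ids = tokenizer.encode(word, add_special_tokens=False)
    # eos_token_id matches individual token IDs, so only keep stop words that
    # encode to a single token; multi-token strings need a StoppingCriteria.
    if len(ids) == 1:
        stop_token_ids.append(ids[0])
all_stop_ids = stop_token_ids + [tokenizer.eos_token_id]

prompt = "Human: What does the adapter change?\nAssistant:"  # assumed example prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=64,
        pad_token_id=tokenizer.pad_token_id,
        eos_token_id=all_stop_ids,  # stop as soon as any of these IDs is generated
    )
input_length = inputs["input_ids"].shape[1]
print(tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True))

Filtering to single-token stop words keeps the eos_token_id semantics exact; how app.py itself builds stop_token_ids is not shown in this commit.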