Commit 466d588
Parent(s): 4ea2486
ongoing struggles 12

app.py CHANGED
@@ -87,20 +87,16 @@ if not st.session_state.model_loaded:
         with st.spinner("Loading the fine-tuned model... This may take a minute."):
             # Check if CUDA is available, otherwise use CPU
             device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-            st.write(f"Using device: {device}")

             # Load tokenizer
-            st.write("Loading tokenizer...")
             tokenizer = AutoTokenizer.from_pretrained(
                 "microsoft/phi-2",
                 trust_remote_code=True
             )
             if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
                 tokenizer.pad_token = tokenizer.unk_token
-            st.write("Tokenizer loaded successfully")

             # Load base model with simpler configuration for CPU
-            st.write("Loading base model...")
             base_model = AutoModelForCausalLM.from_pretrained(
                 "microsoft/phi-2",
                 trust_remote_code=True,
@@ -108,24 +104,18 @@
                 low_cpu_mem_usage=True
             )
             base_model = base_model.to(device)
-            st.write("Base model loaded successfully")

             # Check if model path exists
             model_path = "./final_model"
-            st.write(f"Checking model path: {model_path}")
             if os.path.exists(model_path):
-                st.write(f"Model path exists. Contents: {os.listdir(model_path)}")
-
                 try:
                     # Load the fine-tuned LoRA adapter
-                    st.write("Loading fine-tuned adapter...")
                     model = PeftModel.from_pretrained(
                         base_model,
                         model_path,
                         device_map=None  # Don't use device_map on CPU
                     )
                     model = model.to(device)
-                    st.write("Fine-tuned adapter loaded successfully")

                     model.eval()  # Set model to evaluation mode

@@ -133,21 +123,15 @@ if not st.session_state.model_loaded:
                     st.session_state.model = model
                     st.session_state.tokenizer = tokenizer
                     st.session_state.model_loaded = True
-                    st.success("Fine-tuned model loaded successfully!")
                 except Exception as e:
-                    st.error(f"Error loading fine-tuned adapter: {str(e)}")
-                    st.write(f"Error details: {type(e).__name__}")
                     # Fall back to base model
-                    st.warning("Falling back to base model")
                     model = base_model
                     model.eval()
                     st.session_state.model = model
                     st.session_state.tokenizer = tokenizer
                     st.session_state.model_loaded = True
             else:
-                st.error(f"Model path {model_path} does not exist!")
                 # Fall back to base model
-                st.warning("Falling back to base model")
                 model = base_model
                 model.eval()
                 st.session_state.model = model
@@ -155,14 +139,11 @@ if not st.session_state.model_loaded:
                 st.session_state.model_loaded = True
     except Exception as e:
         st.error(f"Error loading model: {str(e)}")
-        st.write(f"Error details: {type(e).__name__}")

-# Function to generate response -
+# Function to generate response - clean version without debug output
 def generate_response(model, tokenizer, prompt):
-    st.write("Generating response...")
     try:
         inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-        st.write("Input tokenized")

         # Define stopping words and get their token IDs
         stop_words = ["Human:", "User:"]
@@ -178,8 +159,6 @@ def generate_response(model, tokenizer, prompt):
         all_stop_ids = stop_token_ids + [tokenizer.eos_token_id]

         with torch.no_grad():
-            st.write("Starting generation...")
-
             # Generate with combined stop tokens
             outputs = model.generate(
                 input_ids=inputs["input_ids"],
@@ -189,21 +168,17 @@ def generate_response(model, tokenizer, prompt):
                 pad_token_id=tokenizer.pad_token_id,
                 eos_token_id=all_stop_ids  # Only specify once
             )
-            st.write("Generation completed")

         # Extract just the new tokens
         input_length = inputs["input_ids"].shape[1]
         response = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True)
-        st.write(f"Raw response: '{response}'")

         # Minimal cleaning - just remove any "Assistant:" prefix
         if response.startswith("Assistant:"):
             response = response[len("Assistant:"):].strip()

-        st.write(f"Cleaned response: '{response}'")
         return response
     except Exception as e:
-        st.write(f"Error in generate_response: {str(e)}")
         return f"Error generating response: {str(e)}"

 # Display chat messages
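For reference, the model-loading flow after this commit reduces to the condensed sketch below. It is not the verbatim app.py; it only rearranges the calls visible in the diff and assumes the usual imports for the libraries used there (streamlit, torch, transformers, peft) plus the ./final_model adapter directory.

import os

import streamlit as st
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Tokenizer: phi-2 has no dedicated pad token, so fall back to the unk token
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2", trust_remote_code=True)
if tokenizer.pad_token is None or tokenizer.pad_token == tokenizer.eos_token:
    tokenizer.pad_token = tokenizer.unk_token

# Base model, loaded without device_map so it also runs on a CPU-only Space
base_model = AutoModelForCausalLM.from_pretrained(
    "microsoft/phi-2",
    trust_remote_code=True,
    low_cpu_mem_usage=True,
).to(device)

# Attach the fine-tuned LoRA adapter if it exists; otherwise keep the base model
model_path = "./final_model"
model = base_model
if os.path.exists(model_path):
    try:
        model = PeftModel.from_pretrained(base_model, model_path, device_map=None).to(device)
    except Exception:
        model = base_model  # fall back silently, matching the post-commit behavior
model.eval()

st.session_state.model = model
st.session_state.tokenizer = tokenizer
st.session_state.model_loaded = True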
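A hypothetical call site for generate_response, following the Human:/Assistant: prompt convention implied by the stop words and the "Assistant:" prefix stripping in the diff (the prompt text here is only an example):

# Example only: prompt wording and question are illustrative, not from the app
prompt = "Human: What can you help me with?\nAssistant:"
reply = generate_response(
    st.session_state.model,
    st.session_state.tokenizer,
    prompt,
)
st.write(reply)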