Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -52,12 +52,12 @@ print(f"Base model device map: {base_model.hf_device_map}") # See what accelerat
|
|
| 52 |
print(f"Loading adapter: {ADAPTER_MODEL_ID}")
|
| 53 |
try:
|
| 54 |
# Load the PEFT model.
|
| 55 |
-
#
|
| 56 |
-
#
|
| 57 |
-
# if the base model is already configured.
|
| 58 |
model = PeftModel.from_pretrained(
|
| 59 |
base_model,
|
| 60 |
ADAPTER_MODEL_ID,
|
|
|
|
| 61 |
# adapter_name="default" # Default adapter name
|
| 62 |
)
|
| 63 |
model.eval()
|
|
@@ -72,7 +72,7 @@ except Exception as e:
|
|
| 72 |
raise RuntimeError(f"Failed to load LoRA adapter: {e}")
|
| 73 |
|
| 74 |
|
| 75 |
-
# --- Chat Logic
|
| 76 |
def respond(
|
| 77 |
message: str,
|
| 78 |
history: list[tuple[str | None, str | None]],
|
|
@@ -124,11 +124,6 @@ def respond(
|
|
| 124 |
print(f"Formatted prompt for model:\n{prompt_for_model}")
|
| 125 |
print("------------------------------------")
|
| 126 |
|
| 127 |
-
# Determine the device for inputs. If device_map is used, model might be on multiple devices or CPU.
|
| 128 |
-
# For simplicity, if model.device is available (not a complex map), use it. Otherwise, fallback to DEVICE.
|
| 129 |
-
# input_device = model.device if hasattr(model, 'device') and not isinstance(model.device, dict) else DEVICE
|
| 130 |
-
# However, with device_map="auto", inputs should generally be prepared for CPU, and accelerate handles movement.
|
| 131 |
-
# So, sending inputs to DEVICE (which is 'cpu' here) should be correct.
|
| 132 |
inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
|
| 133 |
|
| 134 |
eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
|
|
@@ -170,7 +165,7 @@ def respond(
|
|
| 170 |
current_response_chunk += char_token
|
| 171 |
yield current_response_chunk
|
| 172 |
|
| 173 |
-
# --- Gradio Interface
|
| 174 |
chatbot_ui = gr.ChatInterface(
|
| 175 |
fn=respond,
|
| 176 |
chatbot=gr.Chatbot(
|
|
|
|
| 52 |
print(f"Loading adapter: {ADAPTER_MODEL_ID}")
|
| 53 |
try:
|
| 54 |
# Load the PEFT model.
|
| 55 |
+
# Pass offload_folder here as well, as PeftModel's internal dispatching
|
| 56 |
+
# might need it if accelerate decides to offload parts of the combined model.
|
|
|
|
| 57 |
model = PeftModel.from_pretrained(
|
| 58 |
base_model,
|
| 59 |
ADAPTER_MODEL_ID,
|
| 60 |
+
offload_folder=OFFLOAD_FOLDER, # <--- FIX APPLIED HERE
|
| 61 |
# adapter_name="default" # Default adapter name
|
| 62 |
)
|
| 63 |
model.eval()
|
|
|
|
| 72 |
raise RuntimeError(f"Failed to load LoRA adapter: {e}")
|
| 73 |
|
| 74 |
|
| 75 |
+
# --- Chat Logic ---
|
| 76 |
def respond(
|
| 77 |
message: str,
|
| 78 |
history: list[tuple[str | None, str | None]],
|
|
|
|
| 124 |
print(f"Formatted prompt for model:\n{prompt_for_model}")
|
| 125 |
print("------------------------------------")
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
|
| 128 |
|
| 129 |
eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
|
|
|
|
| 165 |
current_response_chunk += char_token
|
| 166 |
yield current_response_chunk
|
| 167 |
|
| 168 |
+
# --- Gradio Interface ---
|
| 169 |
chatbot_ui = gr.ChatInterface(
|
| 170 |
fn=respond,
|
| 171 |
chatbot=gr.Chatbot(
|