Spaces:

aaurelions
/

word_keeper

Sleeping

App Files Files Community

aaurelions committed on May 7, 2025

Commit

de55550

verified ·

1 Parent(s): f836522

Update app.py

Browse files

Files changed (1) hide show

app.py +5 -10

app.py CHANGED Viewed

@@ -52,12 +52,12 @@ print(f"Base model device map: {base_model.hf_device_map}") # See what accelerat
 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
     # Load the PEFT model.
-    # It should respect the base_model's device_map and offload_folder settings.
-    # No need to pass device_map or offload_folder to PeftModel directly
-    # if the base model is already configured.
     model = PeftModel.from_pretrained(
         base_model,
         ADAPTER_MODEL_ID,
         # adapter_name="default" # Default adapter name
     )
     model.eval()
@@ -72,7 +72,7 @@ except Exception as e:
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
-# --- Chat Logic (remains the same as your last full version) ---
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
@@ -124,11 +124,6 @@ def respond(
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
-    # Determine the device for inputs. If device_map is used, model might be on multiple devices or CPU.
-    # For simplicity, if model.device is available (not a complex map), use it. Otherwise, fallback to DEVICE.
-    # input_device = model.device if hasattr(model, 'device') and not isinstance(model.device, dict) else DEVICE
-    # However, with device_map="auto", inputs should generally be prepared for CPU, and accelerate handles movement.
-    # So, sending inputs to DEVICE (which is 'cpu' here) should be correct.
     inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
@@ -170,7 +165,7 @@ def respond(
                 current_response_chunk += char_token
                 yield current_response_chunk
-# --- Gradio Interface (remains the same as your last full version) ---
 chatbot_ui = gr.ChatInterface(
     fn=respond,
     chatbot=gr.Chatbot(

 print(f"Loading adapter: {ADAPTER_MODEL_ID}")
 try:
     # Load the PEFT model.
+    # Pass offload_folder here as well, as PeftModel's internal dispatching
+    # might need it if accelerate decides to offload parts of the combined model.
     model = PeftModel.from_pretrained(
         base_model,
         ADAPTER_MODEL_ID,
+        offload_folder=OFFLOAD_FOLDER, # <--- FIX APPLIED HERE
         # adapter_name="default" # Default adapter name
     )
     model.eval()
     raise RuntimeError(f"Failed to load LoRA adapter: {e}")
+# --- Chat Logic ---
 def respond(
     message: str,
     history: list[tuple[str | None, str | None]],
     print(f"Formatted prompt for model:\n{prompt_for_model}")
     print("------------------------------------")
     inputs = tokenizer(prompt_for_model, return_tensors="pt", return_attention_mask=True).to(DEVICE)
     eos_token_id_for_generation = tokenizer.convert_tokens_to_ids("<|end|>")
                 current_response_chunk += char_token
                 yield current_response_chunk
+# --- Gradio Interface ---
 chatbot_ui = gr.ChatInterface(
     fn=respond,
     chatbot=gr.Chatbot(