joseAndres777
/

WazapSplitter-LLM

Text Generation

message-segmentation

Model card Files Files and versions

joseAndres777 committed on Sep 17, 2025

Commit

538a0aa

·

verified ·

1 Parent(s): 1e38b06

Update handler.py

Files changed (1) hide show

handler.py +11 -5

handler.py CHANGED Viewed

@@ -12,15 +12,20 @@ class EndpointHandler:
         """
         Initialize the handler with the model from the given path
         """
         # Load tokenizer
-        self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")
-        # Load base model - CRITICAL: Match training setup exactly
         base_model = AutoModelForCausalLM.from_pretrained(
-            "meta-llama/Llama-3.3-70B-Instruct",
             torch_dtype=torch.float16,
-            device_map="auto",  # Use auto for compatibility
-            trust_remote_code=True
         )
         # Load LoRA adapters - use force download to ensure fresh state
@@ -30,6 +35,7 @@ class EndpointHandler:
                 path,
                 is_trainable=False  # Inference mode
             )
         except Exception as e:
             print(f"Error loading adapter: {e}")
             # Fallback: try without adapter (base model only)

         """
         Initialize the handler with the model from the given path
         """
+        # Use original model that matches the trained adapter
+        model_name = "meta-llama/Llama-3.3-70B-Instruct"
         # Load tokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # Load base model with memory optimization
         base_model = AutoModelForCausalLM.from_pretrained(
+            model_name,
             torch_dtype=torch.float16,
+            device_map="auto",
+            trust_remote_code=True,
+            load_in_8bit=True,  # 8-bit quantization to save memory
+            low_cpu_mem_usage=True
         )
         # Load LoRA adapters - use force download to ensure fresh state
                 path,
                 is_trainable=False  # Inference mode
             )
+            print("Successfully loaded adapter with base model")
         except Exception as e:
             print(f"Error loading adapter: {e}")
             # Fallback: try without adapter (base model only)