joseAndres777
/

WazapSplitter-LLM

Text Generation

message-segmentation

Model card Files Files and versions

joseAndres777 committed on Sep 16, 2025

Commit

c4c2bd8

·

verified ·

1 Parent(s): 0a3560b

Update handler.py

Files changed (1) hide show

handler.py +16 -5

handler.py CHANGED Viewed

@@ -11,15 +11,26 @@ class EndpointHandler:
         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")
-        # Load base model
         base_model = AutoModelForCausalLM.from_pretrained(
             "meta-llama/Llama-3.3-70B-Instruct",
             torch_dtype=torch.float16,
-            device_map="auto"
         )
-        # Load LoRA adapters from the current path
-        self.model = PeftModel.from_pretrained(base_model, path)
         # Load chat template
         try:

         # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.3-70B-Instruct")
+        # Load base model - CRITICAL: Match training setup exactly
         base_model = AutoModelForCausalLM.from_pretrained(
             "meta-llama/Llama-3.3-70B-Instruct",
             torch_dtype=torch.float16,
+            device_map="auto",  # Use auto for compatibility
+            trust_remote_code=True
         )
+        # Load LoRA adapters from `path` in inference mode (no forced re-download is requested here)
+        try:
+            self.model = PeftModel.from_pretrained(
+                base_model,
+                path,
+                is_trainable=False  # Inference mode
+            )
+        except Exception as e:
+            print(f"Error loading adapter: {e}")
+            # Fallback: try without adapter (base model only)
+            print("Falling back to base model without adapter")
+            self.model = base_model
         # Load chat template
         try: