Reyad-Ahmmed commited on
Commit
9f20041
·
verified ·
1 Parent(s): 5add132

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +19 -3
handler.py CHANGED
@@ -6,6 +6,7 @@ import torch
6
  import time
7
 
8
  import os
 
9
  #model_dir2 = os.path.abspath("json_extraction_all")
10
  model_dir2 = "Reyad-Ahmmed/getvars-generic"
11
 
@@ -23,8 +24,23 @@ class EndpointHandler:
23
  """
24
  model_name = model_dir2 #"./json_extraction_all" # Pretrained model for sentiment analysis
25
  self.tokenizer = T5Tokenizer.from_pretrained(model_name)
26
- self.model = T5ForConditionalGeneration.from_pretrained(model_name)
27
- self.model.eval() # Set model to evaluation mode (no training)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  print(f"Loaded model: {model_name}")
29
 
30
  def __call__(self, inputs):
@@ -53,7 +69,7 @@ class EndpointHandler:
53
  start_time = time.time()
54
 
55
  # Perform inference
56
- with torch.no_grad():
57
  output_ids = self.model.generate(input_ids, max_length=100, temperature=0.3)
58
 
59
  json_output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
 
6
  import time
7
 
8
  import os
9
+ from transformers import BitsAndBytesConfig
10
  #model_dir2 = os.path.abspath("json_extraction_all")
11
  model_dir2 = "Reyad-Ahmmed/getvars-generic"
12
 
 
24
  """
25
  model_name = model_dir2 #"./json_extraction_all" # Pretrained model for sentiment analysis
26
  self.tokenizer = T5Tokenizer.from_pretrained(model_name)
27
+
28
+ #self.model = T5ForConditionalGeneration.from_pretrained(model_name)
29
+ #self.model.eval() # Set model to evaluation mode (no training)
30
+
31
+ quantization_config = BitsAndBytesConfig(
32
+ load_in_4bit=True,
33
+ bnb_4bit_compute_dtype=torch.float16, # Match input dtype for faster inference
34
+ bnb_4bit_use_double_quant=True # Optional: Improves quantization efficiency
35
+ )
36
+
37
+ # Load quantized model
38
+ self.model = T5ForConditionalGeneration.from_pretrained(
39
+ model_name,  # FIXME(review): committed code says `model_output_path`, which is undefined here (NameError at load); `model_name` (set above from model_dir2) is the intended checkpoint
40
+ quantization_config=quantization_config,
41
+ device_map="auto" # Automatically uses GPU if available
42
+ )
43
+
44
  print(f"Loaded model: {model_name}")
45
 
46
  def __call__(self, inputs):
 
69
  start_time = time.time()
70
 
71
  # Perform inference
72
+ with torch.inference_mode():
73
output_ids = self.model.generate(input_ids, max_length=100, temperature=0.3)  # NOTE(review): temperature is ignored unless do_sample=True — confirm whether sampling was intended
74
 
75
  json_output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)