import json
from datetime import datetime
from transformers import T5ForConditionalGeneration, T5Tokenizer
from transformers import TrainingArguments, Trainer
import torch
import time
import os
from transformers import BitsAndBytesConfig

# model_dir2 = os.path.abspath("json_extraction_all")
model_dir2 = "Reyad-Ahmmed/getvars-generic"


class EndpointHandler:
    """Inference-endpoint wrapper around a pretrained T5 seq2seq model.

    Loads the model/tokenizer once at construction and exposes ``__call__``
    for single-request inference. The model is expected to emit JSON text,
    which is parsed when possible and returned raw otherwise.
    """

    def __init__(self, model_dir):
        # NOTE(review): the incoming ``model_dir`` argument is ignored in
        # favor of the module-level ``model_dir2`` — preserved as-is for
        # backward compatibility, but confirm this is intentional.
        self.model_dir = model_dir2
        self.model = None
        self.tokenizer = None
        self.load()  # Ensure model loads on initialization

    def load(self):
        """Load the T5 tokenizer and model (float16, GPU when available)."""
        model_name = model_dir2  # Hub id / path of the pretrained model
        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
        # float16 halves memory use and speeds up inference on GPU;
        # device_map="auto" places the model on GPU if present, else CPU.
        self.model = T5ForConditionalGeneration.from_pretrained(
            model_name,
            torch_dtype=torch.float16,
            device_map="auto",
        )
        self.model.eval()  # Inference only — disables dropout etc.

        # Report where the model actually landed (e.g. 'cuda:0' or 'cpu').
        device = next(self.model.parameters()).device
        print(f"Model is loaded on: {device}")
        print(f"Loaded model: {model_name}")

    def __call__(self, inputs):
        """Run inference on user-supplied text.

        Args:
            inputs: either ``{"inputs": "text"}`` or a non-empty list whose
                first element is the text.

        Returns:
            The model output parsed as JSON when valid, the raw decoded
            string otherwise, or an ``{"error": ...}`` dict on failure.
        """
        try:
            if self.tokenizer is None or self.model is None:
                raise ValueError("Model and tokenizer were not loaded properly.")

            # Handle different input formats
            if isinstance(inputs, list) and len(inputs) > 0:
                user_text = inputs[0]
            elif isinstance(inputs, dict) and "inputs" in inputs:
                user_text = inputs["inputs"]
            else:
                return {"error": "Invalid input format. Expected {'inputs': 'your text'} or ['your text']."}

            # Move inputs to the same device as the model. (The original
            # hard-coded "cuda", which crashes on CPU-only hosts even though
            # device_map="auto" allows a CPU placement.)
            device = next(self.model.parameters()).device
            input_ids = self.tokenizer(user_text, return_tensors="pt").input_ids.to(device)

            # Measure inference time
            start_time = time.time()
            with torch.inference_mode():
                output_ids = self.model.generate(input_ids, max_length=100, temperature=0.3)
                json_output = self.tokenizer.decode(output_ids[0], skip_special_tokens=True)
            inference_time = time.time() - start_time
            print(f"Inference Time: {inference_time:.4f} seconds")

            # Prefer structured JSON; fall back to the raw string when the
            # model output is not valid JSON (narrowed from a bare except).
            try:
                return json.loads(json_output)
            except json.JSONDecodeError:
                return json_output
        except Exception as e:
            # Endpoint boundary: report the failure rather than propagate.
            return {"error": f"Unexpected error: {str(e)}"}