Create handler.py
handler.py ADDED (+58 -0)
@@ -0,0 +1,58 @@
# handler.py
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from typing import Dict, List, Any


class EndpointHandler:
    def __init__(self, path: str = ""):
        """
        Initialize the model and tokenizer.

        :param path: Path to the model repository (not used directly since we load from the Hugging Face Hub).
        """
        # Define the base model and adapter model names
        self.base_model_name = "mistralai/Mistral-7B-Instruct-v0.3"
        self.adapter_model_name = "Danna8/MistralF"

        # Load the tokenizer (from the adapter repo so any added tokens are picked up)
        self.tokenizer = AutoTokenizer.from_pretrained(self.adapter_model_name)

        # Load the base model with optimizations
        self.model = AutoModelForCausalLM.from_pretrained(
            self.base_model_name,
            torch_dtype=torch.float16,  # Use FP16 for efficiency
            device_map="auto",          # Automatically map layers to the available GPU(s)
        )

        # Load the PEFT adapter on top of the base model (requires the peft package);
        # load_adapter registers it under the name "default" and activates it.
        self.model.load_adapter(self.adapter_model_name)
        self.model.set_adapter("default")  # Make the active adapter explicit; adjust the name if needed

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Handle inference requests.

        :param data: Input data containing the text to process.
        :return: List of generated outputs.
        """
        # Extract the input text from the request
        inputs = data.get("inputs", "")
        if not inputs:
            return [{"error": "No input provided"}]

        # Tokenize the input and move it to the same device as the model
        tokenized_inputs = self.tokenizer(inputs, return_tensors="pt").to(self.model.device)

        # Generate output
        outputs = self.model.generate(
            **tokenized_inputs,
            max_new_tokens=50,
            do_sample=True,
            top_p=0.95,
            temperature=0.7,
            pad_token_id=self.tokenizer.eos_token_id,  # Ensure proper padding
        )

        # Decode the output
        generated_text = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Return the result in the expected format
        return [{"generated_text": generated_text}]
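A quick way to sanity-check the handler outside the endpoint is to instantiate it directly and pass a payload with the same shape the __call__ method expects (a dict with an "inputs" key). The snippet below is only an illustrative local smoke test, not part of this commit; it assumes a GPU with enough memory for the 7B base model and that the peft package is available (for example via an accompanying requirements.txt).

# local_test.py (hypothetical, not part of this commit)
from handler import EndpointHandler

handler = EndpointHandler()
response = handler({"inputs": "Summarize why adapters are useful for fine-tuning."})
print(response)  # Expected shape: [{"generated_text": "..."}]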