khaledsayed1
/

llama_QA

@@ -1,45 +1,92 @@
 import torch
 from transformers import AutoModelForCausalLM, AutoTokenizer
# Load the model and tokenizer from the Hugging Face Hub once, at import time.
model_name = "khaledsayed1/llama_QA"  # Replace with your actual model name
# Prefer the GPU, but fall back to CPU so importing this module does not crash
# on hosts where CUDA is unavailable.
model = AutoModelForCausalLM.from_pretrained(model_name).to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)
 def predict(input_data):
     """
-    Process the input data and generate an answer from the model.
-    Args:
-        input_data (dict): The input question.
-    Returns:
-        dict: The model's generated answer.
-    """
-    question = input_data.get('question', '')
-    if not question:
-        return {"error": "No question provided."}
-    # Define the prompt with the user's question
-    formatted_prompt = f"""
-    السؤال: {question}
-    الإجابة:
     """
-    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")  # Move inputs to GPU
-    try:
-        # Generate the output using the model
-        outputs = model.generate(
-            **inputs,
-            max_new_tokens=128,
-            temperature=0.7,
-            top_k=50,
-            top_p=0.95,
-        )
-        decoded_output = tokenizer.batch_decode(outputs, skip_special_tokens=True)
-        # Clean up the output and remove the question itself
-        clean_output = decoded_output[0].replace("السؤال:", "").replace("الإجابة:", "").strip()
-        return {"answer": clean_output}
-    except Exception as e:
-        return {"error": str(e)}

 import torch
+import os
 from transformers import AutoModelForCausalLM, AutoTokenizer
class ModelHandler:
    """Lazily load a causal language model and answer questions with it.

    Constructing the handler only records the target device; the model and
    tokenizer are loaded on the first call to ``initialize()`` or
    ``predict()`` from the directory that contains this file.
    """

    def __init__(self):
        # "cuda" when a GPU is visible to torch, otherwise "cpu".
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = None
        self.tokenizer = None
        self.initialized = False

    def initialize(self):
        """Load the model and tokenizer once; later calls are no-ops.

        Raises:
            RuntimeError: If loading fails. The original exception is chained
                as ``__cause__``.
        """
        if self.initialized:
            return
        try:
            # Load model and tokenizer from the directory holding this file.
            model_path = os.path.dirname(os.path.abspath(__file__))
            model = AutoModelForCausalLM.from_pretrained(
                model_path,
                device_map="auto",
                # Half precision on GPU (the original targets a T4); full
                # precision on CPU, where float16 kernels are not reliably
                # available.
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
            )
            tokenizer = AutoTokenizer.from_pretrained(model_path)
        except Exception as e:
            raise RuntimeError(f"Error initializing model: {str(e)}") from e
        # Publish both objects only after both loads succeeded, so a failed
        # attempt never leaves the handler half-initialized.
        self.model = model
        self.tokenizer = tokenizer
        self.initialized = True

    def predict(self, input_data):
        """
        Process the input data and generate an answer from the model.

        Args:
            input_data (dict): Request payload; the question is read from its
                ``'question'`` key.

        Returns:
            dict: ``{"answer": <text>}`` on success, or ``{"error": <message>}``
            when no question was supplied or prediction failed.

        Raises:
            RuntimeError: If the model has not been loaded yet and loading
                fails (propagated from ``initialize()``).
        """
        if not self.initialized:
            self.initialize()
        try:
            # Extract the question from input_data
            question = input_data.get('question', '')
            # A question made only of whitespace carries no content.
            if isinstance(question, str):
                question = question.strip()
            if not question:
                return {"error": "No question provided."}

            # Same prompt text as before: question line, newline, then the
            # answer label preceded by 12 spaces.
            # NOTE(review): those 12 spaces come from source indentation of the
            # original triple-quoted string; confirm they match the format the
            # model was fine-tuned on.
            formatted_prompt = "\n".join(
                (f"السؤال: {question}", " " * 12 + "الإجابة:")
            )

            # Tokenize and place the tensors on the device the model lives on
            # (device_map="auto" decides that, not self.device).
            inputs = self.tokenizer([formatted_prompt], return_tensors="pt")
            inputs = inputs.to(self.model.device)
            prompt_length = inputs["input_ids"].shape[1]

            try:
                with torch.no_grad():
                    outputs = self.model.generate(
                        **inputs,
                        max_new_tokens=128,
                        # temperature/top_k/top_p only take effect when
                        # sampling is enabled.
                        do_sample=True,
                        temperature=0.7,
                        top_k=50,
                        top_p=0.95,
                        use_cache=True,
                        pad_token_id=self.tokenizer.eos_token_id,
                    )
            finally:
                # Release cached GPU memory whether or not generation succeeded.
                if self.device == "cuda":
                    torch.cuda.empty_cache()

            # generate() returns prompt + continuation for causal LMs; decode
            # only the continuation so the question is not echoed back.
            answer_ids = outputs[0][prompt_length:]
            decoded_output = self.tokenizer.decode(answer_ids, skip_special_tokens=True)
            # Drop any prompt labels the model may have produced again.
            clean_output = decoded_output.replace("السؤال:", "").replace("الإجابة:", "").strip()
            return {"answer": clean_output}
        except Exception as e:
            return {"error": f"Prediction error: {str(e)}"}
# Single module-level handler reused by every predict() call. Constructing it
# does not load the model; loading happens on the first prediction.
handler = ModelHandler()


def predict(input_data):
    """
    Module-level entry point that forwards to the shared handler.

    Args:
        input_data (dict): Request payload passed through unchanged.

    Returns:
        dict: Whatever ``handler.predict`` returns for this payload.
    """
    result = handler.predict(input_data)
    return result