Create handler.py
handler.py
ADDED (+198 -0)
@@ -0,0 +1,198 @@
import torch
import base64
from io import BytesIO
from typing import Dict, List, Any, Optional, Union
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq, BitsAndBytesConfig


class EndpointHandler():
    def __init__(self, path=""):
        """
        Initialize the model handler.

        Args:
            path: Path to the model weights (provided by HF Endpoints)
        """
        # 4-bit quantization configuration (NF4 with double quantization)
        self.bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_use_double_quant=True,
            bnb_4bit_compute_dtype=torch.float16,
        )

        # Load the processor and the model
        self.processor = AutoProcessor.from_pretrained(
            path,
            trust_remote_code=True
        )

        self.model = AutoModelForVision2Seq.from_pretrained(
            path,
            quantization_config=self.bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16
        )

        # Default generation configuration
        self.default_generation_config = {
            "max_new_tokens": 512,
            "temperature": 0.7,
            "top_p": 0.9,
            "do_sample": True,
            "repetition_penalty": 1.1
        }
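
    # Rough memory arithmetic for this configuration (illustrative, not
    # measured): a 7B-parameter model at 4 bits per weight needs about
    # 7e9 * 0.5 bytes ≈ 3.5 GB for the weights, versus ≈ 14 GB in float16;
    # activations and the KV cache come on top of that.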
    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
        """
        Process the incoming request.

        Args:
            data: Dictionary containing:
                - inputs (str or dict): Text prompt or structured input
                - image (str, optional): Base64 encoded image
                - parameters (dict, optional): Generation parameters

        Returns:
            List containing the response dictionary
        """
        # Extract the input data
        inputs = data.get("inputs", "")
        image_data = data.get("image", None)
        parameters = data.get("parameters", {})

        # Merge the generation parameters (request overrides defaults)
        generation_config = {**self.default_generation_config, **parameters}

        try:
            # Dispatch on the input type
            if isinstance(inputs, str):
                # Plain text input
                response = self._process_text(inputs, generation_config)
            elif isinstance(inputs, dict):
                # Structured input (possibly carrying an image)
                text = inputs.get("text", "")
                image_b64 = inputs.get("image", image_data)

                if image_b64:
                    response = self._process_multimodal(text, image_b64, generation_config)
                else:
                    response = self._process_text(text, generation_config)
            else:
                # Image supplied separately from the inputs field
                if image_data:
                    response = self._process_multimodal(str(inputs), image_data, generation_config)
                else:
                    response = self._process_text(str(inputs), generation_config)

            return [{"generated_text": response}]

        except Exception as e:
            return [{"error": str(e)}]
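
    # Example request body accepted by __call__ (illustrative values only):
    # {
    #     "inputs": {"text": "Describe this figure.", "image": "<base64-encoded PNG>"},
    #     "parameters": {"max_new_tokens": 256, "temperature": 0.2}
    # }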
    def _process_text(self, text: str, generation_config: dict) -> str:
        """
        Process text-only input.
        """
        # Build the messages with an optimized system prompt
        messages = [
            {"role": "system", "content": "You are an expert assistant in mathematics and sciences. Provide clear, precise, and pedagogical answers. For each problem, explain your reasoning step by step, justify your choices, and illustrate with examples when necessary. Adopt an accessible yet rigorous style."},
            {"role": "user", "content": [
                {"type": "text", "text": text}
            ]}
        ]

        # Prepare the input
        text_inputs = self.processor.apply_chat_template(
            messages,
            tokenize=True,
            add_generation_prompt=True,
            return_tensors="pt"
        )

        # Move to the model's device
        text_inputs = text_inputs.to(self.model.device)

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                text_inputs,
                **generation_config,
                pad_token_id=self.processor.tokenizer.eos_token_id,
                eos_token_id=self.processor.tokenizer.eos_token_id
            )

        # Decode
        response = self.processor.decode(outputs[0], skip_special_tokens=True)

        # Extract the answer (strip the prompt)
        if "assistant" in response:
            response = response.split("assistant")[-1].strip()

        return response
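
    # Note: splitting on the literal string "assistant" is fragile if that word
    # appears in the answer itself. A sketch of a more robust variant, which
    # decodes only the newly generated tokens (text_inputs holds the prompt ids):
    #     new_tokens = outputs[0][text_inputs.shape[-1]:]
    #     response = self.processor.decode(new_tokens, skip_special_tokens=True)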
    def _process_multimodal(self, text: str, image_b64: str, generation_config: dict) -> str:
        """
        Process combined text and image input.
        """
        # Decode the base64 image
        try:
            image_bytes = base64.b64decode(image_b64)
            image = Image.open(BytesIO(image_bytes)).convert("RGB")
        except Exception as e:
            raise ValueError(f"Error while decoding the image: {str(e)}")

        # Build the multimodal messages
        messages = [
            {"role": "system", "content": "You are an expert assistant in mathematics and sciences with multimodal reasoning capabilities. Provide clear, precise, and pedagogical answers. For each problem, explain your reasoning step by step, justify your choices, and illustrate with examples, diagrams, or visual aids when necessary. Analyze both textual and visual information carefully, and present your explanations in an accessible yet rigorous style."},
            {"role": "user", "content": [
                {"type": "text", "text": text},
                {"type": "image"}
            ]}
        ]

        # Prepare the prompt
        prompt = self.processor.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=False
        )

        # Run the processor on the prompt together with the image
        inputs = self.processor(
            text=prompt,
            images=[image],
            return_tensors="pt"
        )

        # Move to the model's device
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                **generation_config,
                pad_token_id=self.processor.tokenizer.eos_token_id,
                eos_token_id=self.processor.tokenizer.eos_token_id
            )

        # Decode
        response = self.processor.decode(outputs[0], skip_special_tokens=True)

        # Extract the answer
        if "assistant" in response:
            response = response.split("assistant")[-1].strip()

        return response
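
    # Producing the image field on the client side (illustrative sketch):
    #     with open("figure.png", "rb") as f:
    #         image_b64 = base64.b64encode(f.read()).decode("utf-8")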
    def health(self) -> Dict[str, Any]:
        """
        Health check endpoint.
        """
        return {
            "status": "healthy",
            "model": "QwenStem-7b",
            "device": str(self.model.device),
            "quantization": "4-bit"
        }
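

if __name__ == "__main__":
    # Minimal local smoke test (a sketch: the path below is a placeholder for
    # the model repository, not its actual id).
    handler = EndpointHandler(path="path/to/model")
    result = handler({
        "inputs": "What is the derivative of x^2?",
        "parameters": {"max_new_tokens": 64}
    })
    print(result)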