Create triton_inference/inference_engine.py
triton_inference/inference_engine.py (ADDED)

import logging
import os
from typing import Optional

import numpy as np
# Use the asyncio-compatible gRPC client so the awaited calls below do not
# block the event loop (the public methods of this class are async).
import tritonclient.grpc.aio as grpcclient


class TritonInferenceEngine:
    """Async wrapper around a Triton Inference Server gRPC endpoint."""

    def __init__(self):
        self.triton_url = os.getenv("TRITON_URL", "localhost:8001")
        self.model_name = os.getenv("MODEL_NAME", "wizardlm-7b")
        self.client: Optional[grpcclient.InferenceServerClient] = None

    async def initialize(self):
        """Connect to Triton and verify the model is loaded and ready."""
        try:
            self.client = grpcclient.InferenceServerClient(
                url=self.triton_url,
                verbose=False,
            )
            if not await self.client.is_model_ready(self.model_name):
                raise RuntimeError(f"Model {self.model_name} is not ready")
            logging.info(f"Connected to Triton Inference Server at {self.triton_url}")
        except Exception as e:
            logging.error(f"Error connecting to Triton: {e}")
            raise

    async def generate(
        self,
        prompt: str,
        max_tokens: int = 100,
        temperature: float = 0.7,
        top_p: float = 0.9,
    ) -> str:
        # Preprocess the prompt. Here you would tokenize with the model's own
        # tokenizer; for simplicity we assume the model expects input_ids and
        # attention_mask tensors, as with a HuggingFace-exported model.
        # This is a simplified example; in production, adapt it to your model.
        inputs = self._prepare_inputs(prompt)

        # Build the Triton input tensors. Names, shapes, and datatypes must
        # match the model's config.pbtxt.
        triton_inputs = [
            grpcclient.InferInput("input_ids", inputs["input_ids"].shape, "INT64"),
            grpcclient.InferInput("attention_mask", inputs["attention_mask"].shape, "INT64"),
        ]
        triton_inputs[0].set_data_from_numpy(inputs["input_ids"])
        triton_inputs[1].set_data_from_numpy(inputs["attention_mask"])

        # Request the generated token ids as output.
        outputs = [grpcclient.InferRequestedOutput("output_ids")]

        # Run inference. Note that max_tokens, temperature, and top_p are not
        # forwarded yet; passing them through requires matching input tensors
        # in the model configuration.
        response = await self.client.infer(
            model_name=self.model_name,
            inputs=triton_inputs,
            outputs=outputs,
        )

        # Postprocess the response: decode the returned ids back to text.
        output_ids = response.as_numpy("output_ids")
        generated_text = self._decode_output(output_ids)

        return generated_text

    def _prepare_inputs(self, prompt: str):
        # Tokenize the prompt here using the WizardLM-7B tokenizer.
        # For now, return a dummy example; in production, load the tokenizer
        # and use it to encode the prompt.
        return {
            "input_ids": np.array([[1, 2, 3, 4, 5]], dtype=np.int64),
            "attention_mask": np.array([[1, 1, 1, 1, 1]], dtype=np.int64),
        }

    def _decode_output(self, output_ids):
        # Decode the output ids back to text.
        # For now, return a dummy string.
        return "This is a dummy response from the AI model."

    async def close(self):
        if self.client:
            await self.client.close()