Commit 81ff557
Parent(s): f2527c6

Went back to the approach of using transformers

Files changed:
- handler.py        +46 -118
- requirements.txt  +1 -1
handler.py CHANGED
@@ -1,125 +1,53 @@
-from transformers import pipeline
-import torch
-import base64
 from typing import Dict, List, Any
-import os
-import io
-from scipy.io import wavfile
-import tempfile
-import numpy as np
-
-# Model name (used as a fallback when 'path' is not provided)
-MODEL_NAME = "Qwen/Qwen3-Omni-30B-A3B-Instruct"
 
 class EndpointHandler():
-    def __init__(self, path=""):
-        model_kwargs = {
-            "torch_dtype": torch.bfloat16 if torch.cuda.is_available() else None,
-            "enable_audio_output": True  # Essential key for loading the Talker component (the speech generator) [4]
-        }
-
-        # 2. Load the generic text-generation pipeline (the wrapper for multimodal LLMs) [3]
-        self.pipeline = pipeline(
-            task="text-generation",
-            model=path or MODEL_NAME,
-            **model_kwargs  # Injects the Qwen3-specific parameters
-        )
-
-        # 3. System prompt required by Qwen3-Omni to generate natural-sounding audio [4]
-        self.system_prompt = (
-            "You are Qwen, a virtual human developed by the Qwen Team, Alibaba Group, "
-            "capable of perceiving auditory and visual inputs, as well as generating text and speech."
-        )
-
-        # 4. Sampling rate of the model (needed to serialize audio in __call__)
-        self.sampling_rate = getattr(self.pipeline.model.config, 'sampling_rate', 24000)
-
-    def _handle_audio_input(self, data: Dict[str, Any]) -> str:
-        """Decode the Base64 audio input and store it temporarily as a WAV file."""
-        audio_data_base64 = data.get("audio_data")
-        if not audio_data_base64:
-            return None
-
-        temp_file_path = None
-        try:
-            audio_bytes = base64.b64decode(audio_data_base64)
-            # Write to a temporary file so the pipeline can process it [5]
-            temp_file = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
-            temp_file.write(audio_bytes)
-            temp_file.close()
-            temp_file_path = temp_file.name
-            return temp_file_path
-        except Exception as e:
-            if temp_file_path and os.path.exists(temp_file_path):
-                os.remove(temp_file_path)
-            raise ValueError(f"Error decoding and saving the Base64 audio: {e}")
-
-    def _handle_audio_output(self, generated_audio: torch.Tensor, sampling_rate: int) -> str:
-        """Convert the output audio tensor to a WAV buffer and encode it as Base64."""
-        audio_array = generated_audio.cpu().numpy().squeeze()
-        if audio_array.dtype != np.float32:
-            audio_array = audio_array.astype(np.float32)
-
-        with io.BytesIO() as buffer:
-            # Write the array as WAV [2]
-            wavfile.write(buffer, rate=sampling_rate, data=audio_array)
-            buffer.seek(0)
-
-            # Encode to Base64 for the JSON response
-            encoded_audio = base64.b64encode(buffer.read()).decode('utf-8')
-            return encoded_audio
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
-        try:
-            generation_kwargs.update({
-                "max_new_tokens": generation_kwargs.get("max_new_tokens", 512),
-            })
-
-            # 4. Run the pipeline
-            raw_output = self.pipeline(inputs_list, **generation_kwargs)
-
-            # The pipeline returns a list of dictionaries; take the first result
-            response = raw_output[0]
-
-            final_response = {
-                "generated_text": response.get("generated_text"),
-                "audio_output": None
-            }
-
-            # 5. Post-processing (tensor -> Base64 WAV)
-            if "audio_array" in response:
-                encoded_audio = self._handle_audio_output(response["audio_array"], self.sampling_rate)
-                final_response["audio_output"] = encoded_audio
-
-            return [final_response]
-
-        except Exception as e:
-            # Error handling
-            return [{"error": str(e)}]
 from typing import Dict, List, Any
+import soundfile as sf
+from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
+from qwen_omni_utils import process_mm_info
 
 class EndpointHandler():
+    def __init__(self, path="./"):
+        self.model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
+            path,
+            dtype="auto",
+            device_map="auto",
+        )
+        self.processor = Qwen3OmniMoeProcessor.from_pretrained(path)
 
     def __call__(self, data: Dict[str, Any]) -> List[Dict[str, Any]]:
+        messages = data.get("messages", [])
+        use_audio_in_video = data.get("use_audio_in_video", True)
+        speaker = data.get("speaker", "Ethan")
+
+        text = self.processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True,
+        )
+        audios, images, videos = process_mm_info(messages, use_audio_in_video=use_audio_in_video)
+        inputs = self.processor(
+            text=text,
+            audio=audios,
+            images=images,
+            videos=videos,
+            return_tensors="pt",
+            padding=True,
+            use_audio_in_video=use_audio_in_video
+        )
+        inputs = inputs.to(self.model.device).to(self.model.dtype)
+
+        text_ids, audio = self.model.generate(
+            **inputs,
+            speaker=speaker,
+            thinker_return_dict_in_generate=True,
+            use_audio_in_video=use_audio_in_video
+        )
+        text_output = self.processor.batch_decode(
+            text_ids.sequences[:, inputs["input_ids"].shape[1]:],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False
+        )
+        result = {"generated_text": text_output}
+        if audio is not None:
+            # Write the generated audio to a WAV file and return its path
+            sf.write("output.wav", audio.reshape(-1).detach().cpu().numpy(), samplerate=24000)
+            result["audio_path"] = "output.wav"
+        return [result]
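For completeness, a minimal sketch of how the new handler might be exercised locally. The "messages" schema follows the qwen-omni-utils content conventions ({"type": "audio", "audio": <url>} entries); the audio URL is a placeholder, and running this requires the model weights to be present at the path passed to EndpointHandler:

from handler import EndpointHandler

handler = EndpointHandler(path="./")

request = {
    "messages": [
        {
            "role": "user",
            "content": [
                {"type": "audio", "audio": "https://<some-host>/question.wav"},  # placeholder URL
                {"type": "text", "text": "Transcribe the audio, then answer it."},
            ],
        }
    ],
    "speaker": "Ethan",
    "use_audio_in_video": True,
}

output = handler(request)[0]
print(output["generated_text"])   # list of decoded strings, one per batch element
print(output.get("audio_path"))   # "output.wav" when the Talker produced speech

Note that "generated_text" is the list returned by batch_decode, and the audio, when produced, is written next to the handler as output.wav rather than being serialized into the response.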
requirements.txt CHANGED
@@ -1,5 +1,5 @@
 soundfile
-transformers
+transformers
 torch
 qwen-omni-utils
 torchvision