File size: 5,803 Bytes
899a643
43626b5
899a643
 
 
 
 
 
 
 
 
 
 
43626b5
899a643
 
 
 
43626b5
 
 
 
cf795d0
43626b5
cf795d0
43626b5
cf795d0
 
 
 
43626b5
 
cf795d0
43626b5
 
 
899a643
 
 
 
 
 
 
cf795d0
899a643
 
 
 
43626b5
899a643
 
 
 
 
 
43626b5
 
 
cf795d0
 
 
 
 
 
 
 
 
43626b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
899a643
43626b5
 
 
899a643
43626b5
cf795d0
 
899a643
 
 
 
43626b5
 
 
899a643
 
 
43626b5
 
 
cf795d0
 
 
 
43626b5
 
 
cf795d0
899a643
cf795d0
 
 
 
43626b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cf795d0
43626b5
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
"""
Handler final pour Kyutai TTS - Compatible HF Endpoints
"""

import base64
import io
import struct
from typing import Any, Dict

import numpy as np
import torch

class EndpointHandler:
    """TTS handler for Hugging Face Inference Endpoints.

    Attempts to load the Kyutai TTS model through ``moshi``; if loading
    fails for any reason (missing package, download error, ...), the
    handler switches to a fallback mode that produces a simple synthetic
    placeholder waveform, keeping the endpoint responsive instead of
    crashing at startup.
    """

    def __init__(self, path=""):
        """Initialize the handler with a simple audio fallback.

        Args:
            path: Model path supplied by the endpoint runtime. Unused
                here — the model is fetched by ``repo_id`` instead.
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"🔧 Initialisation sur {self.device}")

        # Output audio configuration.
        self.sample_rate = 24000
        self.model_loaded = False

        try:
            # Try to load the real Kyutai TTS model via moshi.
            from moshi.models import loaders
            print("📥 Tentative de chargement avec moshi...")
            # NOTE(review): assumes the installed moshi version exposes
            # loaders.get_pretrained_lm_model — confirm against its API.
            self.lm_model = loaders.get_pretrained_lm_model(
                device=self.device,
                repo_id="kyutai/tts-1.6b-en_fr"
            )
            self.model_loaded = True
            print("✅ Modèle Kyutai chargé avec succès!")
        except Exception as e:
            # Any load failure flips the handler into fallback mode
            # rather than taking the whole endpoint down.
            print(f"⚠️ Impossible de charger Kyutai TTS: {e}")
            print("🔄 Mode fallback activé - génération audio basique")
            self.model_loaded = False

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Process one TTS request.

        Args:
            data: Request payload. ``inputs`` holds the text to
                synthesize; optional ``parameters`` may carry
                ``language`` ("fr", "en", or "auto" for detection).

        Returns:
            Dict with base64-encoded WAV ``audio``, ``sampling_rate``,
            ``duration`` in seconds, ``model_loaded``, and ``language``.
            On synthesis failure, 0.5 s of silence is returned along
            with an ``error`` message instead of raising.

        Raises:
            ValueError: If ``inputs`` is missing or empty.
        """
        text = data.get("inputs", "")
        if not text:
            raise ValueError("Le paramètre 'inputs' est requis")

        params = data.get("parameters", {})
        language = params.get("language", "auto")

        # Naive language detection: any French accented character
        # selects French, otherwise English.
        if language == "auto":
            fr_chars = set("àâäéèêëïîôùûçœ")
            lowered = text.lower()  # hoisted: was recomputed per character
            language = "fr" if any(c in lowered for c in fr_chars) else "en"

        try:
            if self.model_loaded:
                # Use the real Kyutai model.
                print(f"🎤 Synthèse Kyutai TTS: {len(text)} caractères en {language}")
                with torch.no_grad():
                    audio_tensor = self.lm_model.synthesize(
                        text=text,
                        language=language,
                        speaker_id=0,
                        speed=1.0
                    )
                audio_np = audio_tensor.cpu().numpy()
            else:
                # Fallback: generate a synthetic placeholder signal.
                audio_np = self._fallback_audio(text)

            # Normalize to a 0.8 peak (leaves headroom, avoids clipping).
            peak = np.max(np.abs(audio_np))
            if peak > 0:
                audio_np = audio_np / peak * 0.8

            # Serialize to WAV and base64-encode for the JSON response.
            audio_bytes = self.numpy_to_wav(audio_np, self.sample_rate)
            audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')

            return {
                "audio": audio_base64,
                "sampling_rate": self.sample_rate,
                "duration": len(audio_np) / self.sample_rate,
                "model_loaded": self.model_loaded,
                "language": language
            }

        except Exception as e:
            print(f"❌ Erreur lors de la synthèse: {str(e)}")
            # Return a short silence instead of failing the request.
            silence = np.zeros(int(self.sample_rate * 0.5))  # 0.5 s of silence
            audio_bytes = self.numpy_to_wav(silence, self.sample_rate)
            return {
                "audio": base64.b64encode(audio_bytes).decode('utf-8'),
                "sampling_rate": self.sample_rate,
                "duration": 0.5,
                "error": str(e),
                "model_loaded": self.model_loaded
            }

    def _fallback_audio(self, text: str) -> np.ndarray:
        """Generate a placeholder "voice-like" waveform for *text*.

        Duration scales with text length (~60 ms per character, capped
        at 10 s); the signal is a sum of frequency-modulated sine
        harmonics under a decaying envelope.
        """
        print(f"🎵 Mode fallback: génération audio simple pour {len(text)} caractères")
        duration = min(len(text) * 0.06, 10.0)  # ~60 ms per character, max 10 s
        samples = int(self.sample_rate * duration)

        t = np.linspace(0, duration, samples)
        # Slowly modulated carrier frequencies to loosely mimic pitch motion.
        f1 = 200 + 50 * np.sin(2 * np.pi * 3 * t)
        f2 = 400 + 100 * np.sin(2 * np.pi * 2 * t)

        # Combine a few harmonics.
        audio = 0.3 * np.sin(2 * np.pi * f1 * t)
        audio += 0.2 * np.sin(2 * np.pi * f2 * t)
        audio += 0.1 * np.sin(2 * np.pi * 800 * t)

        # Decaying envelope for a slightly more natural feel.
        audio *= np.exp(-t / duration * 2)
        return audio

    def numpy_to_wav(self, audio_np: np.ndarray, sample_rate: int) -> bytes:
        """Serialize a float waveform (expected in [-1, 1]) to 16-bit PCM WAV bytes."""
        # Ensure audio is 1D.
        if audio_np.ndim > 1:
            audio_np = audio_np.flatten()

        # Clip before casting: values outside [-1, 1] would otherwise
        # wrap around when converted to int16 (audible corruption).
        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

        # Canonical 44-byte RIFF/WAVE header for mono 16-bit PCM.
        num_samples = len(audio_int16)
        num_channels = 1
        bits_per_sample = 16
        byte_rate = sample_rate * num_channels * bits_per_sample // 8
        block_align = num_channels * bits_per_sample // 8

        wav_header = struct.pack(
            '<4sI4s4sIHHIIHH4sI',
            b'RIFF',
            36 + num_samples * 2,  # ChunkSize
            b'WAVE',
            b'fmt ',
            16,  # Subchunk1Size (PCM)
            1,   # AudioFormat (PCM)
            num_channels,
            sample_rate,
            byte_rate,
            block_align,
            bits_per_sample,
            b'data',
            num_samples * 2  # Subchunk2Size
        )

        # Combine header and audio payload.
        return wav_header + audio_int16.tobytes()