File size: 1,483 Bytes
0109f78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
# DocVoice.py
import torch
from transformers import pipeline

# -------------------
# 1️⃣ Detect GPU
# -------------------
# Probe for a CUDA device once; the device index, device name and tensor
# precision below are all derived from this single flag.
use_cuda = torch.cuda.is_available()
if use_cuda:
    # First GPU, half precision.
    device_index, device_str, dtype = 0, "cuda", torch.float16
else:
    # -1 is the index used for CPU; stay in full precision.
    device_index, device_str, dtype = -1, "cpu", torch.float32

# -------------------
# 2️⃣ Load TTS model from Hugging Face
# -------------------
# Hub checkpoint used for synthesis (example TTS model, English voice).
# NOTE(review): this is an ESPnet checkpoint -- confirm that the transformers
# "text-to-speech" pipeline can actually load it.
tts_model_id = "espnet/kan-bayashi_ljspeech_vits"

# Build the pipeline once, at import time, on the device and precision
# selected by the GPU detection step.
tts_pipe = pipeline(
    task="text-to-speech",
    model=tts_model_id,
    torch_dtype=dtype,
    device=device_index,
)

print("🔊 TTS pipeline ready using Hugging Face.")

# -------------------
# 3️⃣ TTS Helper Function
# -------------------
def text_to_speech(text: str, filename="assistant_response.wav"):
    """
    Generate speech from text, save it as a 16-bit PCM WAV file, then try to
    play it.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is ignored: nothing is generated, written, or played.
        filename: Path of the WAV file to write.

    Returns:
        None. The WAV file and the (optional) playback are side effects.
    """
    # Guard against None as well as blank strings so callers can pass an
    # optional response straight through.
    if not text or not text.strip():
        return

    print(f"📝 Generating audio for: {text}")

    import numpy as np
    import scipy.io.wavfile as wav

    # Generate audio
    result = tts_pipe(text)

    # Prefer the sampling rate reported alongside the audio; fall back to the
    # previously hard-coded 22050 Hz if the pipeline does not report one.
    sample_rate = int(result.get("sampling_rate") or 22050)

    # Normalize to a float32 array with singleton dimensions removed:
    #  - float16 output (half-precision model) loses precision when scaled
    #    and is not a sample format the audio backend accepts;
    #  - a (1, N) array would be read by the WAV writer as one frame with
    #    N channels rather than N mono samples.
    # NOTE(review): true multi-channel output is assumed to already be laid
    # out as (samples, channels) -- confirm if a stereo model is ever used.
    speech_array = np.atleast_1d(
        np.asarray(result["audio"], dtype=np.float32).squeeze()
    )

    # Clip before scaling so samples slightly outside [-1, 1] saturate
    # instead of wrapping around when cast to int16.
    pcm = (np.clip(speech_array, -1.0, 1.0) * 32767).astype(np.int16)
    wav.write(filename, sample_rate, pcm)
    print(f"✅ Audio saved as {filename}")

    # Optional: play audio automatically (requires sounddevice).
    # Playback is best-effort and is started without waiting for it to
    # finish; any failure (missing package, no audio device) is reported
    # but does not affect the saved file.
    try:
        import sounddevice as sd
        sd.play(speech_array, samplerate=sample_rate)
    except Exception as e:
        print(f"⚠️ Could not play audio automatically: {e}")