# DocVoice.py
"""Text-to-speech helper built on a Hugging Face ``text-to-speech`` pipeline.

Importing this module detects whether CUDA is available, loads the TTS
pipeline once (module-level side effect), and exposes ``text_to_speech`` to
synthesize a string into a WAV file and optionally play it back.
"""
import torch
from transformers import pipeline

# -------------------
# 1️⃣ Detect GPU
# -------------------
use_cuda = torch.cuda.is_available()
device_index = 0 if use_cuda else -1  # pipeline convention: -1 selects the CPU
device_str = "cuda" if use_cuda else "cpu"
# Half precision only on GPU; CPU stays in float32.
dtype = torch.float16 if use_cuda else torch.float32

# Sample rate used only when the pipeline output does not report its own.
DEFAULT_SAMPLE_RATE = 22050

# -------------------
# 2️⃣ Load TTS model from Hugging Face
# -------------------
# NOTE(review): this id looks like an ESPnet checkpoint -- confirm that the
# transformers "text-to-speech" pipeline can actually load it.
tts_model_id = "espnet/kan-bayashi_ljspeech_vits"  # Example TTS model, English voice
tts_pipe = pipeline(
    "text-to-speech",
    model=tts_model_id,
    device=device_index,
    torch_dtype=dtype,
)
print("🔊 TTS pipeline ready using Hugging Face.")


# -------------------
# 3️⃣ TTS Helper Function
# -------------------
def text_to_speech(text: str, filename="assistant_response.wav"):
    """Generate speech from ``text`` and save it as a 16-bit PCM WAV file.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is ignored and nothing is written.
        filename: Destination path of the WAV file.

    Returns:
        None. The audio is written to ``filename`` and, when the optional
        ``sounddevice`` package is usable, playback is started without
        blocking (the call returns while audio may still be playing).
    """
    if not text or not text.strip():
        return

    print(f"📝 Generating audio for: {text}")

    # Imported lazily, as in the original, so that importing this module
    # does not itself require numpy/scipy to be importable.
    import numpy as np
    import scipy.io.wavfile as wav

    # Generate audio
    result = tts_pipe(text)

    # Use the rate reported by the model instead of assuming 22050 Hz;
    # a mismatched rate changes playback speed and pitch.
    sample_rate = int(result.get("sampling_rate", DEFAULT_SAMPLE_RATE))

    # Cast to float32 (the model may run in float16 on GPU) and drop
    # singleton axes: wavfile.write interprets a 2-D array as
    # (samples, channels), so a (1, N) waveform would be written wrongly.
    speech_array = np.squeeze(np.asarray(result["audio"], dtype=np.float32))

    # Clip before scaling so out-of-range samples saturate instead of
    # wrapping around when converted to int16.
    pcm16 = (np.clip(speech_array, -1.0, 1.0) * 32767).astype(np.int16)
    wav.write(filename, sample_rate, pcm16)
    print(f"✅ Audio saved as {filename}")

    # Optional: play audio automatically (requires sounddevice).
    # Best-effort by design: a missing package or audio device must not
    # fail the call, since the WAV file has already been saved.
    try:
        import sounddevice as sd

        sd.play(speech_array, samplerate=sample_rate)
    except Exception as e:
        print(f"⚠️ Could not play audio automatically: {e}")