# Hugging Face Space scrape residue removed ("Spaces: Sleeping"); see DocVoice.py below.
# DocVoice.py
import torch
from transformers import pipeline

# -------------------
# 1) Device detection: prefer the first CUDA GPU when available,
#    otherwise fall back to CPU with full-precision floats.
# -------------------
use_cuda = torch.cuda.is_available()
device_index, device_str, dtype = (
    (0, "cuda", torch.float16) if use_cuda else (-1, "cpu", torch.float32)
)
# -------------------
# 2) Build the text-to-speech pipeline from the Hugging Face Hub.
#    Model weights are downloaded on first use and cached locally.
# -------------------
tts_model_id = "espnet/kan-bayashi_ljspeech_vits"  # Example TTS model, English voice
tts_pipe = pipeline(
    task="text-to-speech",
    model=tts_model_id,
    device=device_index,
    torch_dtype=dtype,
)
print("🔊 TTS pipeline ready using Hugging Face.")
| # ------------------- | |
| # 3️⃣ TTS Helper Function | |
| # ------------------- | |
def text_to_speech(text: str, filename: str = "assistant_response.wav") -> None:
    """
    Synthesize speech for *text* with the module-level ``tts_pipe`` and save it
    as a 16-bit PCM WAV file.

    Parameters
    ----------
    text : str
        Text to speak. Blank / whitespace-only input is silently ignored.
    filename : str
        Output WAV path (default ``"assistant_response.wav"``).

    Returns
    -------
    None. Side effects: writes ``filename`` and, when the optional
    ``sounddevice`` package is importable, starts playback.
    """
    if not text.strip():
        return
    print(f"📝 Generating audio for: {text}")
    # HF text-to-speech pipelines return {"audio": float ndarray, "sampling_rate": int}.
    result = tts_pipe(text)
    speech_array = result["audio"]
    # Use the model's reported rate instead of hard-coding 22050 Hz (a wrong
    # rate makes playback too fast/slow); keep 22050 as a fallback only.
    sample_rate = int(result.get("sampling_rate", 22050))
    import numpy as np
    import scipy.io.wavfile as wav
    # Clip to [-1, 1] before scaling so samples at/over full scale cannot
    # wrap around when cast to int16.
    pcm = (np.clip(speech_array, -1.0, 1.0) * 32767).astype(np.int16)
    wav.write(filename, sample_rate, pcm)
    # Fix: the original printed a literal "(unknown)" here instead of the path.
    print(f"✅ Audio saved as {filename}")
    # Optional: play audio automatically (requires sounddevice); best-effort,
    # so a missing package or audio backend only warns rather than crashing.
    try:
        import sounddevice as sd
        sd.play(speech_array, samplerate=sample_rate)
    except Exception as e:
        print(f"⚠️ Could not play audio automatically: {e}")