Spaces:

Adedoyinjames
/

Texttospeech

Runtime error

App Files Files Community

Texttospeech / app.py

Adedoyinjames

Update app.py

771533b verified about 2 months ago

raw

history blame contribute delete

7.65 kB

	import copy # For potential TTS voice embeddings if you add custom inference
	import io

	import gradio as gr
	import numpy as np
	import scipy.io.wavfile as wavf
	import torch
	import uvicorn
	from fastapi import FastAPI, Response
	from fastapi.concurrency import run_in_threadpool
	from pydantic import BaseModel
	from transformers import pipeline, AutoTokenizer, VitsModel # VitsModel kept for compatibility fallback

	# ================================================
	# IMPORTANT SETUP NOTES FOR VibeVoice 0.5B (Realtime TTS)
	# ================================================
	# 1. The VibeVoice-Realtime-0.5B is NOT a standard VITS model.
	# It uses a custom Qwen2.5-0.5B + diffusion head + streaming processor.
	# 2. To use it properly:
	# git clone https://github.com/microsoft/VibeVoice.git
	# cd VibeVoice
	# pip install -e .[streamingtts]
	# 3. The full inference code (including streaming & voice presets) is in:
	# demo/realtime_model_inference_from_file.py
	# demo/vibevoice_realtime_demo.py
	# 4. For now, this script keeps the old VITS-style TTS as a fallback.
	# Replace the synthesize_speech() function with the custom logic from the repo
	# once installed (see the snippets in the comments below).
	# 5. VibeVoice-ASR works out-of-the-box with the standard Transformers pipeline.

	# Build the speech-to-text pipeline once at import time so every request reuses it.
	# NOTE: trust_remote_code=True lets Transformers execute model code shipped in the
	# Hub repository, so only point this at a repository you trust.
	print("Loading VibeVoice-ASR (for recording classes → instant text)...")
	asr_pipeline = pipeline(
	    task="automatic-speech-recognition",
	    model="microsoft/VibeVoice-ASR",
	    trust_remote_code=True,
	)
	print("✅ VibeVoice-ASR loaded successfully!")

	# TTS Model (0.5B Realtime) - placeholder loading
	# Uncomment and adapt the lines below AFTER installing the VibeVoice package:
	# from transformers import AutoModelForCausalLM
	# from VibeVoice.tokenizer import VibeVoiceTextTokenizerFast # custom from repo
	# from VibeVoice.model import VibeVoiceStreamingProcessor, VibeVoiceStreamingForConditionalGenerationInference
	#
	# MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"
	# processor = VibeVoiceStreamingProcessor.from_pretrained(MODEL_PATH)
	# model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
	# MODEL_PATH, trust_remote_code=True
	# )
	# # Example voice preset (Carter, Emma, etc.)
	# # voice_embeddings = torch.load("demo/voices/streaming_model/carter.pt", map_location="cpu")

	# Requested checkpoint. According to the setup notes in this file it is NOT a
	# standard VITS model, so the VITS-style loading below is expected to fail for
	# it until the custom VibeVoice classes are wired in.
	model_name = "microsoft/VibeVoice-Realtime-0.5B" # ← Changed to your requested 0.5B model
	# Genuine VITS checkpoint used when the requested one cannot be loaded, so the
	# app still starts (in fallback mode) instead of crashing at import time.
	FALLBACK_TTS_MODEL = "facebook/mms-tts-eng"
	print(f"Loading TTS model: {model_name} (note: custom inference required for full realtime streaming)")
	try:
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    model = VitsModel.from_pretrained(model_name)
	except Exception as load_error:  # startup boundary: report the cause, then fall back
	    print(
	        f"⚠️ Could not load {model_name} with the VITS classes ({load_error!r}); "
	        f"falling back to {FALLBACK_TTS_MODEL}"
	    )
	    # Keep model_name in sync with what is actually loaded.
	    model_name = FALLBACK_TTS_MODEL
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    model = VitsModel.from_pretrained(model_name)
	print("✅ TTS model loaded (fallback mode)")

	# ================================================
	# Core TTS Synthesis Function (0.5B model)
	# ================================================
	def synthesize_speech(text: str):
	    """Synthesize *text* with the module-level fallback ``tokenizer`` / ``model``.

	    TODO: Replace this entire function with the official VibeVoice realtime
	    inference once the package is installed (see the setup notes at the top
	    of the file). Example skeleton from the repo (adapt as needed):

	        inputs = processor(text, return_tensors="pt")
	        # Add voice preset embeddings here if desired
	        with torch.no_grad():
	            output = model.generate(
	                **inputs,
	                tokenizer=processor.tokenizer,
	                cfg_scale=1.5,
	                ddpm_steps=10,  # lower = faster, higher = better quality
	                verbose=False
	            )
	        # output will contain waveform or latent → decode to 24kHz audio

	    Returns:
	        A 3-tuple ``(wav_bytes, sample_rate, audio_data)``: the encoded WAV
	        file contents, the sampling rate read from ``model.config``, and the
	        squeezed waveform as a NumPy array.
	    """
	    # Current fallback path: tokenize, run the model without gradients,
	    # and read the waveform off the model output.
	    encoded = tokenizer(text, return_tensors="pt")
	    with torch.no_grad():
	        waveform = model(**encoded).waveform
	    samples = waveform.squeeze().numpy()
	    rate = model.config.sampling_rate

	    # Encode to WAV entirely in memory and hand back the raw bytes.
	    buffer = io.BytesIO()
	    wavf.write(buffer, rate, samples)
	    return buffer.getvalue(), rate, samples


	# ================================================
	# Core ASR Function (Record classes → instant text)
	# ================================================
	def _prepare_asr_samples(data):
	    """Return *data* as a mono float32 array, scaling integer PCM into [-1, 1]."""
	    samples = np.asarray(data)
	    source_dtype = samples.dtype
	    if samples.ndim > 1:
	        # Assumes Gradio's (samples, channels) layout — averaging the channel
	        # axis gives the single-channel signal the ASR pipeline expects.
	        samples = samples.mean(axis=1)
	    samples = samples.astype(np.float32)
	    if np.issubdtype(source_dtype, np.integer):
	        # Integer PCM (Gradio typically delivers int16) must be rescaled;
	        # a bare cast would leave amplitudes in the ±32768 range.
	        limits = np.iinfo(source_dtype)
	        samples /= float(max(abs(limits.min), limits.max))
	    return samples


	def transcribe_audio(audio):
	    """Transcribe a recording (class/lecture) and return the recognised text.

	    Args:
	        audio: ``None`` when nothing was recorded, otherwise a
	            ``(sample_rate, samples)`` pair as produced by a Gradio audio
	            component configured with ``type="numpy"``.

	    Returns:
	        The ``"text"`` field of the ASR pipeline result, or the message
	        ``"No audio recorded."`` when there is no audio to transcribe.
	    """
	    if audio is None:
	        return "No audio recorded."
	    sr, data = audio
	    if data is None or np.size(data) == 0:
	        # An empty recording has nothing to transcribe; skip the model call.
	        return "No audio recorded."
	    result = asr_pipeline({"sampling_rate": sr, "raw": _prepare_asr_samples(data)})
	    return result["text"]


	# ================================================
	# FastAPI Setup + API Endpoint (kept exactly as requested)
	# ================================================
	# FastAPI application that exposes the JSON TTS endpoint; the Gradio UI is
	# mounted onto this same app further down.
	app = FastAPI(
	    title="VibeVoice 0.5B Realtime TTS + ASR API",
	)

	class TTSRequest(BaseModel):
	    """JSON request body accepted by the ``/api/tts`` endpoint."""

	    # Text to be converted to speech.
	    text: str

	@app.post("/api/tts", summary="Generate Speech from Text (0.5B model)")
	async def api_tts(request: TTSRequest):
	    """Synthesize speech for the posted text and return it as a WAV file.

	    POST {"text": "your text here"}
	    Returns downloadable WAV (maintains original API).

	    Args:
	        request: Parsed JSON body carrying the ``text`` to speak.

	    Returns:
	        A ``Response`` whose body is the WAV bytes, served as ``audio/wav``.
	    """
	    # Model inference is blocking, CPU-bound work. Running it directly in this
	    # coroutine would stall the event loop (which also serves the mounted
	    # Gradio UI) for the whole synthesis, so hand it to the worker threadpool.
	    wav_bytes, _, _ = await run_in_threadpool(synthesize_speech, request.text)
	    return Response(content=wav_bytes, media_type="audio/wav")


	# ================================================
	# Gradio Interface (now with BOTH features in tabs)
	# ================================================
	# Two-tab Gradio UI: speech-to-text (ASR) and text-to-speech (TTS).
	with gr.Blocks(title="VibeVoice 0.5B • Record Classes + TTS", theme=gr.themes.Soft()) as ui:
	    gr.Markdown("# ⚡ VibeVoice 0.5B Realtime TTS + ASR\nRecord class lectures → get text instantly. Text → speech in one click.")

	    with gr.Tabs():
	        # ====================== ASR TAB ======================
	        with gr.Tab("🎤 Record Class → Instant Text"):
	            gr.Markdown("Record your class/lecture audio → copy the transcribed text instantly")
	            audio_input = gr.Audio(
	                # The label promises "or upload", so the upload source must be enabled too.
	                sources=["microphone", "upload"],
	                type="numpy",
	                label="Record Audio (or upload)",
	                waveform_options=gr.WaveformOptions(waveform_color="#4F46E5"),
	            )
	            # Gradio button sizes are spelled "sm" / "md" / "lg"; "large" is not a valid value.
	            transcribe_btn = gr.Button("📝 Transcribe Now", variant="primary", size="lg")
	            text_output = gr.Textbox(label="Transcribed Text", lines=8, show_copy_button=True)

	            transcribe_btn.click(
	                fn=transcribe_audio,
	                inputs=audio_input,
	                outputs=text_output,
	            )

	        # ====================== TTS TAB ======================
	        with gr.Tab("🔊 Text → Speech (0.5B Realtime)"):
	            gr.Markdown("Type text → generate natural speech (uses VibeVoice-Realtime-0.5B)")
	            text_input = gr.Textbox(
	                label="Enter Text",
	                lines=4,
	                placeholder="Type or paste your script here...",
	                value="Hello, this is a test of Microsoft's new VibeVoice 0.5B real-time TTS.",
	            )
	            tts_btn = gr.Button("🔊 Generate Speech", variant="primary", size="lg")
	            audio_output = gr.Audio(label="Generated Speech", type="numpy")

	            tts_btn.click(
	                # Drop the WAV bytes: the numpy Audio output takes (sample_rate, samples).
	                fn=lambda t: synthesize_speech(t)[1:3], # returns (rate, audio) for Gradio
	                inputs=text_input,
	                outputs=audio_output,
	            )

	    # The endpoint parses a JSON body, so the example must send a JSON
	    # Content-Type; curl's -d alone sends a form-encoded content type.
	    gr.Markdown(
	        "How to use the API: `curl -X POST http://localhost:7860/api/tts -H 'Content-Type: application/json' -d '{\"text\":\"Hello world\"}' --output speech.wav`\n\n"
	        "Full realtime streaming & voice presets available in the official VibeVoice repo."
	    )

	# Mount Gradio onto FastAPI (keeps original behavior)
	# Attach the Gradio UI at the root path of the existing FastAPI app and
	# rebind ``app`` to the combined application.
	app = gr.mount_gradio_app(app, ui, path="/")

	# Start uvicorn only when this file is executed as a script.
	if __name__ == "__main__":
	    server_options = {"host": "0.0.0.0", "port": 7860}
	    uvicorn.run(app, **server_options)