Spaces:
Sleeping
Sleeping
| from flask import Flask, request, jsonify, send_from_directory | |
| from flask_cors import CORS | |
| import whisper | |
| import soundfile as sf | |
| import numpy as np | |
| import uuid, os | |
| from piper.voice import PiperVoice | |
# Flask app serving the static frontend bundle from ../frontend at the site root.
app = Flask(__name__, static_folder="../frontend", static_url_path="")
CORS(app)  # allow cross-origin calls from the browser frontend

# Directory where synthesized WAV files are written (served back via /audio/<name>).
OUTPUT = "backend/output"
os.makedirs(OUTPUT, exist_ok=True)

# Load the Piper TTS voice once at import time (ONNX model + its JSON config).
print("Loading Piper…")
voice = PiperVoice.load(
    "backend/models/piper/en_US-lessac-medium.onnx",
    "backend/models/piper/en_US-lessac-medium.onnx.json"
)
print("Piper ready")

# Load Whisper once; used for word-level timestamp alignment of synthesized audio.
print("Loading Whisper…")
whisper_model = whisper.load_model("base")
print("Whisper ready")
# ARPAbet-style phoneme → mouth-shape (viseme) classes for lip-sync animation.
# NOTE(review): this table is not referenced anywhere in the visible code —
# speak() currently emits a hard-coded "A" viseme for every word; presumably
# this map is intended for a future phoneme-level alignment pass. Confirm.
PHONEME_TO_VISEME = {
    "AA": "A", "AE": "A", "AH": "A",
    "EH": "E", "IY": "E",
    "OW": "O", "UH": "O",
    "M": "BMP", "B": "BMP", "P": "BMP",
    "F": "FV", "V": "FV",
    "S": "SZ", "Z": "SZ",
    "L": "L",
}
# NOTE(review): the route decorator is missing in the pasted source; without
# it Flask never registers this handler. "/" matches static_url_path="" above.
@app.route("/")
def index():
    """Serve the frontend entry point (index.html from the static folder)."""
    return app.send_static_file("index.html")
# NOTE(review): the route decorator is missing in the pasted source; the
# "/speak" path is an assumption — confirm against the frontend's fetch calls.
@app.route("/speak", methods=["POST"])
def speak():
    """Synthesize speech for a JSON body {"text": ...}.

    Returns JSON with the URL of the generated WAV plus a word-level
    timeline for lip-sync; 400 on empty/missing text, 500 if Piper
    yields no audio.
    """
    # get_json(silent=True) avoids an unhandled exception (HTTP 500) when the
    # request body is missing or not JSON — request.json would raise here.
    payload = request.get_json(silent=True) or {}
    text = (payload.get("text") or "").strip()
    if not text:
        return jsonify({"error": "Empty text"}), 400

    uid = str(uuid.uuid4())
    wav_path = f"{OUTPUT}/{uid}.wav"

    # Piper synthesis (robust): depending on piper version, chunks may arrive
    # as raw int16 PCM bytes, numpy arrays, or objects exposing .samples.
    audio_chunks = []
    for chunk in voice.synthesize(text):
        if isinstance(chunk, (bytes, bytearray)):
            audio_chunks.append(np.frombuffer(chunk, dtype=np.int16))
        elif isinstance(chunk, np.ndarray):
            audio_chunks.append(chunk)
        elif hasattr(chunk, "samples"):
            # .samples may be a plain list; normalize to ndarray so the
            # concatenate below always sees arrays.
            audio_chunks.append(np.asarray(chunk.samples))
    if not audio_chunks:
        return jsonify({"error": "Piper produced no audio"}), 500

    audio = np.concatenate(audio_chunks)
    # 22050 Hz — presumably the en_US-lessac-medium voice's native rate;
    # TODO confirm against the model's JSON config.
    sf.write(wav_path, audio, 22050)

    # Whisper alignment: word timestamps drive the viseme timeline.
    result = whisper_model.transcribe(wav_path, word_timestamps=True)
    timeline = []
    for seg in result["segments"]:
        for w in seg.get("words", []):
            timeline.append({
                "t": w["start"],
                # TODO: derive real visemes (see PHONEME_TO_VISEME);
                # "A" is a placeholder for every word.
                "v": "A"
            })

    return jsonify({
        "audio": f"/audio/{uid}.wav",
        "timeline": timeline
    })
# NOTE(review): the route decorator is missing in the pasted source; the path
# matches the "/audio/{uid}.wav" URLs that speak() returns to the client.
@app.route("/audio/<name>")
def audio(name):
    """Serve a previously synthesized WAV from the output directory.

    send_from_directory rejects path-traversal attempts in *name*.
    """
    return send_from_directory(OUTPUT, name)
if __name__ == "__main__":
    # Bind on all interfaces; 7860 is the conventional Hugging Face Spaces
    # port (the "Spaces / Sleeping" header suggests this runs as a Space).
    app.run(host="0.0.0.0", port=7860)