audiobook-ru-tts / synth_batch.py
danilahs's picture
Upload folder using huggingface_hub
4f6648e verified
#!/usr/bin/env python3
import argparse, os, sys, re
import numpy as np
import soundfile as sf
from tqdm import tqdm
from utils.text import normalize_text, split_into_paragraphs, maybe_ruaccent
from utils.audio import crossfade_concat, normalize_lufs, save_wav
from backends.espeech_backend import EspeechBackend
def read_input(path: str) -> str:
    """Read the book text from a ``.txt`` or ``.epub`` file.

    Args:
        path: Path to the input file. The format is selected by the file
            extension, compared case-insensitively.

    Returns:
        For TXT, the decoded file content (a leading UTF-8 byte-order mark,
        if present, is dropped). For EPUB, the text of every document item,
        with items separated by a blank line.

    Raises:
        ValueError: If the extension is neither ``.txt`` nor ``.epub``.
        SystemExit: With status 2 if the optional EPUB dependencies cannot
            be imported.
    """
    lowered = path.lower()

    if lowered.endswith(".txt"):
        # "utf-8-sig" decodes plain UTF-8 too, but strips the BOM that some
        # editors prepend; otherwise U+FEFF would leak into the text.
        with open(path, "r", encoding="utf-8-sig") as f:
            return f.read()

    if lowered.endswith(".epub"):
        # The EPUB dependencies are optional, so import them lazily and only
        # treat a genuine import failure as "not installed".
        try:
            import ebooklib
            from ebooklib import epub
            from bs4 import BeautifulSoup
        except ImportError:
            print("Для EPUB установите: pip install ebooklib beautifulsoup4 lxml", file=sys.stderr)
            sys.exit(2)

        book = epub.read_epub(path)
        texts = []
        # Only document items are read; other item types are skipped.
        for item in book.get_items_of_type(ebooklib.ITEM_DOCUMENT):
            soup = BeautifulSoup(item.get_body_content(), "lxml")
            texts.append(soup.get_text(" ", strip=True))
        return "\n\n".join(texts)

    raise ValueError("Поддерживаются .txt и .epub")
def load_default_ref_text():
    """Return the bundled sample's reference transcript, or "" if it is absent.

    The transcript is read from ``samples/001/sample.text`` (resolved against
    the current working directory) and returned with surrounding whitespace
    stripped.
    """
    try:
        sample = open("samples/001/sample.text", "r", encoding="utf-8")
    except FileNotFoundError:
        # No bundled sample available: fall back to an empty reference text.
        return ""
    with sample:
        contents = sample.read()
    return contents.strip()
def main():
    """Command-line entry point: synthesize a TXT/EPUB book into one WAV file.

    Steps:
        1. Parse the command-line options.
        2. Read the input, normalize it and split it into paragraphs.
        3. Synthesize every paragraph with ``EspeechBackend`` using the
           reference audio/text.
        4. Join the pieces with ``crossfade_concat``, pass the result through
           ``normalize_lufs`` and save it as ``book.wav`` in ``--outdir``.

    Raises:
        SystemExit: With status 1 if the input yields no paragraphs. argparse
            and ``read_input`` may also exit on bad arguments or missing
            optional dependencies.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--input", required=True, help="Путь к TXT/EPUB")
    ap.add_argument("--outdir", required=True, help="Каталог для результата")
    ap.add_argument("--ref-audio", required=False, default="samples/001/sample.mp3", help="Путь к референс-аудио (6–12с)")
    ap.add_argument("--ref-text", required=False, default=load_default_ref_text(), help="Референс-текст")
    ap.add_argument("--model-repo", default=os.getenv("MODEL_REPO", "ESpeech/ESpeech-TTS-1_RL-V2"))
    ap.add_argument("--speed", type=float, default=1.0)
    ap.add_argument("--nfe-steps", type=int, default=48)
    ap.add_argument("--crossfade-ms", type=int, default=150)
    ap.add_argument("--target-lufs", type=float, default=-20.0)
    args = ap.parse_args()

    # Read and split the text before creating the backend, so that a bad
    # input path/extension or an empty book is reported without first
    # waiting for the model to load.
    raw = read_input(args.input)
    paragraphs = split_into_paragraphs(normalize_text(raw))
    if not paragraphs:
        # Without any paragraph there is no audio and no sample rate to
        # hand to the concatenation/saving steps below.
        print("Во входном файле нет текста для озвучивания", file=sys.stderr)
        sys.exit(1)

    os.makedirs(args.outdir, exist_ok=True)
    backend = EspeechBackend(model_id=args.model_repo)

    paragraphs = [maybe_ruaccent(p) for p in paragraphs]
    print(f"Абзацев: {len(paragraphs)}")

    pieces = []
    sr = None  # sample rate reported by the backend; the last value is used
    for para in tqdm(paragraphs, desc="Генерация"):
        audio, sr = backend.synthesize(
            text=para,
            ref_audio_path=args.ref_audio,
            ref_text=args.ref_text,
            speed=args.speed,
            nfe_steps=args.nfe_steps,
            seed=None,
        )
        pieces.append(audio)
        # Optional: per-paragraph drafts could be written to args.outdir
        # here (e.g. with soundfile) for debugging.

    final = crossfade_concat(pieces, crossfade_ms=args.crossfade_ms, sample_rate=sr)
    final = normalize_lufs(final, sr, target_lufs=args.target_lufs)

    out_path = os.path.join(args.outdir, "book.wav")
    save_wav(out_path, final, sr)
    print(f"ГОТОВО: {out_path}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()