# F5-TTS-pt-br / voice_clone.py
# Uploaded by fuuuzzy using the upload-large-folder tool (commit cf7fa42, verified).
#!/usr/bin/env python3
"""
F5-TTS Voice Cloning Script (Portuguese/Multi-lingual)
Wraps AgentF5TTSChunk for convenient CLI usage.
Usage:
Single mode: python voice_clone.py --text "Olá mundo" --ref-audio voice.wav --checkpoint models/model.safetensors
Batch mode: python voice_clone.py --srt subtitles.srt --ref-dir ./speakers --checkpoint models/model.safetensors
"""
import argparse
import os
import re
import sys
import logging
import torch
from typing import List, Dict, Optional, Tuple
# Setup logging: configure the root logger once at import time with
# timestamped, INFO-level messages shared by the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used by every function below.
logger = logging.getLogger(__name__)
try:
    from tqdm import tqdm
except ImportError:
    # Fallback if tqdm is not installed.
    #
    # BUG FIX: the previous fallback returned the bare iterable, but batch
    # mode calls pbar.set_description(...), which would raise AttributeError
    # on a plain list. This minimal stand-in iterates transparently and
    # accepts (and ignores) the tqdm-style calls this script makes.
    class tqdm:
        """No-op progress wrapper mimicking the tqdm API used here."""

        def __init__(self, iterable, **kwargs):
            # Styling kwargs (desc, unit, ...) are accepted and ignored.
            self._iterable = iterable

        def __iter__(self):
            return iter(self._iterable)

        def set_description(self, *args, **kwargs):
            # Real tqdm updates the progress-bar label; nothing to do here.
            pass
# Import the TTS agent. If the module is not on the import path, retry
# after adding the current working directory (the script is expected to
# live next to AgentF5TTSChunk.py); abort with exit code 1 if still absent.
try:
    from AgentF5TTSChunk import AgentF5TTS
except ImportError:
    # If not in same dir, try adding current dir to path
    sys.path.append(os.getcwd())
    try:
        from AgentF5TTSChunk import AgentF5TTS
    except ImportError:
        logger.error("Error: AgentF5TTSChunk.py not found.")
        sys.exit(1)
def parse_srt(srt_file: str) -> List[Dict]:
    """
    Parse an SRT subtitle file and extract subtitle entries.

    Args:
        srt_file: Path to the .srt file (read as UTF-8).

    Returns:
        List of dicts with keys 'id' (int), 'timestamp' (the
        "start --> end" line, verbatim) and 'text' (all subtitle lines
        joined with single spaces). Blocks without a valid timestamp
        line or without any text are skipped with a warning.
    """
    log = logging.getLogger(__name__)
    log.info(f"Parsing SRT file: {srt_file}")
    with open(srt_file, 'r', encoding='utf-8') as f:
        content = f.read()
    # Normalize ALL newline flavors (CRLF and lone CR) to LF; the original
    # only handled CRLF, so classic-Mac/mixed files broke block splitting.
    content = content.replace('\r\n', '\n').replace('\r', '\n')
    # Subtitle blocks are separated by one or more blank lines.
    blocks = re.split(r'\n{2,}', content.strip())
    subtitles = []
    for block in blocks:
        lines = [l.strip() for l in block.split('\n') if l.strip()]
        if len(lines) < 2:  # Need at least an ID/timestamp plus content
            continue
        try:
            if lines[0].isdigit():
                # First line is the numeric subtitle ID.
                subtitle_id = int(lines[0])
                timestamp_line_idx = 1
            else:
                # ID missing or merged: assume the first line is the timestamp.
                subtitle_id = len(subtitles) + 1
                timestamp_line_idx = 0
            # Validate the timestamp line in BOTH branches. The original only
            # checked for '-->' when the numeric ID was absent, so a digit-led
            # block with a junk second line produced a bogus 'timestamp'.
            if '-->' not in lines[timestamp_line_idx]:
                log.warning(f"Skipping malformed block (no timestamp): {block[:50]}...")
                continue
            timestamp = lines[timestamp_line_idx]
            # Remaining lines form the subtitle text.
            text = ' '.join(lines[timestamp_line_idx + 1:]).strip()
            if text:
                subtitles.append({
                    'id': subtitle_id,
                    'timestamp': timestamp,
                    'text': text
                })
        except (ValueError, IndexError) as e:
            log.warning(f"Skipping malformed block: {block[:50]}... Error: {e}")
            continue
    log.info(f"Parsed {len(subtitles)} subtitle entries")
    return subtitles
def find_reference_audio(reference_dir: str, subtitle_id: int, audio_prefix: str = 'segment') -> Optional[str]:
    """
    Fallback lookup: locate a reference audio file by subtitle ID.

    Tries several naming schemes inside *reference_dir* — zero-padded with
    underscore (segment_001.wav), unpadded with underscore (segment_1.wav),
    and zero-padded without underscore (segment001.wav), each in wav, mp3
    and MP4 flavors — and returns the first path that exists, or None when
    the directory is unset or nothing matches.
    """
    if not reference_dir:
        return None
    # Candidate basenames, most specific naming scheme first.
    stems = (
        f"{audio_prefix}_{subtitle_id:03d}",
        f"{audio_prefix}_{subtitle_id}",
        f"{audio_prefix}{subtitle_id:03d}",
    )
    extensions = ("wav", "mp3", "MP4")
    for stem in stems:
        for ext in extensions:
            candidate = os.path.join(reference_dir, f"{stem}.{ext}")
            if os.path.exists(candidate):
                return candidate
    return None
def resolve_speaker_ref(agent: AgentF5TTS, text: str, reference_dir: str, default_ref: Optional[str] = None) -> Tuple[str, Optional[str]]:
    """
    Resolve which reference audio to use for a line of text.

    Delegates speaker/emotion tag parsing to the agent, strips the
    ``[speaker:...]`` tag from the text, then searches *reference_dir* for a
    matching ``<speaker>[_<emotion>].wav/.mp3`` file (with lowercase
    fallbacks). Falls back to *default_ref* when nothing matches.

    Returns:
        Tuple of (clean_text, reference_audio_path_or_None).
    """
    # NOTE(review): relies on the agent's protected tag parser.
    speaker, emotion = agent._determine_speaker_emotion(text)
    # Strip the speaker tag so it is not synthesized aloud.
    clean_text = re.sub(r'\[speaker:.*?\]\s*', '', text).strip()
    ref_audio = default_ref
    if speaker and reference_dir:
        has_emotion = bool(emotion) and emotion != "neutral"
        # Candidate filenames, emotion-specific variants first.
        candidates = []
        if has_emotion:
            candidates += [f"{speaker}_{emotion}.wav", f"{speaker}_{emotion}.mp3"]
        candidates += [f"{speaker}.wav", f"{speaker}.mp3"]
        # Lowercase fallbacks mirror the primary candidates.
        if has_emotion:
            candidates.append(f"{speaker.lower()}_{emotion.lower()}.wav")
        candidates.append(f"{speaker.lower()}.wav")
        for name in candidates:
            full_path = os.path.join(reference_dir, name)
            if os.path.exists(full_path):
                ref_audio = full_path
                logger.debug(f"Role matched: {os.path.basename(ref_audio)} (Speaker: {speaker}, Emotion: {emotion})")
                break
    return clean_text, ref_audio
def parse_args():
    """Define and evaluate the command-line interface for this script."""
    p = argparse.ArgumentParser(
        description='F5-TTS Voice Cloning Script (Wraps AgentF5TTS)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
EXAMPLES:
# Single Mode
python voice_clone.py --text "Olá, tudo bem?" --ref-audio ref.wav --checkpoint models/model.safetensors
# Batch Mode (SRT)
python voice_clone.py --srt subs.srt --ref-dir ./speakers --checkpoint models/model.safetensors
"""
    )

    # Exactly one input mode: literal text or an SRT subtitle file.
    inputs = p.add_mutually_exclusive_group(required=True)
    inputs.add_argument('--text', type=str, help='Text to synthesize')
    inputs.add_argument('--srt', type=str, help='Path to SRT subtitle file')

    # Reference audio: a single file (single mode) or a directory (batch mode).
    refs = p.add_mutually_exclusive_group()
    refs.add_argument('--ref-audio', type=str, help='[Single] Reference audio path')
    refs.add_argument('--ref-dir', type=str, help='[Batch] Directory with reference audios (speakers or segments)')
    # Hidden alias for backward compatibility / typo tolerance.
    refs.add_argument('--reference-dir', dest='ref_dir', help=argparse.SUPPRESS)

    # Optional transcript of the reference audio (prevents model transcription).
    p.add_argument('--ref-text', type=str, default="", help='Reference text for the reference audio (optional)')

    # Model configuration.
    p.add_argument('--checkpoint', type=str, required=True, help='Path to F5-TTS safetensors checkpoint')
    p.add_argument('--vocoder', type=str, default='vocos', choices=['vocos', 'bigvgan'], help='Vocoder type')
    p.add_argument('--device', type=str, default=None, help='Device (cuda:0, cpu, mps)')
    p.add_argument('--speed', type=float, default=1.0, help='Speed factor for speech generation (default: 1.0)')

    # Output configuration.
    p.add_argument('--output', type=str, default='outputs', help='Output directory')
    p.add_argument('--output-prefix', type=str, default='clone', help='Output filename prefix')
    p.add_argument('--skip-existing', action='store_true', help='Skip existing output files')

    # Batch-only: filename prefix for ID-based reference lookup.
    p.add_argument('--audio-prefix', type=str, default='segment', help='Prefix for ID-based reference lookup')

    return p.parse_args()
def _select_device(preferred: Optional[str]) -> str:
    """Return the user-requested device, else auto-detect cuda > mps > cpu."""
    if preferred:
        return preferred
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def _run_single_mode(agent, args) -> None:
    """Handle --text: synthesize one utterance using --ref-audio."""
    logger.info("-" * 40)
    logger.info("SINGLE MODE PROCESSING")
    logger.info("-" * 40)
    if not args.ref_audio or not os.path.exists(args.ref_audio):
        logger.error(f"Reference audio not found: {args.ref_audio}")
        return
    # Try to parse speaker tags just in case; the reference's own directory
    # doubles as the speaker-lookup directory.
    clean_text, effective_ref = resolve_speaker_ref(
        agent,
        args.text,
        os.path.dirname(args.ref_audio),
        default_ref=args.ref_audio
    )
    output_path = os.path.join(args.output, "output_single.wav")
    logger.info(f"Text: {clean_text}")
    logger.info(f"Ref: {effective_ref}")
    try:
        agent.infer(
            ref_file=effective_ref,
            ref_text=args.ref_text,
            gen_text=clean_text,
            file_wave=output_path,
            remove_silence=True,
            speed=args.speed
        )
        logger.info(f"✓ Saved: {output_path}")
    except Exception as e:
        logger.error(f"✗ Error: {e}")


def _run_batch_mode(agent, args) -> None:
    """Handle --srt: synthesize every subtitle entry to its own wav file."""
    logger.info("-" * 40)
    logger.info("BATCH MODE PROCESSING")
    logger.info("-" * 40)
    subtitles = parse_srt(args.srt)
    if not subtitles:
        logger.error("No subtitles found.")
        return
    logger.info(f"Processing {len(subtitles)} entries...")
    success = 0
    errors = 0
    skipped = 0
    # Use tqdm for progress bar
    pbar = tqdm(subtitles, desc="Synthesizing", unit="line")
    for sub in pbar:
        sid = sub['id']
        raw_text = sub['text']
        pbar.set_description(f"Processing ID {sid}")
        out_name = f"{args.output_prefix}_{sid:03d}.wav"
        out_path = os.path.join(args.output, out_name)
        if args.skip_existing and os.path.exists(out_path):
            skipped += 1
            continue
        # Resolve reference: an explicit single file beats the ID-based
        # directory lookup; speaker tags in the text may override both.
        if args.ref_audio:
            default_ref = args.ref_audio
        else:
            default_ref = find_reference_audio(args.ref_dir, sid, args.audio_prefix)
        clean_text, ref_audio = resolve_speaker_ref(agent, raw_text, args.ref_dir, default_ref)
        if not ref_audio or not os.path.exists(ref_audio):
            logger.warning(f"ID {sid}: No reference audio found. Skipping.")
            errors += 1
            continue
        try:
            agent.infer(
                ref_file=ref_audio,
                ref_text=args.ref_text if args.ref_audio else "",  # Use ref_text only if using single ref audio
                gen_text=clean_text,
                file_wave=out_path,
                remove_silence=True,
                speed=args.speed
            )
            success += 1
        except Exception as e:
            logger.error(f"ID {sid}: Generation failed: {e}")
            errors += 1
    logger.info("-" * 40)
    logger.info(f"Done. Success: {success}, Skipped: {skipped}, Errors: {errors}")


def main():
    """CLI entry point: pick a device, load the agent, dispatch by mode."""
    args = parse_args()
    device = _select_device(args.device)
    logger.info(f"Using device: {device}")
    os.makedirs(args.output, exist_ok=True)
    logger.info(f"Initializing AgentF5TTS with checkpoint: {args.checkpoint}")
    try:
        agent = AgentF5TTS(
            ckpt_file=args.checkpoint,
            vocoder_name=args.vocoder,
            device=device
        )
    except Exception as e:
        logger.error(f"Failed to initialize agent: {e}")
        return
    if args.text:
        _run_single_mode(agent, args)
    elif args.srt:
        _run_batch_mode(agent, args)
if __name__ == "__main__":
main()