# F5-TTS-pt-br / voice_clone.py
# Uploaded by fuuuzzy using the upload-large-folder tool (commit cf7fa42, verified).
#!/usr/bin/env python3
"""
F5-TTS Voice Cloning Script (Portuguese/Multi-lingual)
Wraps AgentF5TTSChunk for convenient CLI usage.
Usage:
Single mode: python voice_clone.py --text "Olá mundo" --ref-audio voice.wav --checkpoint models/model.safetensors
Batch mode: python voice_clone.py --srt subtitles.srt --ref-dir ./speakers --checkpoint models/model.safetensors
"""
import argparse
import os
import re
import sys
import logging
import torch
from typing import List, Dict, Optional, Tuple
# Setup logging: configure the root logger once at import time with
# timestamped, INFO-level messages shared by the whole script.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Module-level logger used by every function below.
logger = logging.getLogger(__name__)
try:
    from tqdm import tqdm
except ImportError:
    # Fallback if tqdm is not installed.
    #
    # BUG FIX: the previous fallback returned the bare iterable, but batch
    # mode calls pbar.set_description(...), which would raise AttributeError
    # on a plain list. This minimal stand-in iterates transparently and
    # accepts (and ignores) the tqdm-style calls this script makes.
    class tqdm:
        """No-op progress wrapper mimicking the tqdm API used here."""

        def __init__(self, iterable, **kwargs):
            # Styling kwargs (desc, unit, ...) are accepted and ignored.
            self._iterable = iterable

        def __iter__(self):
            return iter(self._iterable)

        def set_description(self, *args, **kwargs):
            # Real tqdm updates the progress-bar label; nothing to do here.
            pass
# Import the TTS agent. If the module is not on the import path, retry
# after adding the current working directory (the script is expected to
# live next to AgentF5TTSChunk.py); abort with exit code 1 if still absent.
try:
    from AgentF5TTSChunk import AgentF5TTS
except ImportError:
    # If not in same dir, try adding current dir to path
    sys.path.append(os.getcwd())
    try:
        from AgentF5TTSChunk import AgentF5TTS
    except ImportError:
        logger.error("Error: AgentF5TTSChunk.py not found.")
        sys.exit(1)
def parse_srt(srt_file: str) -> List[Dict]:
    """
    Parse an SRT subtitle file and extract subtitle entries.

    Args:
        srt_file: Path to the .srt file (read as UTF-8).

    Returns:
        List of dicts with keys 'id' (int), 'timestamp' (the
        "start --> end" line, verbatim) and 'text' (all subtitle lines
        joined with single spaces). Blocks without a valid timestamp
        line or without any text are skipped with a warning.
    """
    log = logging.getLogger(__name__)
    log.info(f"Parsing SRT file: {srt_file}")
    with open(srt_file, 'r', encoding='utf-8') as f:
        content = f.read()
    # Normalize ALL newline flavors (CRLF and lone CR) to LF; the original
    # only handled CRLF, so classic-Mac/mixed files broke block splitting.
    content = content.replace('\r\n', '\n').replace('\r', '\n')
    # Subtitle blocks are separated by one or more blank lines.
    blocks = re.split(r'\n{2,}', content.strip())
    subtitles = []
    for block in blocks:
        lines = [l.strip() for l in block.split('\n') if l.strip()]
        if len(lines) < 2:  # Need at least an ID/timestamp plus content
            continue
        try:
            if lines[0].isdigit():
                # First line is the numeric subtitle ID.
                subtitle_id = int(lines[0])
                timestamp_line_idx = 1
            else:
                # ID missing or merged: assume the first line is the timestamp.
                subtitle_id = len(subtitles) + 1
                timestamp_line_idx = 0
            # Validate the timestamp line in BOTH branches. The original only
            # checked for '-->' when the numeric ID was absent, so a digit-led
            # block with a junk second line produced a bogus 'timestamp'.
            if '-->' not in lines[timestamp_line_idx]:
                log.warning(f"Skipping malformed block (no timestamp): {block[:50]}...")
                continue
            timestamp = lines[timestamp_line_idx]
            # Remaining lines form the subtitle text.
            text = ' '.join(lines[timestamp_line_idx + 1:]).strip()
            if text:
                subtitles.append({
                    'id': subtitle_id,
                    'timestamp': timestamp,
                    'text': text
                })
        except (ValueError, IndexError) as e:
            log.warning(f"Skipping malformed block: {block[:50]}... Error: {e}")
            continue
    log.info(f"Parsed {len(subtitles)} subtitle entries")
    return subtitles
def find_reference_audio(reference_dir: str, subtitle_id: int, audio_prefix: str = 'segment') -> Optional[str]:
    """
    Fallback lookup: locate a reference audio file by subtitle ID.

    Tries several naming schemes inside *reference_dir* — zero-padded with
    underscore (segment_001.wav), unpadded with underscore (segment_1.wav),
    and zero-padded without underscore (segment001.wav), each in wav, mp3
    and MP4 flavors — and returns the first path that exists, or None when
    the directory is unset or nothing matches.
    """
    if not reference_dir:
        return None
    # Candidate basenames, most specific naming scheme first.
    stems = (
        f"{audio_prefix}_{subtitle_id:03d}",
        f"{audio_prefix}_{subtitle_id}",
        f"{audio_prefix}{subtitle_id:03d}",
    )
    extensions = ("wav", "mp3", "MP4")
    for stem in stems:
        for ext in extensions:
            candidate = os.path.join(reference_dir, f"{stem}.{ext}")
            if os.path.exists(candidate):
                return candidate
    return None
def resolve_speaker_ref(agent: AgentF5TTS, text: str, reference_dir: str, default_ref: Optional[str] = None) -> Tuple[str, Optional[str]]:
    """
    Resolve which reference audio to use for a line of text.

    Delegates speaker/emotion tag parsing to the agent, strips the
    ``[speaker:...]`` tag from the text, then searches *reference_dir* for a
    matching ``<speaker>[_<emotion>].wav/.mp3`` file (with lowercase
    fallbacks). Falls back to *default_ref* when nothing matches.

    Returns:
        Tuple of (clean_text, reference_audio_path_or_None).
    """
    # NOTE(review): relies on the agent's protected tag parser.
    speaker, emotion = agent._determine_speaker_emotion(text)
    # Strip the speaker tag so it is not synthesized aloud.
    clean_text = re.sub(r'\[speaker:.*?\]\s*', '', text).strip()
    ref_audio = default_ref
    if speaker and reference_dir:
        has_emotion = bool(emotion) and emotion != "neutral"
        # Candidate filenames, emotion-specific variants first.
        candidates = []
        if has_emotion:
            candidates += [f"{speaker}_{emotion}.wav", f"{speaker}_{emotion}.mp3"]
        candidates += [f"{speaker}.wav", f"{speaker}.mp3"]
        # Lowercase fallbacks mirror the primary candidates.
        if has_emotion:
            candidates.append(f"{speaker.lower()}_{emotion.lower()}.wav")
        candidates.append(f"{speaker.lower()}.wav")
        for name in candidates:
            full_path = os.path.join(reference_dir, name)
            if os.path.exists(full_path):
                ref_audio = full_path
                logger.debug(f"Role matched: {os.path.basename(ref_audio)} (Speaker: {speaker}, Emotion: {emotion})")
                break
    return clean_text, ref_audio
def parse_args():
    """Define and evaluate the command-line interface for this script."""
    p = argparse.ArgumentParser(
        description='F5-TTS Voice Cloning Script (Wraps AgentF5TTS)',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
EXAMPLES:
# Single Mode
python voice_clone.py --text "Olá, tudo bem?" --ref-audio ref.wav --checkpoint models/model.safetensors
# Batch Mode (SRT)
python voice_clone.py --srt subs.srt --ref-dir ./speakers --checkpoint models/model.safetensors
"""
    )

    # Exactly one input mode: literal text or an SRT subtitle file.
    inputs = p.add_mutually_exclusive_group(required=True)
    inputs.add_argument('--text', type=str, help='Text to synthesize')
    inputs.add_argument('--srt', type=str, help='Path to SRT subtitle file')

    # Reference audio: a single file (single mode) or a directory (batch mode).
    refs = p.add_mutually_exclusive_group()
    refs.add_argument('--ref-audio', type=str, help='[Single] Reference audio path')
    refs.add_argument('--ref-dir', type=str, help='[Batch] Directory with reference audios (speakers or segments)')
    # Hidden alias for backward compatibility / typo tolerance.
    refs.add_argument('--reference-dir', dest='ref_dir', help=argparse.SUPPRESS)

    # Optional transcript of the reference audio (prevents model transcription).
    p.add_argument('--ref-text', type=str, default="", help='Reference text for the reference audio (optional)')

    # Model configuration.
    p.add_argument('--checkpoint', type=str, required=True, help='Path to F5-TTS safetensors checkpoint')
    p.add_argument('--vocoder', type=str, default='vocos', choices=['vocos', 'bigvgan'], help='Vocoder type')
    p.add_argument('--device', type=str, default=None, help='Device (cuda:0, cpu, mps)')
    p.add_argument('--speed', type=float, default=1.0, help='Speed factor for speech generation (default: 1.0)')

    # Output configuration.
    p.add_argument('--output', type=str, default='outputs', help='Output directory')
    p.add_argument('--output-prefix', type=str, default='clone', help='Output filename prefix')
    p.add_argument('--skip-existing', action='store_true', help='Skip existing output files')

    # Batch-only: filename prefix for ID-based reference lookup.
    p.add_argument('--audio-prefix', type=str, default='segment', help='Prefix for ID-based reference lookup')

    return p.parse_args()
def _select_device(preferred: Optional[str]) -> str:
    """Return the user-requested device, else auto-detect cuda > mps > cpu."""
    if preferred:
        return preferred
    if torch.cuda.is_available():
        return "cuda"
    if torch.backends.mps.is_available():
        return "mps"
    return "cpu"


def _run_single_mode(agent, args) -> None:
    """Handle --text: synthesize one utterance using --ref-audio."""
    logger.info("-" * 40)
    logger.info("SINGLE MODE PROCESSING")
    logger.info("-" * 40)
    if not args.ref_audio or not os.path.exists(args.ref_audio):
        logger.error(f"Reference audio not found: {args.ref_audio}")
        return
    # Try to parse speaker tags just in case; the reference's own directory
    # doubles as the speaker-lookup directory.
    clean_text, effective_ref = resolve_speaker_ref(
        agent,
        args.text,
        os.path.dirname(args.ref_audio),
        default_ref=args.ref_audio
    )
    output_path = os.path.join(args.output, "output_single.wav")
    logger.info(f"Text: {clean_text}")
    logger.info(f"Ref: {effective_ref}")
    try:
        agent.infer(
            ref_file=effective_ref,
            ref_text=args.ref_text,
            gen_text=clean_text,
            file_wave=output_path,
            remove_silence=True,
            speed=args.speed
        )
        logger.info(f"✓ Saved: {output_path}")
    except Exception as e:
        logger.error(f"✗ Error: {e}")


def _run_batch_mode(agent, args) -> None:
    """Handle --srt: synthesize every subtitle entry to its own wav file."""
    logger.info("-" * 40)
    logger.info("BATCH MODE PROCESSING")
    logger.info("-" * 40)
    subtitles = parse_srt(args.srt)
    if not subtitles:
        logger.error("No subtitles found.")
        return
    logger.info(f"Processing {len(subtitles)} entries...")
    success = 0
    errors = 0
    skipped = 0
    # Use tqdm for progress bar
    pbar = tqdm(subtitles, desc="Synthesizing", unit="line")
    for sub in pbar:
        sid = sub['id']
        raw_text = sub['text']
        pbar.set_description(f"Processing ID {sid}")
        out_name = f"{args.output_prefix}_{sid:03d}.wav"
        out_path = os.path.join(args.output, out_name)
        if args.skip_existing and os.path.exists(out_path):
            skipped += 1
            continue
        # Resolve reference: an explicit single file beats the ID-based
        # directory lookup; speaker tags in the text may override both.
        if args.ref_audio:
            default_ref = args.ref_audio
        else:
            default_ref = find_reference_audio(args.ref_dir, sid, args.audio_prefix)
        clean_text, ref_audio = resolve_speaker_ref(agent, raw_text, args.ref_dir, default_ref)
        if not ref_audio or not os.path.exists(ref_audio):
            logger.warning(f"ID {sid}: No reference audio found. Skipping.")
            errors += 1
            continue
        try:
            agent.infer(
                ref_file=ref_audio,
                ref_text=args.ref_text if args.ref_audio else "",  # Use ref_text only if using single ref audio
                gen_text=clean_text,
                file_wave=out_path,
                remove_silence=True,
                speed=args.speed
            )
            success += 1
        except Exception as e:
            logger.error(f"ID {sid}: Generation failed: {e}")
            errors += 1
    logger.info("-" * 40)
    logger.info(f"Done. Success: {success}, Skipped: {skipped}, Errors: {errors}")


def main():
    """CLI entry point: pick a device, load the agent, dispatch by mode."""
    args = parse_args()
    device = _select_device(args.device)
    logger.info(f"Using device: {device}")
    os.makedirs(args.output, exist_ok=True)
    logger.info(f"Initializing AgentF5TTS with checkpoint: {args.checkpoint}")
    try:
        agent = AgentF5TTS(
            ckpt_file=args.checkpoint,
            vocoder_name=args.vocoder,
            device=device
        )
    except Exception as e:
        logger.error(f"Failed to initialize agent: {e}")
        return
    if args.text:
        _run_single_mode(agent, args)
    elif args.srt:
        _run_batch_mode(agent, args)
if __name__ == "__main__":
main()