# Syspin_Hackathon / infer.py
# Repository page header: SpireLab's picture · initial commit · b28205b (verified)
import torch
from TTS.api import TTS
import os
from tqdm import tqdm
import argparse
def build_parser():
    """Build the command-line parser for the synthesis script.

    Returns:
        argparse.ArgumentParser: parser exposing the required options
        -t/--text_file, -r/--ref_dir, -s/--savedir and -d/--device, plus the
        optional -m/--model_path and -c/--config_path, which default to the
        files under /app/models.
    """
    parser = argparse.ArgumentParser(description="Text-to-Speech Synthesis")
    parser.add_argument('-t', '--text_file', type=str, required=True,
                        help='Path to text file containing text and audio reference files')
    parser.add_argument('-r', '--ref_dir', type=str, required=True,
                        help='Root directory containing reference audio files')
    parser.add_argument('-s', '--savedir', type=str, required=True,
                        help='Directory to store synthesized audio files')
    parser.add_argument('-d', '--device', type=str, required=True,
                        help='Device to use for synthesis (cpu or cuda)')
    parser.add_argument('-m', '--model_path', type=str,
                        default="/app/models/best_model_479919.pth",
                        help='Path to the model file')
    parser.add_argument('-c', '--config_path', type=str,
                        default="/app/models/config.json",
                        help='Path to the config file')
    return parser


def resolve_device(requested):
    """Map the user-supplied device string to a device that can actually be used.

    Args:
        requested: value of the --device option.

    Returns:
        "cpu" or "cuda". "cpu" is always honoured. "cuda" is honoured only
        when torch reports CUDA as available; otherwise a warning is printed
        and "cpu" is returned instead of failing later while moving the model.
        Any other value is replaced by "cuda" when available, else "cpu", and
        a warning is printed so the substitution is visible.
    """
    if requested == "cpu":
        return "cpu"

    cuda_available = torch.cuda.is_available()
    if requested == "cuda":
        if cuda_available:
            return "cuda"
        print("Warning: CUDA requested but not available. Falling back to cpu.")
        return "cpu"

    fallback = "cuda" if cuda_available else "cpu"
    print(f"Warning: Unrecognized device '{requested}'. Using {fallback}.")
    return fallback


def parse_entry(line):
    """Split one manifest line into its four tab-separated fields.

    Args:
        line: raw line from the text file, with or without its line ending.

    Returns:
        A tuple (idx, lang, text, ref_file) with surrounding whitespace
        removed from every field, or None when the line does not contain
        exactly four non-empty fields.
    """
    fields = [field.strip() for field in line.rstrip("\r\n").split("\t")]
    if len(fields) != 4 or not all(fields):
        return None
    return tuple(fields)


def main():
    """Synthesize one audio file per manifest entry.

    Reads the tab-separated text file given on the command line, and for each
    entry calls the TTS model with the entry's text, language and reference
    audio, writing the result into the output directory. Malformed lines are
    skipped with a warning. Entries whose reference audio is missing or whose
    synthesis raises are reported and counted, and the remaining entries are
    still processed.

    Raises:
        SystemExit: with status 1 when at least one entry failed.
    """
    args = build_parser().parse_args()

    # Get device
    device = resolve_device(args.device)
    print(f"Using device: {device}")

    # Initialize TTS model
    print(f"Loading model from {args.model_path} with config {args.config_path}")
    tts = TTS(
        model_path=args.model_path,
        config_path=args.config_path,
        progress_bar=False,
    ).to(device)

    # Create output directory
    os.makedirs(args.savedir, exist_ok=True)
    print(f"Output directory: {args.savedir}")

    # Read the text file. The encoding is explicit because the manifest holds
    # non-ASCII text and the platform default encoding is not always UTF-8.
    print(f"Reading text file: {args.text_file}")
    with open(args.text_file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    # Process each line
    print(f"Processing {len(lines)} entries...")
    failed = 0
    for i, line in enumerate(tqdm(lines)):
        # Blank lines (e.g. a trailing newline at end of file) are not entries.
        if not line.strip():
            continue

        entry = parse_entry(line)
        if entry is None:
            print(f"Warning: Line {i+1} does not have 4 tab-separated parts. Skipping.")
            continue

        idx, lang, text, ref_file = entry
        ref_path = os.path.join(args.ref_dir, ref_file)
        if not os.path.isfile(ref_path):
            print(f"Warning: Line {i+1} reference audio not found: {ref_path}. Skipping.")
            failed += 1
            continue

        save_path = os.path.join(args.savedir, f"{idx}_{lang}_{os.path.basename(ref_file)}")
        print(f"Synthesizing: {text[:30]}... using reference {ref_path}")
        try:
            tts.tts_to_file(text=text, speaker_wav=ref_path, language=lang, file_path=save_path)
        except Exception as exc:
            # Batch boundary: one bad entry must not discard the rest of the
            # run; the failure is reported here and reflected in the exit code.
            print(f"Error: Line {i+1} synthesis failed: {exc}")
            failed += 1
            continue
        print(f"Saved to: {save_path}")

    if failed:
        print(f"Synthesis finished with {failed} failed entries.")
        raise SystemExit(1)
    print("Synthesis complete!")


if __name__ == "__main__":
    main()
# import torch
# from TTS.api import TTS
# import os
# from tqdm import tqdm
# import argparse
# # Get device
# device = "cuda:3" if torch.cuda.is_available() else "cpu"
# sentences_dict = {
# "te": ["వడ్రంగి, క్షురక వృత్తులలో పెట్టుబడి ప్రధానమై ఇతరులు కూడా ఈ వృత్తిలో ప్రవేశించి వ్యాపారంగా మార్చేసార",
# "నేను ఈ రోజు నాకు ఇష్టమైన పుస్తకాన్ని చదివాను మరియు తరువాత నా స్నేహితుడితో సినిమాకు వెళ్ళాను",
# "ఈ వేసవి సెలవులలో నేను నా కుటుంబంతో కలిసి ఒక అందమైన బీచ్‌కి వెళ్ళాలని అనుకుంటున్నాను"],
# "mr": ["जायकवाडी धरणातून तब्बल अडीच ते तीन लाख हेक्टर शेतीच्या सिंचनासाठी पाणी सोडलं जातं",
# "मी आज माझ्या आवडत्या पुस्तकाचे वाचन केले आणि नंतर माझ्या मित्रासोबत चित्रपटाला गेलो",
# "या उन्हाळी सुट्टीत मी माझ्या कुटुंबासोबत एक सुंदर समुद्रकिनाऱ्यावर जाण्याचा विचार करतो"],
# "bho": ["बिहार के बक्सर जिला के बक्सर नगर निगम क्षेत्र में गंगा नदी पर बने बक्सर पुल का उद्घाटन आज प्रधानमंत्री नरेंद्र मोदी करेंगे",
# "एन्ट्रापी कंप्यूटिंग में एन्ट्रोपी ऊ ऑपरेटिंग सिस्टम ह जे पे सरा क्रिप्टोग्राफिक फंक्शन सब काम करे लें",
# "हमार मंडराये वाली जहाज़ सर्पमीनन से भरी है"],
# }
# tts = TTS(
# model_path="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/yourtts_syspin_baseline-April-19-2025_10+55AM-0b13ea658/best_model_479919.pth",
# config_path="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/yourtts_syspin_baseline-April-19-2025_10+55AM-0b13ea658/config.json",
# progress_bar=False,
# ).to(device)
# parser = argparse.ArgumentParser(description="Text-to-Speech Synthesis")
# parser.add_argument('-t', '--text_file', type=str, required=True,
# help='Path to text file containing text and audio reference files')
# parser.add_argument('-r', '--ref_dir', type=str, required=True,
# help='Root directory containing reference audio files')
# parser.add_argument('-s', '--savedir', type=str, required=True,
# help='Directory to store synthesized audio files')
# parser.add_argument('-d', '--device', type=str, required=True,
# help='Device to use for synthesis (cpu or cuda)')
# args = parser.parse_args()
# os.makedirs(args.savedir, exist_ok=True)
# # Read the text file
# with open(args.text_file, 'r') as f:
# lines = f.readlines()
# for line in lines:
# idx, lang, text, ref_file = line.strip().split('\t')
# ref_file = os.path.join(args.ref_dir, ref_file)
# save_path = os.path.join(args.savedir, f"{idx}_{lang}_{os.path.basename(ref_file)}")
# tts.tts_to_file(text=text, speaker_wav=ref_file, language=lang, file_path=save_path)
# # ref_files = [os.path.join("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_samples/", x) for x in os.listdir("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_samples/")]
# # for ref_file in ref_files:
# # for language_key in sentences_dict.keys():
# # for s_idx, sentence in enumerate(sentences_dict[language_key]):
# # save_path = os.path.join("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_infers/", f"test_{language_key}_{s_idx}_{os.path.basename(ref_file)}")
# # tts.tts_to_file(text=sentence, speaker_wav=ref_file, language=language_key, file_path=save_path)
# # tts.tts_to_file(text="ಹಸ್ದೇವ್ ನದಿ, ರಿಹಂಡ್ ನದಿ ಮತ್ತು ಕನ್ಹರ್ ನದಿಗಳು ಸುರ್ಗುಜಾದ ಮುಖಜ ಭೂಮಿಯಲ್ಲಿ ಹರಿಯುತ್ತವೆ.", speaker_wav="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/syspin_data/Chhattisgarhi_Male/wavs/IISc_SYSPINProject_chha_m_AGRI_00001.wav", language="kn", file_path="test_kn.wav")