initial commit
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +23 -0
- Dockerfile +42 -0
- build_and_run.sh +51 -0
- data/input/reference_audio/phase3_assamese_female_48k.wav +3 -0
- data/input/reference_audio/phase3_assamese_male_48k.wav +3 -0
- data/input/reference_audio/phase3_bengali_male_48k.wav +3 -0
- data/input/reference_audio/phase3_bodo_female_48k.wav +3 -0
- data/input/reference_audio/phase3_bodo_male_48k.wav +3 -0
- data/input/reference_audio/phase3_gujarati_female_48k.wav +3 -0
- data/input/reference_audio/phase3_gujarati_male_48k.wav +3 -0
- data/input/reference_audio/phase3_hindi_female_48k.wav +3 -0
- data/input/reference_audio/phase3_hindi_male_48k.wav +3 -0
- data/input/reference_audio/phase3_manipuri_female_48k.wav +3 -0
- data/input/reference_audio/phase3_manipuri_male_48k.wav +3 -0
- data/input/reference_audio/phase3_nepali_female_48k.wav +3 -0
- data/input/reference_audio/phase3_nepali_male_48k.wav +3 -0
- data/input/reference_audio/phase3_punjabi_female_48k.wav +3 -0
- data/input/reference_audio/phase3_punjabi_male_48k.wav +3 -0
- data/input/reference_audio/phase3_sanskrit_female_48k.wav +3 -0
- data/input/reference_audio/phase3_sanskrit_male_48k.wav +3 -0
- data/input/reference_audio/phase3_tamil_female_48k.wav +3 -0
- data/input/reference_audio/phase3_tamil_male_48k.wav +3 -0
- data/input/reference_audio/phase3_telugu_female_48k.wav +3 -0
- data/input/reference_audio/phase3_telugu_male_48k.wav +3 -0
- data/input/texts.txt +2 -0
- data/output/1_te_phase3_telugu_male_48k.wav +3 -0
- data/output/2_mr_phase3_manipuri_male_48k.wav +3 -0
- infer.py +132 -0
- model_related/Bengali_Female/speakers.pth +3 -0
- model_related/Bengali_Male/speakers.pth +3 -0
- model_related/Bhojpuri_Female/speakers.pth +3 -0
- model_related/Bhojpuri_Male/speakers.pth +3 -0
- model_related/Chhattisgarhi_Female/speakers.pth +3 -0
- model_related/Chhattisgarhi_Male/speakers.pth +3 -0
- model_related/English_Female/speakers.pth +3 -0
- model_related/English_Male/speakers.pth +3 -0
- model_related/Gujarati_Female/speakers.pth +3 -0
- model_related/Gujarati_Male/speakers.pth +3 -0
- model_related/Hindi_Female/speakers.pth +3 -0
- model_related/Hindi_Male/speakers.pth +3 -0
- model_related/Kannada_Female/speakers.pth +3 -0
- model_related/Kannada_Male/speakers.pth +3 -0
- model_related/Magahi_Female/speakers.pth +3 -0
- model_related/Magahi_Male/speakers.pth +3 -0
- model_related/Maithili_Female/speakers.pth +3 -0
- model_related/Maithili_Male/speakers.pth +3 -0
- model_related/Marathi_Female/speakers.pth +3 -0
- model_related/Marathi_Male/speakers.pth +3 -0
- model_related/Telugu_Female/speakers.pth +3 -0
- model_related/Telugu_Male/speakers.pth +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,26 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/input/reference_audio/phase3_assamese_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
data/input/reference_audio/phase3_assamese_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
data/input/reference_audio/phase3_bengali_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
data/input/reference_audio/phase3_bodo_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
data/input/reference_audio/phase3_bodo_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
data/input/reference_audio/phase3_gujarati_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
data/input/reference_audio/phase3_gujarati_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
data/input/reference_audio/phase3_hindi_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
data/input/reference_audio/phase3_hindi_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
data/input/reference_audio/phase3_manipuri_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
data/input/reference_audio/phase3_manipuri_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
data/input/reference_audio/phase3_nepali_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
data/input/reference_audio/phase3_nepali_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
data/input/reference_audio/phase3_punjabi_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
data/input/reference_audio/phase3_punjabi_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
data/input/reference_audio/phase3_sanskrit_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
data/input/reference_audio/phase3_sanskrit_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
data/input/reference_audio/phase3_tamil_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
data/input/reference_audio/phase3_tamil_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
data/input/reference_audio/phase3_telugu_female_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
data/input/reference_audio/phase3_telugu_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
data/output/1_te_phase3_telugu_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
data/output/2_mr_phase3_manipuri_male_48k.wav filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Inference image for the LIMMITS-24 Coqui-TTS baseline.
#
# The image contains the TTS package (installed from source) and infer.py.
# Model files, input data and output directories are NOT baked in; they are
# expected to be bind-mounted below /app at run time (see build_and_run.sh).
FROM pytorch/pytorch:2.0.1-cuda11.7-cudnn8-runtime

# Set working directory
WORKDIR /app

# Verify Python version meets requirements (>=3.9.0, <3.12).
# The version is printed for the build log and the build is aborted when it is
# out of range; a bare `python --version` would always succeed.
RUN python -c "import sys; print(sys.version); sys.exit(None if (3, 9) <= sys.version_info[:2] < (3, 12) else 'Python >=3.9,<3.12 is required')"

# Install system dependencies
RUN apt-get update && apt-get install -y \
        git \
        libsndfile1 \
        build-essential \
    && rm -rf /var/lib/apt/lists/*

# Install Cython first (crucial for building extensions)
RUN pip install --no-cache-dir Cython packaging

# Clone the repository
RUN git clone https://github.com/PranavDBhat/LIMMITS-24-Coquiai.git /app/LIMMITS-24-Coquiai

# Install only the main requirements (not dev/notebooks requirements)
# This directly uses the requirements.txt file from the repository
RUN cd /app/LIMMITS-24-Coquiai && \
    pip install --no-cache-dir -r requirements.txt

# Install the package in development mode
# This will build the Cython extensions
# (--no-cache-dir keeps the pip cache out of the image layer, like the steps above)
RUN cd /app/LIMMITS-24-Coquiai && \
    pip install --no-cache-dir -e .

# Create directories for models, input, and output
RUN mkdir -p /app/models /app/data/input /app/data/output

# Copy the inference script (kept last so editing it does not invalidate the
# expensive dependency layers above)
COPY infer.py /app/

# Set the entrypoint to run the inference script
ENTRYPOINT ["python", "infer.py"]

# Default command (can be overridden); arguments given to `docker run` replace it
CMD ["--help"]
|
build_and_run.sh
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
#
# Build the TTS inference image and run infer.py in a container over every
# entry of data/input/texts.txt.
#
# Paths are relative to the current directory, which must contain the
# Dockerfile:
#   models/best_model_479919.pth, models/config.json   model checkpoint + config
#   data/input/texts.txt                                tab-separated input entries
#   data/input/reference_audio/                         reference recordings
#   data/output/                                        synthesized audio is written here
#   model_related/                                      mounted at /app/model_related
#
# Environment:
#   TTS_DEVICE   "cuda" (default) or "cpu". With "cpu" the container is started
#                without GPU access and infer.py receives --device cpu.

TTS_DEVICE="${TTS_DEVICE:-cuda}"

# Ensure directories exist. model_related is included because it is
# bind-mounted below; a missing host path would otherwise be created by Docker.
mkdir -p models data/input/reference_audio data/output model_related

# Check if model files exist
if [ ! -f "models/best_model_479919.pth" ] || [ ! -f "models/config.json" ]; then
    echo "ERROR: Model files not found in models/ directory!"
    echo "Please place model files in models/ directory:"
    echo "- models/best_model_479919.pth"
    echo "- models/config.json"
    exit 1
fi

# Check if sample inputs exist
if [ ! -f "data/input/texts.txt" ]; then
    echo "WARNING: No texts.txt found. Creating sample file..."
    echo -e "1\tte\tవడ్రంగి, క్షురక వృత్తులలో పెట్టుబడి ప్రధానమై ఇతరులు కూడా ఈ వృత్తిలో ప్రవేశించి వ్యాపారంగా మార్చేసార\tspeaker1.wav" > data/input/texts.txt
    echo "Please add reference audio files to data/input/reference_audio/"
fi

# Build the Docker image; stop here on failure instead of running a missing or
# stale image.
echo "Building Docker image (this may take some time)..."
if ! docker build -t tts-baseline .; then
    echo "ERROR: Docker build failed." >&2
    exit 1
fi

echo ""
echo "Build complete!"
echo ""

# GPU access is only requested when not running on CPU.
gpu_args=()
if [ "$TTS_DEVICE" != "cpu" ]; then
    gpu_args=(--gpus all)
fi

docker run "${gpu_args[@]}" \
    -v "$(pwd)/models:/app/models" \
    -v "$(pwd)/data/input:/app/data/input" \
    -v "$(pwd)/data/output:/app/data/output" \
    -v "$(pwd)/model_related:/app/model_related" \
    tts-baseline \
    --text_file /app/data/input/texts.txt \
    --ref_dir /app/data/input/reference_audio \
    --savedir /app/data/output \
    --device "$TTS_DEVICE"

# For CPU inference run:  TTS_DEVICE=cpu ./build_and_run.sh
|
data/input/reference_audio/phase3_assamese_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cac46ed6d99adee425d8e56f26b94672632f77b8bb4e9149538327625bf8590f
|
| 3 |
+
size 878636
|
data/input/reference_audio/phase3_assamese_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2ae4ba4cd789dc4a260d2ee96ac3b56250545d148bc1b8a5ad306c32fae87b64
|
| 3 |
+
size 622636
|
data/input/reference_audio/phase3_bengali_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5774878765083e411d9d45eb1e593b2b593114d469411082b40172c36990589b
|
| 3 |
+
size 645164
|
data/input/reference_audio/phase3_bodo_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5aa6b50a42e585f8a5165c0d1364f7867b178eb3c77f28cb77cc5649ff1246b7
|
| 3 |
+
size 585772
|
data/input/reference_audio/phase3_bodo_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c7cbbab9e5fdd8f0d213278a9055ee248ef25e23489d497036c049bba33d5462
|
| 3 |
+
size 528428
|
data/input/reference_audio/phase3_gujarati_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:92bbae9ecf28b340056a68a0c41f1827a16b6ad90d04ef5fb1466044ce8342d3
|
| 3 |
+
size 438316
|
data/input/reference_audio/phase3_gujarati_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ca7b082fc6ffa75a001b462b144881c67b3a574b6fc23ffb89b2c4e0d7a9a6db
|
| 3 |
+
size 452652
|
data/input/reference_audio/phase3_hindi_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84b8ca1f8cf8d14eab7f10887b5ea02d5fe91a135aef9b11b572d9fa2af1a289
|
| 3 |
+
size 520236
|
data/input/reference_audio/phase3_hindi_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fdbda05aa4e2cbfa1947a16416cc9cca2af29a6a02a947204d41ebaf31470fbc
|
| 3 |
+
size 428076
|
data/input/reference_audio/phase3_manipuri_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ebbddaa5903c08c85a190ba7dbbacfa2ed0f3e81f960de2bf35053eb5efad732
|
| 3 |
+
size 696364
|
data/input/reference_audio/phase3_manipuri_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d3d91517ef59af3ad21592f7046c6c3f9ca20844804473caa578dd89c69f5fcd
|
| 3 |
+
size 688172
|
data/input/reference_audio/phase3_nepali_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:484f636799da0a1c7c5b24062aa53c55de5bed3ad0b7fe769b8b4dacc61e32ea
|
| 3 |
+
size 628780
|
data/input/reference_audio/phase3_nepali_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a21ef0dcf83df7a98c9522181bb0e6af121c58be58dedf989ad62b305c8fcc76
|
| 3 |
+
size 518188
|
data/input/reference_audio/phase3_punjabi_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9382ebca3054cec66e70d3264622d8896d17bcd46ec19ab17c8334fc3d0097ef
|
| 3 |
+
size 413740
|
data/input/reference_audio/phase3_punjabi_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fb6c66455e0669f22a24f0b06ea4457337dd0a9f4bfc3e253d58a2f9a0953da8
|
| 3 |
+
size 378924
|
data/input/reference_audio/phase3_sanskrit_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:382581b53343e6bcad461cec7f24a687d9439b4388791396021b34473bd396dc
|
| 3 |
+
size 1557548
|
data/input/reference_audio/phase3_sanskrit_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c8dda08b50cd05d50b20558ada30516c483464b8241fe81a33471411b841737f
|
| 3 |
+
size 1234988
|
data/input/reference_audio/phase3_tamil_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f9a67918b9466c725329998bdaed0b2279b12b9dfe43f88fd1613c8fe6411eb
|
| 3 |
+
size 2347052
|
data/input/reference_audio/phase3_tamil_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36990873db9fedce95bd28d78ae5689b02459ebdee9cbbd81e5f947da0be08af
|
| 3 |
+
size 2007084
|
data/input/reference_audio/phase3_telugu_female_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:baa4775b307b9a5c80b7c6b61e302adcbef6b288890c4d54efa903f0a2f89a19
|
| 3 |
+
size 571436
|
data/input/reference_audio/phase3_telugu_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3b72435776f2ee7ea0ac0b6fa14419436846cf303a5ad1343e23f071c6dc34f9
|
| 3 |
+
size 499756
|
data/input/texts.txt
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
1 te వడ్రంగి, క్షురక వృత్తులలో పెట్టుబడి ప్రధానమై ఇతరులు కూడా ఈ వృత్తిలో ప్రవేశించి వ్యాపారంగా మార్చేసార phase3_telugu_male_48k.wav
|
| 2 |
+
2 mr जायकवाडी धरणातून तब्बल अडीच ते तीन लाख हेक्टर शेतीच्या सिंचनासाठी पाणी सोडलं जातं phase3_manipuri_male_48k.wav
|
data/output/1_te_phase3_telugu_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d92f6ba7245ff2a222132eb4bfb6cf23239ff15f3f71b5dcfad9e3addc5f1454
|
| 3 |
+
size 425036
|
data/output/2_mr_phase3_manipuri_male_48k.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73dd8edae221d33baaa2276bb364e9e836d1ee39dffc8a379668bc2256796c99
|
| 3 |
+
size 499276
|
infer.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Batch voice-cloning inference with a Coqui-TTS checkpoint.

Reads a tab-separated text file whose lines have the form
``id<TAB>language<TAB>text<TAB>reference_wav``, synthesizes every entry using
the reference recording as the speaker sample, and writes the result to
``<savedir>/<id>_<language>_<basename of reference_wav>``.
"""
import argparse
import os


def build_parser():
    """Create the command-line parser for the inference script."""
    parser = argparse.ArgumentParser(description="Text-to-Speech Synthesis")
    parser.add_argument('-t', '--text_file', type=str, required=True,
                        help='Path to text file containing text and audio reference files')
    parser.add_argument('-r', '--ref_dir', type=str, required=True,
                        help='Root directory containing reference audio files')
    parser.add_argument('-s', '--savedir', type=str, required=True,
                        help='Directory to store synthesized audio files')
    parser.add_argument('-d', '--device', type=str, required=True,
                        help='Device to use for synthesis (cpu or cuda)')
    parser.add_argument('-m', '--model_path', type=str,
                        default="/app/models/best_model_479919.pth",
                        help='Path to the model file')
    parser.add_argument('-c', '--config_path', type=str,
                        default="/app/models/config.json",
                        help='Path to the config file')
    return parser


def resolve_device(requested, cuda_available):
    """Map the ``--device`` argument to the device string that is used.

    Args:
        requested: Value given on the command line.
        cuda_available: Whether CUDA can be used on this machine.

    Returns:
        ``requested`` unchanged when it is ``"cpu"``, ``"cuda"`` or an indexed
        CUDA device such as ``"cuda:1"``; for any other value ``"cuda"`` when
        CUDA is available and ``"cpu"`` otherwise.
    """
    if requested in ("cpu", "cuda"):
        return requested
    # Honour an explicit GPU index instead of silently collapsing it to "cuda".
    prefix, sep, index = requested.partition(":")
    if prefix == "cuda" and sep and index.isdigit():
        return requested
    return "cuda" if cuda_available else "cpu"


def parse_entry(line):
    """Split one line of the text file into its four fields.

    Args:
        line: Raw line, with or without the trailing newline.

    Returns:
        ``(idx, lang, text, ref_file)`` with surrounding whitespace removed
        from every field, or ``None`` when the line does not consist of
        exactly four tab-separated parts.
    """
    parts = line.strip().split('\t')
    if len(parts) != 4:
        return None
    return tuple(part.strip() for part in parts)


def main():
    """Parse the arguments, load the model and synthesize every entry."""
    args = build_parser().parse_args()

    # Heavy third-party imports are deferred until the arguments are known to
    # be valid, so `--help` and usage errors return without loading them.
    import torch
    from tqdm import tqdm
    from TTS.api import TTS

    # Get device
    device = resolve_device(args.device, torch.cuda.is_available())
    print(f"Using device: {device}")

    # Initialize TTS model
    print(f"Loading model from {args.model_path} with config {args.config_path}")
    tts = TTS(
        model_path=args.model_path,
        config_path=args.config_path,
        progress_bar=False,
    ).to(device)

    # Create output directory
    os.makedirs(args.savedir, exist_ok=True)
    print(f"Output directory: {args.savedir}")

    # Read the text file. The encoding is explicit because the entries contain
    # non-ASCII text; "utf-8-sig" also accepts a file that starts with a BOM.
    print(f"Reading text file: {args.text_file}")
    with open(args.text_file, 'r', encoding='utf-8-sig') as f:
        lines = f.readlines()

    # Process each line
    print(f"Processing {len(lines)} entries...")
    skipped = 0
    for i, line in enumerate(tqdm(lines)):
        # Blank lines (e.g. a trailing newline at end of file) are not entries.
        if not line.strip():
            continue

        entry = parse_entry(line)
        if entry is None:
            print(f"Warning: Line {i+1} does not have 4 tab-separated parts. Skipping.")
            skipped += 1
            continue

        idx, lang, text, ref_file = entry
        ref_path = os.path.join(args.ref_dir, ref_file)
        # One missing recording should not abort the rest of the batch.
        if not os.path.isfile(ref_path):
            print(f"Warning: Line {i+1} reference audio not found: {ref_path}. Skipping.")
            skipped += 1
            continue
        save_path = os.path.join(args.savedir, f"{idx}_{lang}_{os.path.basename(ref_file)}")

        print(f"Synthesizing: {text[:30]}... using reference {ref_path}")
        tts.tts_to_file(text=text, speaker_wav=ref_path, language=lang, file_path=save_path)
        print(f"Saved to: {save_path}")

    if skipped:
        print(f"Skipped {skipped} entries; see warnings above.")
    print("Synthesis complete!")


if __name__ == "__main__":
    main()
|
| 64 |
+
|
| 65 |
+
# import torch
|
| 66 |
+
# from TTS.api import TTS
|
| 67 |
+
# import os
|
| 68 |
+
# from tqdm import tqdm
|
| 69 |
+
# import argparse
|
| 70 |
+
|
| 71 |
+
# # Get device
|
| 72 |
+
# device = "cuda:3" if torch.cuda.is_available() else "cpu"
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# sentences_dict = {
|
| 76 |
+
# "te": ["వడ్రంగి, క్షురక వృత్తులలో పెట్టుబడి ప్రధానమై ఇతరులు కూడా ఈ వృత్తిలో ప్రవేశించి వ్యాపారంగా మార్చేసార",
|
| 77 |
+
# "నేను ఈ రోజు నాకు ఇష్టమైన పుస్తకాన్ని చదివాను మరియు తరువాత నా స్నేహితుడితో సినిమాకు వెళ్ళాను",
|
| 78 |
+
# "ఈ వేసవి సెలవులలో నేను నా కుటుంబంతో కలిసి ఒక అందమైన బీచ్కి వెళ్ళాలని అనుకుంటున్నాను"],
|
| 79 |
+
# "mr": ["जायकवाडी धरणातून तब्बल अडीच ते तीन लाख हेक्टर शेतीच्या सिंचनासाठी पाणी सोडलं जातं",
|
| 80 |
+
# "मी आज माझ्या आवडत्या पुस्तकाचे वाचन केले आणि नंतर माझ्या मित्रासोबत चित्रपटाला गेलो",
|
| 81 |
+
# "या उन्हाळी सुट्टीत मी माझ्या कुटुंबासोबत एक सुंदर समुद्रकिनाऱ्यावर जाण्याचा विचार करतो"],
|
| 82 |
+
# "bho": ["बिहार के बक्सर जिला के बक्सर नगर निगम क्षेत्र में गंगा नदी पर बने बक्सर पुल का उद्घाटन आज प्रधानमंत्री नरेंद्र मोदी करेंगे",
|
| 83 |
+
# "एन्ट्रापी कंप्यूटिंग में एन्ट्रोपी ऊ ऑपरेटिंग सिस्टम ह जे पे सरा क्रिप्टोग्राफिक फंक्शन सब काम करे लें",
|
| 84 |
+
# "हमार मंडराये वाली जहाज़ सर्पमीनन से भरी है"],
|
| 85 |
+
# }
|
| 86 |
+
|
| 87 |
+
# tts = TTS(
|
| 88 |
+
# model_path="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/yourtts_syspin_baseline-April-19-2025_10+55AM-0b13ea658/best_model_479919.pth",
|
| 89 |
+
# config_path="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/yourtts_syspin_baseline-April-19-2025_10+55AM-0b13ea658/config.json",
|
| 90 |
+
# progress_bar=False,
|
| 91 |
+
# ).to(device)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# parser = argparse.ArgumentParser(description="Text-to-Speech Synthesis")
|
| 95 |
+
# parser.add_argument('-t', '--text_file', type=str, required=True,
|
| 96 |
+
# help='Path to text file containing text and audio reference files')
|
| 97 |
+
# parser.add_argument('-r', '--ref_dir', type=str, required=True,
|
| 98 |
+
# help='Root directory containing reference audio files')
|
| 99 |
+
# parser.add_argument('-s', '--savedir', type=str, required=True,
|
| 100 |
+
# help='Directory to store synthesized audio files')
|
| 101 |
+
# parser.add_argument('-d', '--device', type=str, required=True,
|
| 102 |
+
# help='Device to use for synthesis (cpu or cuda)')
|
| 103 |
+
|
| 104 |
+
# args = parser.parse_args()
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
# os.makedirs(args.savedir, exist_ok=True)
|
| 110 |
+
|
| 111 |
+
# # Read the text file
|
| 112 |
+
# with open(args.text_file, 'r') as f:
|
| 113 |
+
# lines = f.readlines()
|
| 114 |
+
|
| 115 |
+
# for line in lines:
|
| 116 |
+
# idx, lang, text, ref_file = line.strip().split('\t')
|
| 117 |
+
# ref_file = os.path.join(args.ref_dir, ref_file)
|
| 118 |
+
# save_path = os.path.join(args.savedir, f"{idx}_{lang}_{os.path.basename(ref_file)}")
|
| 119 |
+
# tts.tts_to_file(text=text, speaker_wav=ref_file, language=lang, file_path=save_path)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
# # ref_files = [os.path.join("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_samples/", x) for x in os.listdir("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_samples/")]
|
| 123 |
+
|
| 124 |
+
|
| 125 |
+
|
| 126 |
+
# # for ref_file in ref_files:
|
| 127 |
+
# # for language_key in sentences_dict.keys():
|
| 128 |
+
# # for s_idx, sentence in enumerate(sentences_dict[language_key]):
|
| 129 |
+
# # save_path = os.path.join("/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/test_infers/", f"test_{language_key}_{s_idx}_{os.path.basename(ref_file)}")
|
| 130 |
+
# # tts.tts_to_file(text=sentence, speaker_wav=ref_file, language=language_key, file_path=save_path)
|
| 131 |
+
|
| 132 |
+
# # tts.tts_to_file(text="ಹಸ್ದೇವ್ ನದಿ, ರಿಹಂಡ್ ನದಿ ಮತ್ತು ಕನ್ಹರ್ ನದಿಗಳು ಸುರ್ಗುಜಾದ ಮುಖಜ ಭೂಮಿಯಲ್ಲಿ ಹರಿಯುತ್ತವೆ.", speaker_wav="/home1/jesuraj/speechlm/espnet/egs2/LIMMITS_25/speechlm1/downloads/syspin_data/Chhattisgarhi_Male/wavs/IISc_SYSPINProject_chha_m_AGRI_00001.wav", language="kn", file_path="test_kn.wav")
|
model_related/Bengali_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:56f54ff1a806ebe607b0095b489f553546cce0b9904d38c531e78e4a86f09e67
|
| 3 |
+
size 100098080
|
model_related/Bengali_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a860399950439f46083e8e6e7e5c057e10cd7994c6232a09b33fd85f0dd1c1cb
|
| 3 |
+
size 119368032
|
model_related/Bhojpuri_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:80b05743e5e5c2d8cc10ac96a7e66006f7ed046fc7c7663f83971607afed31e3
|
| 3 |
+
size 129451808
|
model_related/Bhojpuri_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:60b03f732526e59e5165fe17c243fdb3338dc69d6b4539cd94009e6152955511
|
| 3 |
+
size 123140960
|
model_related/Chhattisgarhi_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3ab3dc8e3a8946d1a170030f7da2a829441be4a70d9dd9f9689d03360d0bc876
|
| 3 |
+
size 130934816
|
model_related/Chhattisgarhi_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:973c6233369aade6066051802d3d8165c9199be7bc0144ea0a4ffee18cb315cc
|
| 3 |
+
size 108560992
|
model_related/English_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ef4b8a8c9e23c9ab0f2e0c053ce85e55b0f3ea34e23dccf2b463502218b7ba1f
|
| 3 |
+
size 113899296
|
model_related/English_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f1a36e7af9cd38a35a70227b25ec4de0ea9cfdf171bf445bec5731abb65eded4
|
| 3 |
+
size 118719648
|
model_related/Gujarati_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:81e435824571f839e07984422a3528cb6243da654137560df67b03d2858a75b5
|
| 3 |
+
size 35408544
|
model_related/Gujarati_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:315f8348df2662c0d04297df825a02190108ede342b3b2926fb9c83b22cbb3cc
|
| 3 |
+
size 35875360
|
model_related/Hindi_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d36722d9cfa2714f7682d8cd932182b0ecf71663151131a12db01352c6d32672
|
| 3 |
+
size 103680032
|
model_related/Hindi_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ea4a7831a9fa5961f56bbab0d97d63df34a8189fad2d5294dee2effc5a4e2f24
|
| 3 |
+
size 116714528
|
model_related/Kannada_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:12ac883789029ddab4f2f6cf0340a0ad52210035513ba2a6676f21f6438a4ee3
|
| 3 |
+
size 80812384
|
model_related/Kannada_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:2af52780734b88535adff48246df32ca4e5b2cf20f0c9f7efcb849d8f024eba4
|
| 3 |
+
size 97633824
|
model_related/Magahi_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8000b755a4ffd5e6b0a95e2a06dabd02d6f59dbae46776d9a662e214f7f93fb9
|
| 3 |
+
size 145807392
|
model_related/Magahi_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b7a9007a83759fb8f1cd4bcef79a5b083839d09e1703484caa9d727dd5a1bce8
|
| 3 |
+
size 153767712
|
model_related/Maithili_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a4f587283f8604bcc95cba0b6e2320041619b76a715b6b0c4253a584acc595a2
|
| 3 |
+
size 163039520
|
model_related/Maithili_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e7f910296937f346e330e06fc5c7c0afcad3f05f252afd3e941711f2557d197b
|
| 3 |
+
size 152221920
|
model_related/Marathi_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7e67881879aac91f6653e96f559af5129e023e2879f76fa7bc8e24b77a8236b3
|
| 3 |
+
size 103038496
|
model_related/Marathi_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7bde8d12edc5745f6c1151f7a0e53bce657a416ad16259a98da14480e330472d
|
| 3 |
+
size 98007520
|
model_related/Telugu_Female/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70be39a4e6967100c91ce142d617d1ee23fbba24bd84141bfc67cc71a3cd6ed8
|
| 3 |
+
size 103993632
|
model_related/Telugu_Male/speakers.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b419a9b129289c861f6d2f6d8e0f176e9a245879b5a0c43e5b75ccf0ad9ed93d
|
| 3 |
+
size 101432480
|