aryo100 commited on
Commit
fbb4cab
·
1 Parent(s): 55da47c

first commit

Browse files
Dockerfile ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Dockerfile untuk TTS Bahasa Indonesia API
FROM python:3.10-slim

# Set working directory
WORKDIR /app

# Install system dependencies (curl is required by the HEALTHCHECK below)
RUN apt-get update && apt-get install -y \
    build-essential \
    curl \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first for layer caching
COPY requirements.txt .

# Install Python dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy application files
COPY . .

# Create outputs directory
RUN mkdir -p outputs

# Expose port (default 7860, overridable via the PORT environment variable)
EXPOSE 7860

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV PORT=7860

# Health check for Docker deployments (incl. HuggingFace Spaces Docker SDK).
# start-period 60s gives the TTS model time to load.
# Fix: probe ${PORT} (shell-form CMD expands env vars) instead of a
# hard-coded 7860, so overriding PORT does not break the health check.
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD curl -f http://localhost:${PORT:-7860}/api/health || exit 1

# Run the application
CMD ["python", "api.py"]
api.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ FastAPI Server untuk TTS Bahasa Indonesia
4
+ Berdasarkan tts_standalone.py
5
+
6
+ API ini menyediakan endpoint untuk menghasilkan audio dari teks Bahasa Indonesia
7
+ dengan menggunakan model TTS yang sudah dilatih khusus untuk bahasa Indonesia.
8
+
9
+ Fitur:
10
+ - Limit maksimal 10 file di folder outputs
11
+ - Auto cleanup file lama ketika melebihi limit
12
+ - Kompatibel dengan HuggingFace Spaces
13
+
14
+ Endpoints:
15
+ - POST /api/tts: Generate audio dari teks
16
+ - GET /api/health: Health check endpoint
17
+ - GET /api/speakers: Daftar speaker tersedia
18
+ - GET /api/download/{file_path}: Download file audio
19
+ """
20
+
21
+ from fastapi import FastAPI, HTTPException, File, UploadFile
22
+ from fastapi.responses import FileResponse, JSONResponse
23
+ from fastapi.middleware.cors import CORSMiddleware
24
+ from pydantic import BaseModel, Field
25
+ from typing import Optional
26
+ import uuid
27
+ from pathlib import Path
28
+ import os
29
+ from datetime import datetime
30
+ from tts_standalone import generate_tts, params
31
+
32
+ app = FastAPI(
33
+ title="TTS Bahasa Indonesia API",
34
+ description="API untuk Text-to-Speech Bahasa Indonesia",
35
+ version="1.0.0"
36
+ )
37
+
38
+ # Enable CORS
39
+ app.add_middleware(
40
+ CORSMiddleware,
41
+ allow_origins=["*"],
42
+ allow_credentials=True,
43
+ allow_methods=["*"],
44
+ allow_headers=["*"],
45
+ )
46
+
47
+ # Direktori untuk menyimpan file audio hasil TTS
48
+ OUTPUT_DIR = Path("outputs")
49
+ OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
50
+
51
+ # Limit maksimal file yang disimpan
52
+ MAX_FILES = 10
53
+
54
+
55
+ # Pydantic models untuk request/response
56
+ class TTSRequest(BaseModel):
57
+ text: str = Field(..., description="Teks yang akan diubah menjadi suara")
58
+ speaker: Optional[str] = Field(default=None, description="Nama speaker (default: dari config)")
59
+ file_path: Optional[str] = Field(default=None, description="Custom file path (optional)")
60
+
61
+
62
+ class TTSResponse(BaseModel):
63
+ success: bool
64
+ message: str
65
+ file_path: Optional[str] = None
66
+ download_url: Optional[str] = None
67
+ text: Optional[str] = None
68
+ speaker: Optional[str] = None
69
+
70
+
71
+ class HealthResponse(BaseModel):
72
+ status: str
73
+ message: str
74
+
75
+
def cleanup_old_files(max_files: int = MAX_FILES):
    """
    Keep the outputs directory within the configured file limit.

    When the number of ``.wav`` files exceeds *max_files*, the oldest
    files (by modification time) are deleted. One extra file is removed
    so the next generated file does not immediately exceed the limit.

    Args:
        max_files: Maximum number of files allowed in the directory.
    """
    try:
        # Oldest first, ordered by modification time.
        wav_files = sorted(OUTPUT_DIR.glob("*.wav"), key=lambda p: p.stat().st_mtime)

        if len(wav_files) <= max_files:
            return

        # Remove enough files to leave max_files - 1 behind.
        for stale in wav_files[: len(wav_files) - max_files + 1]:
            try:
                stale.unlink()
                print(f"Deleted old file: {stale}")
            except Exception as e:
                print(f"Error deleting file {stale}: {e}")

    except Exception as e:
        print(f"Error in cleanup_old_files: {e}")
106
+
107
+
@app.get("/api/health", response_model=HealthResponse)
async def health_check():
    """Liveness probe: report that the API process is up and serving."""
    return HealthResponse(status="healthy", message="TTS API is running")
117
+
118
+
@app.post("/api/tts", response_model=TTSResponse)
async def tts_endpoint(request: TTSRequest):
    """
    Generate a TTS audio file from Indonesian text.

    Request body (JSON):
        {
            "text": "Teks yang akan diubah menjadi suara",   # required, non-empty
            "speaker": "gadis",                              # optional, default from config
            "file_path": "custom_filename.wav"               # optional custom output name
        }

    Returns:
        TTSResponse with the output file path and a download URL.

    Raises:
        HTTPException 400: empty text or unsafe custom file name.
        HTTPException 500: TTS generation failed or unexpected error.
    """
    try:
        # Reject empty / whitespace-only input early.
        if not request.text or not request.text.strip():
            raise HTTPException(
                status_code=400,
                detail="Parameter 'text' wajib diisi dan tidak boleh kosong"
            )

        # Fall back to the configured default speaker.
        speaker = request.speaker or params.get("speaker", "gadis")

        if request.file_path:
            # Security fix: keep only the base name so a client-supplied
            # path such as "../../etc/cron.d/x" cannot escape OUTPUT_DIR.
            safe_name = Path(request.file_path).name
            if not safe_name:
                raise HTTPException(status_code=400, detail="Invalid file path")
            if not safe_name.endswith(".wav"):
                safe_name += ".wav"
            file_path = OUTPUT_DIR / safe_name
        else:
            # Random short name: <speaker>-<8 hex chars>.wav
            short_uuid = str(uuid.uuid4())[:8]
            file_path = OUTPUT_DIR / f"{speaker}-{short_uuid}.wav"

        # Prune old files before generating a new one.
        cleanup_old_files(MAX_FILES)

        # Generate TTS
        success = generate_tts(request.text, str(file_path), speaker)

        if success:
            # Prune again in case the new file pushed us over the limit.
            cleanup_old_files(MAX_FILES)

            return TTSResponse(
                success=True,
                message="Audio berhasil dibuat",
                file_path=str(file_path),
                download_url=f"/api/download/{file_path}",
                text=request.text,
                speaker=speaker
            )
        else:
            raise HTTPException(
                status_code=500,
                detail="Gagal membuat audio. Periksa log untuk detail."
            )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(
            status_code=500,
            detail=f"Internal server error: {str(e)}"
        )
184
+
185
+
@app.get("/api/download/{file_path:path}")
async def download_file(file_path: str):
    """
    Serve a generated audio file.

    Args:
        file_path: Path relative to the app root
            (e.g. ``outputs/gadis-abc12345.wav``).

    Raises:
        HTTPException 400: the path resolves outside the outputs directory.
        HTTPException 404: the file does not exist.
        HTTPException 500: unexpected error.
    """
    try:
        requested = Path(file_path)

        # Security: the resolved path must live inside OUTPUT_DIR;
        # relative_to raises ValueError for anything outside it.
        try:
            requested.resolve().relative_to(OUTPUT_DIR.resolve())
        except ValueError:
            raise HTTPException(status_code=400, detail="Invalid file path")

        if not requested.exists():
            raise HTTPException(status_code=404, detail="File tidak ditemukan")

        return FileResponse(
            path=str(requested),
            filename=requested.name,
            media_type="audio/wav",
        )

    except HTTPException:
        raise
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error: {str(e)}")
225
+
226
+
@app.get("/api/speakers")
async def get_speakers():
    """Return the available speakers plus the configured default speaker id."""
    catalog = [
        ("wibowo", "Wibowo - Suara jantan berwibawa"),
        ("ardi", "Ardi - Suara lembut dan hangat"),
        ("gadis", "Gadis - Suara perempuan yang merdu"),
        ("JV-00264", "Juminten - Suara perempuan jawa (bahasa jawa)"),
        ("SU-00060", "Asep - Suara lelaki sunda (bahasa sunda)"),
    ]
    return {
        "success": True,
        "speakers": [{"id": sid, "name": label} for sid, label in catalog],
        "default": params.get("speaker", "gadis"),
    }
245
+
246
+
@app.get("/")
async def root():
    """Root endpoint: API name, version, and a map of available endpoints."""
    endpoint_map = {
        "POST /api/tts": "Generate audio dari teks",
        "GET /api/health": "Health check",
        "GET /api/speakers": "Daftar speaker tersedia",
        "GET /api/download/{file_path}": "Download file audio",
    }
    return {
        "message": "TTS Bahasa Indonesia API",
        "version": "1.0.0",
        "docs": "/docs",
        "endpoints": endpoint_map,
    }
263
+
264
+
if __name__ == "__main__":
    import uvicorn

    # HuggingFace Spaces (and other hosts) inject the PORT env variable.
    port = int(os.environ.get("PORT", 7860))

    separator = "=" * 60
    print(separator)
    print("TTS API Server - Bahasa Indonesia (FastAPI)")
    print(separator)
    print("API Endpoints:")
    print(" POST /api/tts - Generate audio dari teks")
    print(" GET /api/health - Health check")
    print(" GET /api/speakers - Daftar speaker tersedia")
    print(" GET /api/download/{file_path} - Download file audio")
    print(" GET /docs - Swagger UI documentation")
    print(separator)
    print(f"\nServer starting on http://0.0.0.0:{port}")
    print(f"Max files in outputs folder: {MAX_FILES}")
    print("Press CTRL+C to stop\n")

    uvicorn.run(app, host="0.0.0.0", port=port)
app.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ████████╗████████╗███████╗
3
+ ╚══██╔══╝╚══██╔══╝██╔════╝
4
+ ██║ ██║ ███████╗
5
+ ██║ ██║ ╚════██║
6
+ ██║ ██║ ███████║
7
+ ╚═╝ ╚═╝ ╚══════╝
8
+ ██╗███╗ ██╗██████╗ ██████╗ ███╗ ██╗███████╗███████╗██╗ █████╗ ██╗ ██╗██╗ ██╗
9
+ ██║████╗ ██║██╔══██╗██╔═══██╗████╗ ██║██╔════╝██╔════╝██║██╔══██╗██║ ██╔╝██║ ██║
10
+ ██║██╔██╗ ██║██║ ██║██║ ██║██╔██╗ ██║█████╗ ███████╗██║███████║█████╔╝ ██║ ██║
11
+ ██║██║╚██╗██║██║ ██║██║ ██║██║╚██╗██║██╔══╝ ╚════██║██║██╔══██║██╔═██╗ ██║ ██║
12
+ ██║██║ ╚████║██████╔╝╚██████╔╝██║ ╚████║███████╗███████║██║██║ ██║██║ ██╗╚██████╔╝
13
+ ╚═╝╚═╝ ╚═══╝╚═════╝ ╚═════╝ ╚═╝ ╚═══╝╚══════╝╚══════╝╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝
14
+
15
+ Script ini dibuat oleh __drat
16
+
17
+ Petunjuk:
18
+ 1. Script ini digunakan untuk menghasilkan suara berbasis teks dengan berbagai pilihan pembicara.
19
+ 2. Teknologi yang digunakan meliputi model text-to-speech (TTS) yang canggih dengan konversi teks ke fonem (G2P).
20
+ 3. Model yang dipakai dilatih khusus untuk bahasa Indonesia, Jawa, dan Sunda.
21
+ 4. Antarmuka dibuat dengan menggunakan Gradio dengan tema kustom bernama MetafisikTheme.
22
+
23
+ Cara Menggunakan:
24
+ 1. Masukkan teks yang ingin diubah menjadi suara.
25
+ 2. Pilih kecepatan bicara yang diinginkan.
26
+ 3. Pilih bahasa dan pembicara yang diinginkan.
27
+ 4. Klik tombol "Lakukan Inferensi Audio" untuk menghasilkan suara.
28
+ """
29
+
30
+ import gradio as gr
31
+ import platform
32
+ import json
33
+ from pathlib import Path
34
+ import uuid
35
+ import html
36
+ import subprocess
37
+ import time
38
+ from g2pid import G2P
39
+ from themes import MetafisikTheme # Impor tema custom dari themes.py
40
+
41
+ # Inisialisasi G2P (Grapheme to Phoneme)
42
+ g2p = G2P()
43
+
# True when the interpreter is running on macOS (platform reports 'Darwin').
def is_mac_os():
    return platform.system() == "Darwin"
47
+
# Default configuration parameters for the TTS pipeline.
params = {
    "activate": True,
    "autoplay": True,
    "show_text": True,
    "remove_trailing_dots": False,
    "voice": "default.wav",
    "language": "Indonesian",
    # Checkpoint and config consumed by the `tts` CLI in gen_voice().
    "model_path": "checkpoint_1260000-inference.pth",
    "config_path": "config.json",
    "out_path": "output.wav"
}

SAMPLE_RATE = 16000
device = None  # placeholder; no device selection is done in this module

# Default speaker id used when no mapping matches.
default_speaker_name = "ardi"
66
+
def text_to_sequence(text):
    """Placeholder encoder: map each character to its Unicode code point.

    NOTE(review): replace with the real text-to-ID pipeline the model
    expects; this function is currently not called anywhere visible.
    """
    return [ord(ch) for ch in text]
73
+
# Synthesize speech for *text* with the selected speaker, reporting progress.
# NOTE(review): the `speed` and `language` arguments are accepted but never
# forwarded to the `tts` CLI — confirm whether they should affect synthesis.
def gen_voice(text, speaker_label, speed, language, progress=gr.Progress()):
    label_to_id = {
        "Wibowo - Suara jantan berwibawa": "wibowo",
        "Ardi - Suara lembut dan hangat": "ardi",
        "Gadis - Suara perempuan yang merdu": "gadis",
        "Juminten - Suara perempuan jawa (bahasa jawa)": "JV-00264",
        "Asep - Suara lelaki sunda (bahasa sunda)": "SU-00060"
    }
    speaker = label_to_id.get(speaker_label, default_speaker_name)

    progress(0, desc="Menginisialisasi G2P")
    text = html.unescape(text)
    phoneme_text = g2p(text)  # grapheme -> phoneme conversion for the model
    time.sleep(1)
    progress(0.2, desc="Mengonversi teks ke TTS")

    # Unique-enough output name: <speaker>-<8 hex chars>.wav
    short_uuid = str(uuid.uuid4())[:8]
    output_file = Path(f'outputs/{speaker}-{short_uuid}.wav')

    # Invoke the Coqui `tts` CLI; list form avoids shell injection.
    command = [
        "tts",
        "--text", phoneme_text,
        "--model_path", params["model_path"],
        "--config_path", params["config_path"],
        "--speaker_idx", speaker,
        "--out_path", str(output_file)
    ]

    progress(0.5, desc="Menjalankan proses TTS")
    result = subprocess.run(command, capture_output=True, text=True)
    time.sleep(1)
    if result.returncode != 0:
        print(f"Error: {result.stderr}")
        return None

    progress(1, desc="Selesai")
    return str(output_file)
113
+
def update_speakers():
    """Return (label, speaker_id) pairs for every available voice."""
    return [
        ("Wibowo - Suara jantan berwibawa", "wibowo"),
        ("Ardi - Suara lembut dan hangat", "ardi"),
        ("Gadis - Suara perempuan yang merdu", "gadis"),
        ("Juminten - Suara perempuan jawa (bahasa jawa)", "JV-00264"),
        ("Asep - Suara lelaki sunda (bahasa sunda)", "SU-00060"),
    ]
124
+
# Build (or rebuild) the speaker dropdown from the current speaker list.
# NOTE(review): the default value is a speaker *id* ("ardi") while the
# choices are display labels; allow_custom_value masks the mismatch —
# confirm whether the default should be a label instead.
def update_dropdown(_=None, selected_speaker=default_speaker_name):
    label_map = {label: label for label, _id in update_speakers()}
    return gr.Dropdown(choices=label_map, value=selected_speaker, label="Pilih Pembicara", interactive=True, allow_custom_value=True)
130
+
131
+ # Memuat data bahasa
132
+ with open(Path('languages.json'), encoding='utf8') as f:
133
+ languages = json.load(f)
134
+
135
+ # Antarmuka Gradio dengan tema MetafisikTheme
136
+ with gr.Blocks(theme=MetafisikTheme()) as app:
137
+
138
+ gr.Markdown("### TTS Bahasa Indonesia", elem_id="main-title")
139
+
140
+ with gr.Row():
141
+ with gr.Column():
142
+ text_input = gr.Textbox(lines=2, label="Teks", value="Halo, saya adalah pembicara virtual.", elem_id="text-input")
143
+ speed_slider = gr.Slider(label='Kecepatan Bicara', minimum=0.1, maximum=1.99, value=0.8, step=0.01, elem_id="speed-slider")
144
+ language_dropdown = gr.Dropdown(list(languages.keys()), label="Bahasa", value="Indonesian", elem_id="language-dropdown")
145
+ submit_button = gr.Button("🗣️ Lakukan Inferensi Audio", elem_id="submit-button")
146
+ explanation = gr.HTML("""
147
+ <div style="margin-top: 20px; color: gray;">
148
+ <h4>Kegunaan Aplikasi</h4>
149
+ <p>Aplikasi ini digunakan untuk menghasilkan suara berbasis teks dengan berbagai pilihan pembicara.
150
+ Teknologi yang digunakan meliputi model text-to-speech (TTS) yang canggih dengan konversi teks ke fonem.
151
+ Model yang dipakai dilatih khusus untuk bahasa Indonesia, Jawa dan Sunda.</p>
152
+ <h4>Cara Penggunaan</h4>
153
+ <ol>
154
+ <li>Masukkan teks yang ingin diubah menjadi suara.</li>
155
+ <li>Pilih kecepatan bicara yang diinginkan.</li>
156
+ <li>Pilih bahasa dan pembicara yang diinginkan.</li>
157
+ <li>Klik tombol "Lakukan Inferensi Audio" untuk menghasilkan suara.</li>
158
+ </ol>
159
+ <p></p>
160
+ <p>Semoga <b>Energi Semesta Digital</b> selalu bersama Anda!</p>
161
+ </div>
162
+ """)
163
+
164
+ with gr.Column():
165
+ with gr.Row():
166
+ gr.Image("ardi.jpg", label="Ardi")
167
+ gr.Image("gadis.jpg", label="Gadis")
168
+ gr.Image("wibowo.jpg", label="Wibowo")
169
+
170
+ speaker_dropdown = update_dropdown()
171
+ refresh_button = gr.Button("👨‍👨‍👦 Segarkan Pembicara", elem_id="refresh-button")
172
+ audio_output = gr.Audio(elem_id="audio-output")
173
+
174
+ refresh_button.click(fn=update_dropdown, inputs=[], outputs=speaker_dropdown)
175
+
176
+ submit_button.click(
177
+ fn=gen_voice,
178
+ inputs=[text_input, speaker_dropdown, speed_slider, language_dropdown],
179
+ outputs=audio_output
180
+ )
181
+
182
+ gr.HTML("""
183
+ <footer style="text-align: center; margin-top: 20px; color:silver;">
184
+ Energi Semesta Digital © 2024 __drat. | 🇮🇩 Untuk Indonesia Jaya!
185
+ </footer>
186
+ """)
187
+
188
+ if __name__ == "__main__":
189
+ app.launch()
ardi.jpg ADDED
config.json ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "output_path": "/workspace/TTS",
3
+ "logger_uri": null,
4
+ "run_name": "vits_indonesian_multispeaker",
5
+ "project_name": null,
6
+ "run_description": "\ud83d\udc38Coqui trainer run.",
7
+ "print_step": 25,
8
+ "plot_step": 100,
9
+ "model_param_stats": false,
10
+ "wandb_entity": null,
11
+ "dashboard_logger": "tensorboard",
12
+ "log_model_step": 10000,
13
+ "save_step": 10000,
14
+ "save_n_checkpoints": 5,
15
+ "save_checkpoints": true,
16
+ "save_all_best": false,
17
+ "save_best_after": 10000,
18
+ "target_loss": null,
19
+ "print_eval": true,
20
+ "test_delay_epochs": -1,
21
+ "run_eval": true,
22
+ "run_eval_steps": null,
23
+ "distributed_backend": "nccl",
24
+ "distributed_url": "tcp://localhost:54321",
25
+ "mixed_precision": false,
26
+ "epochs": 1000,
27
+ "batch_size": 32,
28
+ "eval_batch_size": 8,
29
+ "grad_clip": [
30
+ 1000,
31
+ 1000
32
+ ],
33
+ "scheduler_after_epoch": true,
34
+ "lr": 0.001,
35
+ "optimizer": "AdamW",
36
+ "optimizer_params": {
37
+ "betas": [
38
+ 0.8,
39
+ 0.99
40
+ ],
41
+ "eps": 1e-09,
42
+ "weight_decay": 0.01
43
+ },
44
+ "lr_scheduler": "",
45
+ "lr_scheduler_params": {},
46
+ "use_grad_scaler": false,
47
+ "cudnn_enable": true,
48
+ "cudnn_deterministic": false,
49
+ "cudnn_benchmark": false,
50
+ "training_seed": 54321,
51
+ "model": "vits",
52
+ "num_loader_workers": 4,
53
+ "num_eval_loader_workers": 4,
54
+ "use_noise_augment": false,
55
+ "audio": {
56
+ "fft_size": 1024,
57
+ "sample_rate": 22050,
58
+ "win_length": 1024,
59
+ "hop_length": 256,
60
+ "num_mels": 80,
61
+ "mel_fmin": 0,
62
+ "mel_fmax": null
63
+ },
64
+ "use_phonemes": false,
65
+ "phonemizer": null,
66
+ "phoneme_language": "en-us",
67
+ "compute_input_seq_cache": true,
68
+ "text_cleaner": "basic_cleaners",
69
+ "enable_eos_bos_chars": false,
70
+ "test_sentences_file": "",
71
+ "phoneme_cache_path": "/workspace/TTS/phoneme_cache",
72
+ "characters": {
73
+ "characters_class": "TTS.tts.models.vits.VitsCharacters",
74
+ "vocab_dict": null,
75
+ "pad": "<PAD>",
76
+ "eos": "<EOS>",
77
+ "bos": "<BOS>",
78
+ "blank": "<BLNK>",
79
+ "characters": "abdefhijklmnoprstuwxz\u014b\u0254\u0259\u025b\u0261\u026a\u0272\u0283\u028a\u0292\u0294\u02c8",
80
+ "punctuations": " !,.?",
81
+ "phonemes": null,
82
+ "is_unique": true,
83
+ "is_sorted": true
84
+ },
85
+ "add_blank": true,
86
+ "batch_group_size": 0,
87
+ "loss_masking": null,
88
+ "min_audio_len": 1,
89
+ "max_audio_len": Infinity,
90
+ "min_text_len": 1,
91
+ "max_text_len": Infinity,
92
+ "compute_f0": false,
93
+ "compute_linear_spec": true,
94
+ "precompute_num_workers": 0,
95
+ "start_by_longest": false,
96
+ "datasets": [
97
+ {
98
+ "name": "coqui",
99
+ "path": "dataset",
100
+ "meta_file_train": "metadata-wibowo.csv",
101
+ "ignored_speakers": null,
102
+ "language": "",
103
+ "meta_file_val": "",
104
+ "meta_file_attn_mask": ""
105
+ },
106
+ {
107
+ "name": "coqui",
108
+ "path": "dataset",
109
+ "meta_file_train": "metadata-ardi.csv",
110
+ "ignored_speakers": null,
111
+ "language": "",
112
+ "meta_file_val": "",
113
+ "meta_file_attn_mask": ""
114
+ },
115
+ {
116
+ "name": "coqui",
117
+ "path": "dataset",
118
+ "meta_file_train": "metadata-gadis.csv",
119
+ "ignored_speakers": null,
120
+ "language": "",
121
+ "meta_file_val": "",
122
+ "meta_file_attn_mask": ""
123
+ },
124
+ {
125
+ "name": "coqui",
126
+ "path": "dataset",
127
+ "meta_file_train": "metadata-javanese.csv",
128
+ "ignored_speakers": null,
129
+ "language": "",
130
+ "meta_file_val": "",
131
+ "meta_file_attn_mask": ""
132
+ },
133
+ {
134
+ "name": "coqui",
135
+ "path": "dataset",
136
+ "meta_file_train": "metadata-sundanese.csv",
137
+ "ignored_speakers": null,
138
+ "language": "",
139
+ "meta_file_val": "",
140
+ "meta_file_attn_mask": ""
141
+ }
142
+ ],
143
+ "test_sentences": [
144
+ [
145
+ "\u02c8budi \u02c8makan \u02c8tahu, \u02c8soto, dan \u02c8tempe",
146
+ "wibowo",
147
+ null,
148
+ null
149
+ ],
150
+ [
151
+ "\u02c8tadi \u02c8pa\u0261i \u02c8ali dan \u02c8\u0283afi s\u0259\u02c8dan\u0294 m\u0259n\u0294\u02c8\u0261unakan \u02c8m\u0254t\u0254r di \u02c8kantor \u02c8m\u0259r\u025bka.",
152
+ "ardi",
153
+ null,
154
+ null
155
+ ],
156
+ [
157
+ "\u02c8ardi dan \u02c8thomas \u02c8m\u0259nud\u0292u \u02c8k\u0259 \u02c8s\u0259kolah \u02c8pada \u02c8puk\u028al \u02c8s\u0259pul\u028ah \u02c8pa\u0261i.",
158
+ "gadis",
159
+ null,
160
+ null
161
+ ],
162
+ [
163
+ "\u02c8ardi dan \u02c8thomas \u02c8m\u0259nud\u0292u \u02c8k\u0259 \u02c8s\u0259kolah \u02c8pada \u02c8puk\u028al \u02c8s\u0259pul\u028ah \u02c8pa\u0261i.",
164
+ "JV-00264",
165
+ null,
166
+ null
167
+ ],
168
+ [
169
+ "\u02c8ardi dan \u02c8thomas \u02c8m\u0259nud\u0292u \u02c8k\u0259 \u02c8s\u0259kolah \u02c8pada \u02c8puk\u028al \u02c8s\u0259pul\u028ah \u02c8pa\u0261i.",
170
+ "SU-00060",
171
+ null,
172
+ null
173
+ ]
174
+ ],
175
+ "eval_split_max_size": null,
176
+ "eval_split_size": 0.01,
177
+ "use_speaker_weighted_sampler": false,
178
+ "speaker_weighted_sampler_alpha": 1.0,
179
+ "use_language_weighted_sampler": false,
180
+ "language_weighted_sampler_alpha": 1.0,
181
+ "use_length_weighted_sampler": false,
182
+ "length_weighted_sampler_alpha": 1.0,
183
+ "model_args": {
184
+ "num_chars": 40,
185
+ "out_channels": 513,
186
+ "spec_segment_size": 32,
187
+ "hidden_channels": 192,
188
+ "hidden_channels_ffn_text_encoder": 768,
189
+ "num_heads_text_encoder": 2,
190
+ "num_layers_text_encoder": 6,
191
+ "kernel_size_text_encoder": 3,
192
+ "dropout_p_text_encoder": 0.1,
193
+ "dropout_p_duration_predictor": 0.5,
194
+ "kernel_size_posterior_encoder": 5,
195
+ "dilation_rate_posterior_encoder": 1,
196
+ "num_layers_posterior_encoder": 16,
197
+ "kernel_size_flow": 5,
198
+ "dilation_rate_flow": 1,
199
+ "num_layers_flow": 4,
200
+ "resblock_type_decoder": "1",
201
+ "resblock_kernel_sizes_decoder": [
202
+ 3,
203
+ 7,
204
+ 11
205
+ ],
206
+ "resblock_dilation_sizes_decoder": [
207
+ [
208
+ 1,
209
+ 3,
210
+ 5
211
+ ],
212
+ [
213
+ 1,
214
+ 3,
215
+ 5
216
+ ],
217
+ [
218
+ 1,
219
+ 3,
220
+ 5
221
+ ]
222
+ ],
223
+ "upsample_rates_decoder": [
224
+ 8,
225
+ 8,
226
+ 2,
227
+ 2
228
+ ],
229
+ "upsample_initial_channel_decoder": 512,
230
+ "upsample_kernel_sizes_decoder": [
231
+ 16,
232
+ 16,
233
+ 4,
234
+ 4
235
+ ],
236
+ "periods_multi_period_discriminator": [
237
+ 2,
238
+ 3,
239
+ 5,
240
+ 7,
241
+ 11
242
+ ],
243
+ "use_sdp": true,
244
+ "noise_scale": 1.0,
245
+ "inference_noise_scale": 0.33,
246
+ "length_scale": 1,
247
+ "noise_scale_dp": 1.0,
248
+ "inference_noise_scale_dp": 0.33,
249
+ "max_inference_len": null,
250
+ "init_discriminator": true,
251
+ "use_spectral_norm_disriminator": false,
252
+ "use_speaker_embedding": true,
253
+ "num_speakers": 83,
254
+ "speakers_file": "speakers.pth",
255
+ "d_vector_file": null,
256
+ "speaker_embedding_channels": 256,
257
+ "use_d_vector_file": false,
258
+ "d_vector_dim": 0,
259
+ "detach_dp_input": true,
260
+ "use_language_embedding": false,
261
+ "embedded_language_dim": 4,
262
+ "num_languages": 0,
263
+ "language_ids_file": null,
264
+ "use_speaker_encoder_as_loss": false,
265
+ "speaker_encoder_config_path": "",
266
+ "speaker_encoder_model_path": "",
267
+ "condition_dp_on_speaker": true,
268
+ "freeze_encoder": false,
269
+ "freeze_DP": false,
270
+ "freeze_PE": false,
271
+ "freeze_flow_decoder": false,
272
+ "freeze_waveform_decoder": false,
273
+ "encoder_sample_rate": null,
274
+ "interpolate_z": true,
275
+ "reinit_DP": true,
276
+ "reinit_text_encoder": false
277
+ },
278
+ "lr_gen": 0.0002,
279
+ "lr_disc": 0.0002,
280
+ "lr_scheduler_gen": "ExponentialLR",
281
+ "lr_scheduler_gen_params": {
282
+ "gamma": 0.999875,
283
+ "last_epoch": -1
284
+ },
285
+ "lr_scheduler_disc": "ExponentialLR",
286
+ "lr_scheduler_disc_params": {
287
+ "gamma": 0.999875,
288
+ "last_epoch": -1
289
+ },
290
+ "kl_loss_alpha": 1.0,
291
+ "disc_loss_alpha": 1.0,
292
+ "gen_loss_alpha": 1.0,
293
+ "feat_loss_alpha": 1.0,
294
+ "mel_loss_alpha": 45.0,
295
+ "dur_loss_alpha": 1.0,
296
+ "speaker_encoder_loss_alpha": 1.0,
297
+ "return_wav": true,
298
+ "r": 1,
299
+ "num_speakers": 0,
300
+ "use_speaker_embedding": true,
301
+ "speakers_file": "speakers.pth",
302
+ "speaker_embedding_channels": 256,
303
+ "language_ids_file": null,
304
+ "use_language_embedding": false,
305
+ "use_d_vector_file": false,
306
+ "d_vector_file": null,
307
+ "d_vector_dim": 0
308
+ }
docker-compose.yml ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: '3.8'
2
+
3
+ services:
4
+ tts-api:
5
+ build:
6
+ context: .
7
+ dockerfile: Dockerfile
8
+ container_name: tts-indonesia-api
9
+ ports:
10
+ - "7860:7860"
11
+ environment:
12
+ - PORT=7860
13
+ volumes:
14
+ # Mount outputs folder untuk persist data (optional)
15
+ - ./outputs:/app/outputs
16
+ restart: unless-stopped
17
+ healthcheck:
18
+ test: ["CMD", "curl", "-f", "http://localhost:7860/api/health"]
19
+ interval: 30s
20
+ timeout: 10s
21
+ retries: 3
22
+ start_period: 40s
23
+
g2pid/.gitignore ADDED
@@ -0,0 +1,164 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
161
+
162
+ .DS_Store
163
+ .backup/
164
+ .data/
g2pid/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .g2p import G2P
2
+
3
+ __version__ = "0.0.5"
g2pid/data/dict.json ADDED
The diff for this file is too large to render. See raw diff
 
g2pid/g2p.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import re
4
+
5
+ import numpy as np
6
+ import onnxruntime
7
+ from nltk.tokenize import TweetTokenizer
8
+ from sacremoses import MosesDetokenizer
9
+
10
+ from .syllable_splitter import SyllableSplitter
11
+
12
# Spelled-out Indonesian names of alphabet letters ("abjad"), used when a
# single letter is read aloud (e.g. "b" is pronounced "bé").
ABJAD_MAPPING = {
    "a": "a",
    "b": "bé",
    "c": "cé",
    "d": "dé",
    "e": "é",
    "f": "èf",
    "g": "gé",
    "h": "ha",
    "i": "i",
    "j": "jé",
    "k": "ka",
    "l": "èl",
    "m": "èm",
    "n": "èn",
    "o": "o",
    "p": "pé",
    "q": "ki",
    "r": "èr",
    "s": "ès",
    "t": "té",
    "u": "u",
    "v": "vé",
    "w": "wé",
    "x": "èks",
    "y": "yé",
    "z": "zèt",
}

# Grapheme/diacritic -> IPA substitutions applied as the last step of G2P.
# Relies on dict insertion order: multi-character digraphs ("sy", "ny",
# "ng", "dj") are listed before the single characters they contain so the
# digraphs are replaced first.
PHONETIC_MAPPING = {
    "sy": "ʃ",
    "ny": "ɲ",
    "ng": "ŋ",
    "dj": "dʒ",
    "'": "ʔ",
    "c": "tʃ",
    "é": "e",
    "è": "ɛ",
    "ê": "ə",
    "g": "ɡ",
    "I": "ɪ",
    "j": "dʒ",
    "ô": "ɔ",
    "q": "k",
    "U": "ʊ",
    "v": "f",
    "x": "ks",
    "y": "j",
}


# Directory containing this module; used to locate the bundled lexicon
# (data/dict.json) and the ONNX model (model/bert_pron.onnx).
dirname = os.path.dirname(__file__)
64
+
65
+ # Predict pronunciation with BERT Masking
66
+ # Read more: https://w11wo.github.io/posts/2022/04/predicting-phonemes-with-bert/
67
class Predictor:
    """Disambiguate the Indonesian grapheme "e" with a masked-LM ONNX model.

    Every "e" in a word is replaced by the ``[mask]`` token; the model then
    predicts which concrete vowel belongs at each masked position.
    """

    def __init__(self, model_path):
        # fmt: off
        self.vocab = ['', '[UNK]', 'a', 'n', 'ê', 'e', 'i', 'r', 'k', 's', 't', 'g', 'm', 'u', 'l', 'p', 'o', 'd', 'b', 'h', 'c', 'j', 'y', 'f', 'w', 'v', 'z', 'x', 'q', '[mask]']
        self.mask_token_id = self.vocab.index("[mask]")
        # fmt: on
        self.session = onnxruntime.InferenceSession(model_path)

    def predict(self, word: str) -> str:
        """
        Predict the phonetic representation of a word.

        Args:
            word (str): The word to predict (every "e" will be masked).

        Returns:
            str: The word with each masked "e" resolved by the model.
        """
        # Encode characters, masking every "e", then pad with id 0 up to the
        # model's fixed sequence length of 32 tokens.
        token_ids = [
            self.mask_token_id if ch == "e" else self.vocab.index(ch)
            for ch in word
        ]
        token_ids += [0] * (32 - len(token_ids))
        batch = np.array([token_ids], dtype=np.int64)

        (logits,) = self.session.run(None, {"input_4": batch})

        # Column indices of the masked tokens within the single batch row.
        masked_positions = np.where(batch == self.mask_token_id)[1]

        # Highest-scoring vocab id at each masked position replaces the mask.
        best_ids = np.argmax(logits[0][masked_positions], axis=1)
        for pos, token_id in zip(masked_positions, best_ids):
            token_ids[pos] = token_id

        # Drop padding (id 0 decodes to '') and map ids back to characters.
        return "".join(self.vocab[tid] for tid in token_ids if tid != 0)
102
+
103
+
104
class G2P:
    """Indonesian grapheme-to-phoneme converter.

    Per-word pipeline: lexicon lookup -> letter-name / BERT fallback ->
    "e"-variant normalization -> syllabification -> stress placement and
    allophone rules -> final IPA mapping.
    """

    def __init__(self):
        # Tokenizer splits input into word/punctuation tokens; the Moses
        # detokenizer re-attaches punctuation to the phonetized words.
        self.tokenizer = TweetTokenizer()
        self.detokenizer = MosesDetokenizer(lang="id")

        # Word -> pronunciation lexicon bundled with the package.
        dict_path = os.path.join(dirname, "data/dict.json")
        with open(dict_path) as f:
            self.dict = json.load(f)

        # Masked-LM model used to disambiguate "e" in out-of-lexicon words.
        model_path = os.path.join(dirname, "model/bert_pron.onnx")
        self.predictor = Predictor(model_path)

        self.syllable_splitter = SyllableSplitter()

    def __call__(self, text: str) -> str:
        """
        Convert text to phonetic representation.

        Args:
            text (str): The text to convert.

        Returns:
            str: The phonetic representation of the text.
        """
        # Normalize: lower-case, drop unsupported characters, and split
        # hyphenated compounds into separate words.
        text = text.lower()
        text = re.sub(r"[^ a-z0-9'\.,?!-]", "", text)
        text = text.replace("-", " ")

        prons = []
        words = self.tokenizer.tokenize(text)
        for word in words:
            # PUEBI pronunciation: lexicon first, then single-letter names,
            # then pass-through (no ambiguous "e", or non-alphabetic token),
            # finally the BERT predictor for alphabetic words containing "e".
            if word in self.dict:
                pron = self.dict[word]
            elif len(word) == 1 and word in ABJAD_MAPPING:
                pron = ABJAD_MAPPING[word]
            elif "e" not in word or not word.isalpha():
                pron = word
            elif "e" in word:
                pron = self.predictor.predict(word)

            # Replace alofon /e/ with plain e (temporary); tense/lax variants
            # are re-derived from syllable structure below.
            pron = pron.replace("é", "e")
            pron = pron.replace("è", "e")

            # Replace word-initial /x/ with /s/
            if pron.startswith("x"):
                pron = "s" + pron[1:]

            sylls = self.syllable_splitter.split_syllables(pron)
            # Decide where to put the stress (1-based syllable index).
            # Default: penultimate syllable. A schwa (ê) in the penult shifts
            # stress to the final syllable, unless the final syllable also
            # contains a schwa (then stress the antepenult).
            stress_loc = len(sylls) - 1
            if len(sylls) > 1 and "ê" in sylls[-2]:
                if "ê" in sylls[-1]:
                    stress_loc = len(sylls) - 2
                else:
                    stress_loc = len(sylls)

            # Apply rules on syllable basis.
            # All allophones are set to tense by default
            # and will be changed to lax if needed.
            alophone = {"e": "é", "o": "o"}
            alophone_map = {"i": "I", "u": "U", "e": "è", "o": "ô"}
            for i, syll in enumerate(sylls, start=1):
                # Mark syllable stress with IPA primary-stress sign.
                if i == stress_loc:
                    syll = "ˈ" + syll

                # Allophone syllable rules
                for v in ["e", "o"]:
                    # Replace with lax allophone [ɛ, ɔ] if
                    # in closed final syllables
                    if v in syll and not syll.endswith(v) and i == len(sylls):
                        alophone[v] = alophone_map[v]

                # Allophone syllable stress rules
                for v in ["i", "u"]:
                    # Replace with lax allophone [ɪ, ʊ] if
                    # in the middle of syllable without stress
                    # and not ends with coda nasal [m, n, ng] (except for final syllable)
                    if (
                        v in syll
                        and not syll.startswith("ˈ")
                        and not syll.endswith(v)
                        and (
                            not any(syll.endswith(x) for x in ["m", "n", "ng"])
                            or i == len(sylls)
                        )
                    ):
                        syll = syll.replace(v, alophone_map[v])

                # Coda normalization: final stops are devoiced (d->t, b->p)
                # and final /k/ or /g/ become a glottal stop (written ').
                if syll.endswith("nk"):
                    syll = syll[:-2] + "ng"
                elif syll.endswith("d"):
                    syll = syll[:-1] + "t"
                elif syll.endswith("b"):
                    syll = syll[:-1] + "p"
                elif syll.endswith("k") or (
                    syll.endswith("g") and not syll.endswith("ng")
                ):
                    syll = syll[:-1] + "'"
                sylls[i - 1] = syll

            pron = "".join(sylls)
            # Apply phonetic and allophone mapping.
            # NOTE(review): words with a single "o" skip the o-allophone
            # replacement — presumably a heuristic to keep them tense; confirm.
            for v in alophone:
                if v == "o" and pron.count("o") == 1:
                    continue
                pron = pron.replace(v, alophone[v])
            for g, p in PHONETIC_MAPPING.items():
                pron = pron.replace(g, p)
            pron = pron.replace("kh", "x")

            prons.append(pron)
            # Space token keeps words separated for the detokenizer.
            prons.append(" ")

        return self.detokenizer.detokenize(prons)
g2pid/syllable_splitter.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copied from https://github.com/fahadh4ilyas/syllable_splitter
2
+ # MIT License
3
+ import re
4
+
5
+
6
class SyllableSplitter:
    """Split an Indonesian word into syllables.

    Adapted from https://github.com/fahadh4ilyas/syllable_splitter (MIT).
    Works in two passes: classify letters as consonant/vowel/separator,
    then insert syllable breaks according to CV-pattern rules.
    """

    def __init__(self):
        # Single consonants plus digraphs treated as one onset/coda unit.
        self.consonant = set("bcdfghjklmnpqrstvwxyz") | {
            "ng", "ny", "sy", "ch", "dh", "gh", "kh", "ph", "sh", "th",
        }
        # Consonant clusters that stay together unless a vowel follows.
        self.double_consonant = {"ll", "ks", "rs", "rt", "nk", "nd"}
        self.vocal = set("aeêéèiou")

    def split_letters(self, string):
        """Tokenize *string* into letter units plus a parallel arrangement
        string of 'c' (consonant), 'v' (vowel) and 's' (separator) codes."""
        letters = []
        arrange = []

        while string:
            pair = string[:2]

            if pair in self.double_consonant:
                # When a vowel follows the cluster, split it: the first half
                # closes this syllable, the second half opens the next one.
                if len(string) > 2 and string[2] in self.vocal:
                    letters.append(pair[0])
                    arrange.append("c")
                    string = string[1:]
                else:
                    letters.append(pair)
                    arrange.append("c")
                    string = string[2:]
            elif pair in self.consonant:
                # Two-character digraph (ng, ny, sy, ...).
                letters.append(pair)
                arrange.append("c")
                string = string[2:]
            elif pair in self.vocal:
                # Only possible when one character remains.
                letters.append(pair)
                arrange.append("v")
                string = string[2:]
            else:
                # Fall back to classifying a single character.
                single = string[0]
                if single in self.consonant:
                    kind = "c"
                elif single in self.vocal:
                    kind = "v"
                else:
                    kind = "s"
                letters.append(single)
                arrange.append(kind)
                string = string[1:]

        return letters, "".join(arrange)

    def split_syllables_from_letters(self, letters, arrange):
        """Insert syllable boundaries ('|') into *letters* by repeatedly
        applying CV-pattern rules, then split on those boundaries."""
        # Each rule: (pattern, offset of the break from the match start).
        # Rules run to exhaustion, in order:
        #   vcc+  -> break after the first consonant,
        #   vv    -> break between vowels,
        #   vcv   -> break after the first vowel,
        #   separators -> isolated on both sides.
        rules = (
            (r"vc{2,}", 1),
            (r"v{2,}", 0),
            (r"vcv", 0),
            (r"[cvs]s", 0),
            (r"s[cvs]", 0),
        )
        for pattern, offset in rules:
            match = re.search(pattern, arrange)
            while match:
                i = match.start() + offset
                letters = letters[: i + 1] + ["|"] + letters[i + 1 :]
                arrange = arrange[: i + 1] + "|" + arrange[i + 1 :]
                match = re.search(pattern, arrange)

        return "".join(letters).split("|")

    def split_syllables(self, string):
        """Return the list of syllables of *string*."""
        units, layout = self.split_letters(string)
        return self.split_syllables_from_letters(units, layout)
gadis.jpg ADDED
languages.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Arabic": "ar",
3
+ "Chinese": "zh-cn",
4
+ "Czech": "cs",
5
+ "Dutch": "nl",
6
+ "English": "en",
7
+ "French": "fr",
8
+ "German": "de",
9
+ "Hungarian": "hu",
10
+ "Indonesian": "id",
11
+ "Italian": "it",
12
+ "Japanese": "ja",
13
+ "Korean": "ko",
14
+ "Polish": "pl",
15
+ "Portuguese": "pt",
16
+ "Russian": "ru",
17
+ "Spanish": "es",
18
+ "Turkish": "tr"
19
+ }
outputs/.DS_Store ADDED
Binary file (6.15 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ TTS
2
+ sacremoses>=0.0.41
3
+ nltk>=3.7
4
+ onnxruntime>=1.7.0
5
+ torch
6
+ fastapi>=0.100.0
7
+ uvicorn[standard]>=0.23.0
8
+ python-multipart>=0.0.6
targets/.DS_Store ADDED
Binary file (6.15 kB). View file
 
themes.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ ████████╗████████╗███████╗
3
+ ╚══██╔══╝╚══██╔══╝██╔════╝
4
+ ██║ ██║ ███████╗
5
+ ██║ ██║ ╚════██║
6
+ ██║ ██║ ███████║
7
+ ╚═╝ ╚═╝ ╚══════╝
8
+ ██╗███╗ ██╗██████╗ ██████╗ ███╗ ██╗███████╗███████╗██╗ █████╗ ██╗ ██╗██╗ ██╗
9
+ ██║████╗ ██║██╔══██╗██╔═══██╗████╗ ██║██╔════╝██╔════╝██║██╔══██╗██║ ██╔╝██║ ██║
10
+ ██║██╔██╗ ██║██║ ██║██║ ██║██╔██╗ ██║█████╗ ███████╗██║███████║█████╔╝ ██║ ██║
11
+ ██║██║╚██╗██║██║ ██║██║ ██║██║╚██╗██║██╔══╝ ╚════██║██║██╔══██║██╔═██╗ ██║ ██║
12
+ ██║██║ ╚████║██████╔╝╚██████╔╝██║ ╚████║███████╗███████║██║██║ ██║██║ ██╗╚██████╔╝
13
+ ╚═╝╚═╝ ╚═══╝╚═════╝ ╚═════╝ ╚═╝ ╚═══╝╚══════╝╚══════╝╚═╝╚═╝ ╚═╝╚═╝ ╚═╝ ╚═════╝
14
+
15
+ Script ini dibuat oleh __drat
16
+
17
+ Petunjuk:
18
+ 1. Script ini digunakan untuk menghasilkan suara berbasis teks dengan berbagai pilihan pembicara.
19
+ 2. Teknologi yang digunakan meliputi model text-to-speech (TTS) yang canggih dengan konversi teks ke fonem (G2P).
20
+ 3. Model yang dipakai dilatih khusus untuk bahasa Indonesia, Jawa, dan Sunda.
21
+ 4. Antarmuka dibuat dengan menggunakan Gradio dengan tema kustom bernama MetafisikTheme.
22
+
23
+ Cara Menggunakan:
24
+ 1. Masukkan teks yang ingin diubah menjadi suara.
25
+ 2. Pilih kecepatan bicara yang diinginkan.
26
+ 3. Pilih bahasa dan pembicara yang diinginkan.
27
+ 4. Klik tombol "Lakukan Inferensi Audio" untuk menghasilkan suara.
28
+ """
29
+
30
+ from __future__ import annotations
31
+ from typing import Iterable
32
+ from gradio.themes.base import Base
33
+ from gradio.themes.utils import colors, fonts, sizes
34
+
35
class MetafisikTheme(Base):
    """Custom Gradio theme for the TTS demo UI.

    Warm yellow-to-white background with orange gradient buttons; Quicksand
    as the main font and IBM Plex Mono for monospace text. All arguments
    mirror ``gradio.themes.base.Base`` and are keyword-only.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.orange,
        secondary_hue: colors.Color | str = colors.yellow,
        neutral_hue: colors.Color | str = colors.gray,
        spacing_size: sizes.Size | str = sizes.spacing_md,
        radius_size: sizes.Size | str = sizes.radius_md,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Quicksand"),
            "ui-sans-serif",
            "sans-serif",
        ),
        font_mono: fonts.Font
        | str
        | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        # Base theme configuration (hues, sizing, fonts).
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            spacing_size=spacing_size,
            radius_size=radius_size,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        # Fine-grained overrides of individual theme variables.
        super().set(
            body_background_fill="linear-gradient(to bottom, #FFFFE0, #FFFFFF)",  # Gradient from light yellow to white
            body_background_fill_dark="linear-gradient(to bottom, #FFFFE0, #FFFFFF)",  # Same gradient for dark mode
            button_primary_background_fill="linear-gradient(90deg, #FFA500, #FF4500)",  # Orange to dark orange gradient
            button_primary_background_fill_hover="linear-gradient(90deg, #FFB347, #FF6347)",  # Lighter orange gradient
            button_primary_text_color="white",
            button_primary_background_fill_dark="linear-gradient(90deg, #FF8C00, #FF4500)",  # Darker orange gradient
            slider_color="*secondary_300",
            slider_color_dark="*secondary_600",
            block_title_text_weight="600",
            block_border_width="3px",
            block_shadow="*shadow_drop_lg",
            button_shadow="*shadow_drop_lg",
            button_large_padding="32px",
        )
tts_standalone.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Standalone TTS Program untuk Bahasa Indonesia
4
+ Berdasarkan app.py - TTS Indonesiaku Gratis
5
+
6
+ Program ini menghasilkan file audio dari teks Bahasa Indonesia
7
+ dengan menggunakan model TTS yang sudah dilatih khusus untuk bahasa Indonesia.
8
+
9
+ Parameter:
10
+ - text: Teks yang akan diubah menjadi suara
11
+ - file_path: Path file output audio (.wav)
12
+ """
13
+
14
import subprocess
import html  # NOTE(review): appears unused in this module — confirm before removing
from pathlib import Path

# G2P is optional: when the import fails, raw text is sent to the TTS CLI.
try:
    from g2pid import G2P
    # Initialize G2P (Grapheme to Phoneme)
    g2p = G2P()
    G2P_AVAILABLE = True
except ImportError:
    print("WARNING: G2P module tidak tersedia. Install dependencies dengan: pip install -r requirements.txt")
    G2P_AVAILABLE = False

# Default configuration: model checkpoint, TTS config file, and speaker name
# passed to the external `tts` command by generate_tts().
params = {
    "model_path": "checkpoint_1260000-inference.pth",
    "config_path": "config.json",
    # "speaker": "ardi" # Default speaker
    "speaker": "gadis" # Default speaker
}
# params = {
#     "model_path": "kobov2.pth",
#     "config_path": "config.json",
#     # "speaker": "kobov2.index" # Default speaker
#     "speaker": "gadis",
# }
40
+
41
def generate_tts(text, file_path, speaker="ardi"):
    """Render Indonesian *text* to a WAV file via the external ``tts`` CLI.

    The text is first converted to phonemes with G2P when available, then
    handed to the Coqui ``tts`` command configured by module-level ``params``.

    Args:
        text (str): Indonesian text to synthesize.
        file_path (str): Destination path of the output audio (.wav).
        speaker (str): Speaker name known to the model (default: "ardi").

    Returns:
        bool: True on success, False on any failure.
    """
    try:
        print(f"Memproses teks: {text}")

        # Phonemize only when the optional G2P module imported successfully.
        if G2P_AVAILABLE:
            print("Mengonversi teks ke fonem...")
            phonemes = g2p(text)
            print(f"Teks setelah konversi G2P: {phonemes}")
        else:
            print("WARNING: Menggunakan teks asli tanpa konversi G2P")
            phonemes = text

        # Make sure the output directory exists before the CLI writes to it.
        out_file = Path(file_path)
        out_file.parent.mkdir(parents=True, exist_ok=True)

        # External TTS invocation (argument list, no shell interpolation).
        cmd = [
            "tts",
            "--text", phonemes,
            "--model_path", params["model_path"],
            "--config_path", params["config_path"],
            "--speaker_idx", speaker,
            "--out_path", str(out_file),
        ]

        print("Menjalankan proses TTS...")
        print(f"Command: {' '.join(cmd)}")

        # Run the TTS process and surface its stderr on failure.
        proc = subprocess.run(cmd, capture_output=True, text=True)
        if proc.returncode != 0:
            print(f"Error: {proc.stderr}")
            return False

        print(f"SUCCESS: Audio berhasil dibuat: {file_path}")
        return True

    except Exception as exc:
        # Best-effort script behavior: report the error, signal failure.
        print(f"ERROR: {str(exc)}")
        return False
95
+
96
def main():
    """Entry point: synthesize a fixed demo sentence to a WAV file."""
    # Demo input and output location.
    demo_text = "Selamat pagi, nama saya Aryo."
    out_path = "output_id_coqui.wav"
    separator = "=" * 60

    print(separator)
    print("TTS Standalone - Bahasa Indonesia")
    print(separator)
    print(f"Teks: {demo_text}")
    print(f"Output file: {out_path}")
    print(f"Speaker: {params['speaker']}")
    print(separator)

    # Generate TTS and report the outcome.
    if generate_tts(demo_text, out_path, params["speaker"]):
        print("\nSUCCESS: Proses TTS selesai!")
        print(f"File audio tersimpan di: {out_path}")
    else:
        print("\nERROR: Proses TTS gagal!")
        print("Pastikan:")
        for hint in (
            "1. Model checkpoint_1260000-inference.pth tersedia",
            "2. File config.json tersedia",
            "3. TTS library terinstall (pip install TTS)",
            "4. Speaker tersedia dalam model",
            "5. Dependencies terinstall: pip install -r requirements.txt",
        ):
            print(hint)
127
+ if __name__ == "__main__":
128
+ main()
wibowo.jpg ADDED