henomoto committed on
Commit
5e1ba78
·
verified ·
1 Parent(s): 9c80886

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +200 -156
app.py CHANGED
@@ -1,6 +1,8 @@
1
  import datetime
 
2
  import os
3
  import random
 
4
 
5
  import gradio as gr
6
  import jwt
@@ -18,6 +20,8 @@ MAX_QUEUE_SIZE = 15
18
  # Global Client Instance
19
  CLIENT = None
20
 
 
 
21
 
22
  def get_client():
23
  global CLIENT
@@ -60,114 +64,75 @@ examples = [
60
  "おい!そんなこと今更言ったってしょうがねぇだろ!いい大人なんだから、さっさと諦めろよ!",
61
  ]
62
 
63
# Mapping from rank (1-100) to speaker_id
# rank 1 = highest pitch, rank 100 = lowest pitch
rank_to_num = [
    10, 14, 93, 90, 36, 83, 26, 15, 4, 96,
    30, 67, 66, 51, 65, 27, 24, 39, 61, 85,
    55, 72, 8, 84, 25, 40, 38, 95, 53, 63,
    94, 19, 59, 56, 60, 69, 29, 58, 18, 17,
    35, 2, 57, 64, 91, 82, 43, 62, 7, 92,
    16, 98, 32, 11, 99, 97, 52, 77, 87, 3,
    75, 13, 80, 1, 5, 33, 47, 20, 76, 41,
    46, 79, 73, 45, 50, 81, 49, 70, 86, 23,
    44, 54, 68, 42, 88, 100, 9, 89, 22, 31,
    12, 74, 34, 71, 28, 78, 48, 37, 21, 6,
]


def rank_to_speaker_id(rank: int) -> int:
    """Return the speaker id stored for a 1-based pitch rank.

    Args:
        rank: Position in ``rank_to_num``, counted from 1.

    Returns:
        The speaker id at that rank.

    Raises:
        IndexError: If ``rank`` is outside ``1..len(rank_to_num)``.
    """
    # rank - 1 goes negative for rank <= 0, and a negative list index wraps
    # around to the end of the table instead of failing, so check explicitly.
    if not 1 <= rank <= len(rank_to_num):
        raise IndexError(
            f"rank must be between 1 and {len(rank_to_num)}, got {rank}"
        )
    return rank_to_num[rank - 1]
171
 
172
 
173
  # --- Backend Interaction ---
@@ -264,7 +229,68 @@ with gr.Blocks(title="Paratts 音声合成デモ") as app:
264
  )
265
 
266
  with gr.Column(variant="panel"):
267
- gr.Markdown("### 1. テキストを入力")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
268
  text_inp = gr.Textbox(
269
  show_label=False,
270
  placeholder="読み上げてほしい日本語の文章を入力してください。",
@@ -280,29 +306,6 @@ with gr.Blocks(title="Paratts 音声合成デモ") as app:
280
  inputs=[text_inp],
281
  examples_per_page=20,
282
  )
283
-
284
- with gr.Column(variant="panel"):
285
- gr.Markdown("### 2. 話者を選ぶ")
286
- with gr.Row(equal_height=True):
287
- with gr.Column():
288
- speaker_inp = gr.Slider(
289
- label="話者ID",
290
- minimum=1,
291
- maximum=100,
292
- step=1,
293
- value=10,
294
- )
295
- speaker_random_btn = gr.Button("🎲ランダム話者", variant="secondary")
296
- with gr.Column():
297
- speaker_rank_inp = gr.Slider(
298
- label="声の高さで選ぶ",
299
- info="1 = 一番声が高い, 100 = 一番声が低い。選んだら「🔄話者IDに変換」を押してください。",
300
- minimum=1,
301
- maximum=100,
302
- step=1,
303
- value=1,
304
- )
305
- rank_to_spk_id_btn = gr.Button("🔄話者IDに変換")
306
  with gr.Column(variant="panel"):
307
  gr.Markdown("### 3. 音声生成")
308
  with gr.Row(equal_height=True):
@@ -340,25 +343,74 @@ with gr.Blocks(title="Paratts 音声合成デモ") as app:
340
  out_usage = gr.Number(label="現在の使用量 (文字)", interactive=False)
341
  out_remaining = gr.Number(label="残り使用可能数 (文字)", interactive=False)
342
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
343
  # --- Event Wiring ---
344
- # Speaker Rank change
345
- speaker_rank_inp.input(
346
- fn=lambda: (
347
- gr.Button(variant="primary"),
348
- gr.Button(variant="secondary", interactive=False),
349
- ),
350
- outputs=[rank_to_spk_id_btn, gen_btn],
351
  api_visibility="private",
352
  )
353
- # Rank to Speaker ID
354
- rank_to_spk_id_btn.click(
355
- fn=lambda rank: (
356
- rank_to_speaker_id(rank),
357
- gr.Button(variant="secondary"),
358
- gr.Button(variant="primary", interactive=True),
359
- ),
360
- inputs=[speaker_rank_inp],
361
- outputs=[speaker_inp, rank_to_spk_id_btn, gen_btn],
 
 
 
362
  api_visibility="private",
363
  )
364
 
@@ -370,14 +422,6 @@ with gr.Blocks(title="Paratts 音声合成デモ") as app:
370
  api_visibility="private",
371
  )
372
 
373
- # Random Speaker
374
- speaker_random_btn.click(
375
- fn=lambda: random.randint(1, 100),
376
- inputs=[],
377
- outputs=[speaker_inp],
378
- api_visibility="private",
379
- )
380
-
381
  # Generation
382
  gen_btn.click(
383
  fn=generate_speech,
 
1
  import datetime
2
+ import json
3
  import os
4
  import random
5
+ from pathlib import Path
6
 
7
  import gradio as gr
8
  import jwt
 
20
  # Global Client Instance
21
  CLIENT = None
22
 
23
+ ICON_HEIGHT = 140
24
+
25
 
26
  def get_client():
27
  global CLIENT
 
64
  "おい!そんなこと今更言ったってしょうがねぇだろ!いい大人なんだから、さっさと諦めろよ!",
65
  ]
66
 
67
+ # --- Speaker Mapping Data ---
68
+ # Use relative paths for Gradio compatibility (works on HF Spaces where cwd is /app)
69
+ ASSETS_PATH = Path("assets")
70
+ ICONS_PATH = ASSETS_PATH / "icons"
71
+ VOICES_PATH = ASSETS_PATH / "voices"
72
+
73
+
74
def load_speaker_mapping() -> tuple[list, dict, dict]:
    """Read ``mapping.json`` and derive the speaker lookup tables.

    Returns:
        A 3-tuple ``(data, jvs_to_internal, internal_to_info)``: the parsed
        JSON unchanged, a map from JVS id string to numeric id
        (``"jvs006"`` -> ``6``), and a map from numeric id to a dict with the
        keys ``jvs_id``, ``name``, ``icon_id`` and ``category_id``.
    """
    with open("mapping.json", "r", encoding="utf-8") as fh:
        categories = json.load(fh)

    id_by_jvs: dict[str, int] = {}
    info_by_id: dict[int, dict] = {}

    # Character voices are not supported by the backend yet, so leave them out.
    supported = (group for group in categories if group["id"] != "character")
    for group in supported:
        for entry in group["speakers"]:
            jvs_id = entry["id"]
            # The numeric id is whatever follows the 3-character prefix.
            number = int(jvs_id[3:])
            id_by_jvs[jvs_id] = number
            info_by_id[number] = {
                "jvs_id": jvs_id,
                "name": entry["name"],
                "icon_id": entry["icon_id"],
                "category_id": group["id"],
            }

    return categories, id_by_jvs, info_by_id
102
+
103
+
104
def get_gallery_data_for_category(
    speaker_data: list, category_id: str
) -> list[tuple[str, str]]:
    """Build the gallery entries for every speaker in one category.

    Each entry is ``(icon_path, caption)`` where the caption has the form
    ``"<name>|<jvs_id>"``; the id is embedded in the caption so it can be
    recovered when an item is selected.
    """
    return [
        (
            str(ICONS_PATH / f"{member['icon_id']}.webp"),
            f"{member['name']}|{member['id']}",
        )
        for group in speaker_data
        if group["id"] == category_id
        for member in group["speakers"]
    ]
118
+
119
+
120
def get_voice_sample_path(jvs_id: str) -> str | None:
    """Locate the voice sample for a speaker: jvs006 -> assets/voices/jvs006.opus

    Returns the path as a string when something exists at that location,
    otherwise ``None``.
    """
    sample = VOICES_PATH / f"{jvs_id}.opus"
    return str(sample) if sample.exists() else None
126
+
127
+
128
+ # Load speaker data at startup
129
+ SPEAKER_DATA, JVS_TO_INTERNAL, INTERNAL_TO_INFO = load_speaker_mapping()
130
+ MALE_GALLERY = get_gallery_data_for_category(SPEAKER_DATA, "jvs-men")
131
+ FEMALE_GALLERY = get_gallery_data_for_category(SPEAKER_DATA, "jvs-woman")
132
+
133
+ # Default speaker info (jvs010, internal_id=10)
134
+ DEFAULT_SPEAKER_ID = 10
135
+ DEFAULT_SPEAKER_INFO = INTERNAL_TO_INFO.get(DEFAULT_SPEAKER_ID, {})
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
 
138
  # --- Backend Interaction ---
 
229
  )
230
 
231
  with gr.Column(variant="panel"):
232
+ gr.Markdown("### 1. 話者を選ぶ")
233
+ # Speaker gallery
234
+ with gr.Accordion("話者を変更", open=False):
235
+ gr.Markdown("※ 番号は声の高さを表します(01が最も低く、51/49が最も高い)")
236
+ with gr.Tabs():
237
+ with gr.Tab(f"男性 ({len(MALE_GALLERY)})"):
238
+ male_gallery = gr.Gallery(
239
+ value=[
240
+ (item[0], item[1].split("|")[0]) for item in MALE_GALLERY
241
+ ],
242
+ columns=7,
243
+ rows=7,
244
+ height=2 * ICON_HEIGHT,
245
+ object_fit="fill",
246
+ allow_preview=False,
247
+ show_label=False,
248
+ elem_id="male-gallery",
249
+ )
250
+ with gr.Tab(f"女性 ({len(FEMALE_GALLERY)})"):
251
+ female_gallery = gr.Gallery(
252
+ value=[
253
+ (item[0], item[1].split("|")[0]) for item in FEMALE_GALLERY
254
+ ],
255
+ columns=7,
256
+ rows=8,
257
+ height=2 * ICON_HEIGHT,
258
+ object_fit="cover",
259
+ allow_preview=False,
260
+ show_label=False,
261
+ elem_id="female-gallery",
262
+ )
263
+ # For backward compatibility: direct speaker ID input
264
+ speaker_inp = gr.Slider(
265
+ label="話者ID (直接入力)",
266
+ minimum=1,
267
+ maximum=100,
268
+ step=1,
269
+ value=DEFAULT_SPEAKER_ID,
270
+ )
271
+
272
+ # Selected speaker preview
273
+ with gr.Row(equal_height=True):
274
+ with gr.Column(scale=1, min_width=140):
275
+ selected_icon = gr.Image(
276
+ value=str(
277
+ ICONS_PATH / f"{DEFAULT_SPEAKER_INFO.get('icon_id', '')}.webp"
278
+ )
279
+ if DEFAULT_SPEAKER_INFO
280
+ else None,
281
+ interactive=False,
282
+ label="アイコン",
283
+ height=ICON_HEIGHT,
284
+ buttons=[],
285
+ )
286
+ with gr.Column(scale=3):
287
+ voice_preview = gr.Audio(
288
+ label=f"ボイスサンプル ({DEFAULT_SPEAKER_INFO.get('name', '')})",
289
+ value=get_voice_sample_path(DEFAULT_SPEAKER_INFO.get("jvs_id", "")),
290
+ buttons=[],
291
+ )
292
+ with gr.Column(variant="panel"):
293
+ gr.Markdown("### 2. テキストを入力")
294
  text_inp = gr.Textbox(
295
  show_label=False,
296
  placeholder="読み上げてほしい日本語の文章を入力してください。",
 
306
  inputs=[text_inp],
307
  examples_per_page=20,
308
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
309
  with gr.Column(variant="panel"):
310
  gr.Markdown("### 3. 音声生成")
311
  with gr.Row(equal_height=True):
 
343
  out_usage = gr.Number(label="現在の使用量 (文字)", interactive=False)
344
  out_remaining = gr.Number(label="残り使用可能数 (文字)", interactive=False)
345
 
346
+ # --- Event Handler Functions ---
347
def _handle_gallery_select(evt: gr.SelectData, gallery_data: list):
    """Translate a gallery click into slider and preview updates.

    Args:
        evt: Selection event; ``evt.index`` is used as the position of the
            clicked item in ``gallery_data``.
        gallery_data: ``(icon_path, "name|jvs_id")`` tuples backing the
            gallery that fired the event.

    Returns:
        A 3-tuple of values for ``speaker_inp``, ``selected_icon`` and
        ``voice_preview``. Three no-op ``gr.update()`` values are returned
        when nothing is selected or the caption carries no speaker id.
    """
    if evt.index is None:
        return gr.update(), gr.update(), gr.update()

    # The jvs_id is appended last when captions are built, so split on the
    # final "|" to stay correct even if a display name contains one.
    display_name, separator, jvs_id = gallery_data[evt.index][1].rpartition("|")
    if not separator:
        return gr.update(), gr.update(), gr.update()

    # Get internal ID (jvs006 -> 6); unknown ids fall back to the default
    # speaker instead of a duplicated literal.
    internal_id = JVS_TO_INTERNAL.get(jvs_id, DEFAULT_SPEAKER_ID)

    # Without mapping info there is no icon to show: return None rather than
    # a path to a non-existent ".webp" file, as on_slider_change does.
    info = INTERNAL_TO_INFO.get(internal_id)
    icon_path = str(ICONS_PATH / f"{info['icon_id']}.webp") if info else None

    voice_path = get_voice_sample_path(jvs_id)

    return (
        internal_id,  # speaker_inp
        icon_path,  # selected_icon
        gr.update(label=f"ボイスサンプル ({display_name})", value=voice_path),
    )
376
+
377
def on_male_gallery_select(evt: gr.SelectData):
    """Route a selection in the male gallery to the shared handler."""
    return _handle_gallery_select(evt, gallery_data=MALE_GALLERY)
379
+
380
def on_female_gallery_select(evt: gr.SelectData):
    """Route a selection in the female gallery to the shared handler."""
    return _handle_gallery_select(evt, gallery_data=FEMALE_GALLERY)
382
+
383
def on_slider_change(internal_id: int):
    """Refresh the icon and voice preview after the slider value changes.

    Returns ``(icon, audio_update)``. An id with no entry in
    ``INTERNAL_TO_INFO`` clears both the icon and the sample and shows a
    generic label containing the numeric id.
    """
    info = INTERNAL_TO_INFO.get(internal_id)
    if not info:
        return None, gr.update(label=f"ボイスサンプル (話者 {internal_id})", value=None)

    icon = ICONS_PATH / f"{info['icon_id']}.webp"
    sample = get_voice_sample_path(info["jvs_id"])
    audio_update = gr.update(label=f"ボイスサンプル ({info['name']})", value=sample)
    return str(icon), audio_update
393
+
394
  # --- Event Wiring ---
395
+ # Gallery selection events
396
+ male_gallery.select(
397
+ fn=on_male_gallery_select,
398
+ inputs=[],
399
+ outputs=[speaker_inp, selected_icon, voice_preview],
 
 
400
  api_visibility="private",
401
  )
402
+ female_gallery.select(
403
+ fn=on_female_gallery_select,
404
+ inputs=[],
405
+ outputs=[speaker_inp, selected_icon, voice_preview],
406
+ api_visibility="private",
407
+ )
408
+
409
+ # Slider change -> update preview
410
+ speaker_inp.change(
411
+ fn=on_slider_change,
412
+ inputs=[speaker_inp],
413
+ outputs=[selected_icon, voice_preview],
414
  api_visibility="private",
415
  )
416
 
 
422
  api_visibility="private",
423
  )
424
 
 
 
 
 
 
 
 
 
425
  # Generation
426
  gen_btn.click(
427
  fn=generate_speech,