Spaces:

Parakeet-Inc
/

paratts_demo

Running

App Files Files Community

henomoto committed 22 days ago

Commit

5e1ba78

verified ·

1 Parent(s): 9c80886

Upload app.py with huggingface_hub

Browse files

Files changed (1) hide show

app.py +200 -156

app.py CHANGED Viewed

@@ -1,6 +1,8 @@
 import datetime
 import os
 import random
 import gradio as gr
 import jwt
@@ -18,6 +20,8 @@ MAX_QUEUE_SIZE = 15
 # Global Client Instance
 CLIENT = None
 def get_client():
     global CLIENT
@@ -60,114 +64,75 @@ examples = [
     "おい！そんなこと今更言ったってしょうがねぇだろ！いい大人なんだから、さっさと諦めろよ！",
 ]
-# Mapping from rank (1-100) to speaker_id
-# rank 1 = highest pitch, rank 100 = lowest pitch
-rank_to_num = [
-    10,
-    14,
-    93,
-    90,
-    36,
-    83,
-    26,
-    15,
-    4,
-    96,
-    30,
-    67,
-    66,
-    51,
-    65,
-    27,
-    24,
-    39,
-    61,
-    85,
-    55,
-    72,
-    8,
-    84,
-    25,
-    40,
-    38,
-    95,
-    53,
-    63,
-    94,
-    19,
-    59,
-    56,
-    60,
-    69,
-    29,
-    58,
-    18,
-    17,
-    35,
-    2,
-    57,
-    64,
-    91,
-    82,
-    43,
-    62,
-    7,
-    92,
-    16,
-    98,
-    32,
-    11,
-    99,
-    97,
-    52,
-    77,
-    87,
-    3,
-    75,
-    13,
-    80,
-    1,
-    5,
-    33,
-    47,
-    20,
-    76,
-    41,
-    46,
-    79,
-    73,
-    45,
-    50,
-    81,
-    49,
-    70,
-    86,
-    23,
-    44,
-    54,
-    68,
-    42,
-    88,
-    100,
-    9,
-    89,
-    22,
-    31,
-    12,
-    74,
-    34,
-    71,
-    28,
-    78,
-    48,
-    37,
-    21,
-    6,
-]
-def rank_to_speaker_id(rank: int) -> int:
-    return rank_to_num[rank - 1]
 # --- Backend Interaction ---
@@ -264,7 +229,68 @@ with gr.Blocks(title="Paratts 音声合成デモ") as app:
         )
     with gr.Column(variant="panel"):
-        gr.Markdown("### 1. テキストを入力")
         text_inp = gr.Textbox(
             show_label=False,
             placeholder="読み上げてほしい日本語の文章を入力してください。",
@@ -280,29 +306,6 @@ with gr.Blocks(title="Paratts 音声合成デモ") as app:
                 inputs=[text_inp],
                 examples_per_page=20,
             )
-    with gr.Column(variant="panel"):
-        gr.Markdown("### 2. 話者を選ぶ")
-        with gr.Row(equal_height=True):
-            with gr.Column():
-                speaker_inp = gr.Slider(
-                    label="話者ID",
-                    minimum=1,
-                    maximum=100,
-                    step=1,
-                    value=10,
-                )
-                speaker_random_btn = gr.Button("🎲ランダム話者", variant="secondary")
-            with gr.Column():
-                speaker_rank_inp = gr.Slider(
-                    label="声の高さで選ぶ",
-                    info="1 = 一番声が高い, 100 = 一番声が低い。選んだら「🔄話者IDに変換」を押してください。",
-                    minimum=1,
-                    maximum=100,
-                    step=1,
-                    value=1,
-                )
-                rank_to_spk_id_btn = gr.Button("🔄話者IDに変換")
     with gr.Column(variant="panel"):
         gr.Markdown("### 3. 音声生成")
         with gr.Row(equal_height=True):
@@ -340,25 +343,74 @@ with gr.Blocks(title="Paratts 音声合成デモ") as app:
             out_usage = gr.Number(label="現在の使用量 (文字)", interactive=False)
             out_remaining = gr.Number(label="残り使用可能数 (文字)", interactive=False)
     # --- Event Wiring ---
-    # Speaker Rank change
-    speaker_rank_inp.input(
-        fn=lambda: (
-            gr.Button(variant="primary"),
-            gr.Button(variant="secondary", interactive=False),
-        ),
-        outputs=[rank_to_spk_id_btn, gen_btn],
         api_visibility="private",
     )
-    # Rank to Speaker ID
-    rank_to_spk_id_btn.click(
-        fn=lambda rank: (
-            rank_to_speaker_id(rank),
-            gr.Button(variant="secondary"),
-            gr.Button(variant="primary", interactive=True),
-        ),
-        inputs=[speaker_rank_inp],
-        outputs=[speaker_inp, rank_to_spk_id_btn, gen_btn],
         api_visibility="private",
     )
@@ -370,14 +422,6 @@ with gr.Blocks(title="Paratts 音声合成デモ") as app:
         api_visibility="private",
     )
-    # Random Speaker
-    speaker_random_btn.click(
-        fn=lambda: random.randint(1, 100),
-        inputs=[],
-        outputs=[speaker_inp],
-        api_visibility="private",
-    )
     # Generation
     gen_btn.click(
         fn=generate_speech,

 import datetime
+import json
 import os
 import random
+from pathlib import Path
 import gradio as gr
 import jwt
 # Global Client Instance
 CLIENT = None
+ICON_HEIGHT = 140
 def get_client():
     global CLIENT
     "おい！そんなこと今更言ったってしょうがねぇだろ！いい大人なんだから、さっさと諦めろよ！",
 ]
+# --- Speaker Mapping Data ---
+# Use relative paths for Gradio compatibility (works on HF Spaces where cwd is /app)
+ASSETS_PATH = Path("assets")
+ICONS_PATH = ASSETS_PATH / "icons"
+VOICES_PATH = ASSETS_PATH / "voices"
+def load_speaker_mapping() -> tuple[list, dict, dict]:
+    """Read ``mapping.json`` and derive the speaker lookup tables.
+
+    The file is read as a list of categories, each with an ``id`` and a
+    ``speakers`` list whose entries carry ``id`` (e.g. ``"jvs006"``),
+    ``name`` and ``icon_id``.
+
+    Returns:
+        A ``(data, jvs_to_internal, internal_to_info)`` tuple: the parsed
+        JSON unchanged, a ``"jvs006" -> 6`` map, and a ``6 -> {...}`` map
+        with the keys ``jvs_id``, ``name``, ``icon_id`` and ``category_id``.
+    """
+    with open("mapping.json", "r", encoding="utf-8") as mapping_file:
+        categories = json.load(mapping_file)
+    id_by_jvs: dict[str, int] = {}
+    info_by_id: dict[int, dict] = {}
+    # Character voices are left out: the backend does not support them yet.
+    supported = (cat for cat in categories if cat["id"] != "character")
+    for cat in supported:
+        for entry in cat["speakers"]:
+            jvs_key = entry["id"]
+            # Drop the three-character prefix: "jvs006" -> 6
+            number = int(jvs_key[3:])
+            id_by_jvs[jvs_key] = number
+            info_by_id[number] = dict(
+                jvs_id=jvs_key,
+                name=entry["name"],
+                icon_id=entry["icon_id"],
+                category_id=cat["id"],
+            )
+    return categories, id_by_jvs, info_by_id
+def get_gallery_data_for_category(
+    speaker_data: list, category_id: str
+) -> list[tuple[str, str]]:
+    """Build gallery entries for every speaker in the matching category.
+
+    Each entry is ``(icon_path, "name|jvs_id")``; the speaker id rides along
+    in the caption so it can be recovered when an item is selected.
+    """
+    return [
+        (
+            str(ICONS_PATH / f"{member['icon_id']}.webp"),
+            f"{member['name']}|{member['id']}",
+        )
+        for group in speaker_data
+        if group["id"] == category_id
+        for member in group["speakers"]
+    ]
+def get_voice_sample_path(jvs_id: str) -> str | None:
+    """Return the sample clip path for *jvs_id*, or ``None`` if no file exists.
+
+    Example: ``"jvs006"`` maps to ``assets/voices/jvs006.opus``.
+    """
+    sample = VOICES_PATH / f"{jvs_id}.opus"
+    return str(sample) if sample.exists() else None
+# Load speaker data once at import time; the lookups and galleries below are built from it.
+SPEAKER_DATA, JVS_TO_INTERNAL, INTERNAL_TO_INFO = load_speaker_mapping()
+MALE_GALLERY = get_gallery_data_for_category(SPEAKER_DATA, "jvs-men")
+FEMALE_GALLERY = get_gallery_data_for_category(SPEAKER_DATA, "jvs-woman")
+# Default speaker (jvs010, internal_id=10); the info dict is empty if the mapping has no such id.
+DEFAULT_SPEAKER_ID = 10
+DEFAULT_SPEAKER_INFO = INTERNAL_TO_INFO.get(DEFAULT_SPEAKER_ID, {})
 # --- Backend Interaction ---
         )
     with gr.Column(variant="panel"):
+        gr.Markdown("### 1. 話者を選ぶ")
+        # Speaker gallery
+        with gr.Accordion("話者を変更", open=False):
+            gr.Markdown("※ 番号は声の高さを表します（01が最も低く、51/49が最も高い）")
+            with gr.Tabs():
+                with gr.Tab(f"男性 ({len(MALE_GALLERY)})"):
+                    male_gallery = gr.Gallery(
+                        value=[
+                            (item[0], item[1].split("|")[0]) for item in MALE_GALLERY
+                        ],
+                        columns=7,
+                        rows=7,
+                        height=2 * ICON_HEIGHT,
+                        object_fit="fill",
+                        allow_preview=False,
+                        show_label=False,
+                        elem_id="male-gallery",
+                    )
+                with gr.Tab(f"女性 ({len(FEMALE_GALLERY)})"):
+                    female_gallery = gr.Gallery(
+                        value=[
+                            (item[0], item[1].split("|")[0]) for item in FEMALE_GALLERY
+                        ],
+                        columns=7,
+                        rows=8,
+                        height=2 * ICON_HEIGHT,
+                        object_fit="cover",
+                        allow_preview=False,
+                        show_label=False,
+                        elem_id="female-gallery",
+                    )
+            # For backward compatibility: direct speaker ID input
+            speaker_inp = gr.Slider(
+                label="話者ID (直接入力)",
+                minimum=1,
+                maximum=100,
+                step=1,
+                value=DEFAULT_SPEAKER_ID,
+            )
+        # Selected speaker preview
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1, min_width=140):
+                selected_icon = gr.Image(
+                    value=str(
+                        ICONS_PATH / f"{DEFAULT_SPEAKER_INFO.get('icon_id', '')}.webp"
+                    )
+                    if DEFAULT_SPEAKER_INFO
+                    else None,
+                    interactive=False,
+                    label="アイコン",
+                    height=ICON_HEIGHT,
+                    buttons=[],
+                )
+            with gr.Column(scale=3):
+                voice_preview = gr.Audio(
+                    label=f"ボイスサンプル ({DEFAULT_SPEAKER_INFO.get('name', '')})",
+                    value=get_voice_sample_path(DEFAULT_SPEAKER_INFO.get("jvs_id", "")),
+                    buttons=[],
+                )
+    with gr.Column(variant="panel"):
+        gr.Markdown("### 2. テキストを入力")
         text_inp = gr.Textbox(
             show_label=False,
             placeholder="読み上げてほしい日本語の文章を入力してください。",
                 inputs=[text_inp],
                 examples_per_page=20,
             )
     with gr.Column(variant="panel"):
         gr.Markdown("### 3. 音声生成")
         with gr.Row(equal_height=True):
             out_usage = gr.Number(label="現在の使用量 (文字)", interactive=False)
             out_remaining = gr.Number(label="残り使用可能数 (文字)", interactive=False)
+    # --- Event Handler Functions ---
+    def _handle_gallery_select(evt: gr.SelectData, gallery_data: list):
+        """Translate a gallery click into slider, icon and voice-preview updates.
+
+        Args:
+            evt: Gradio selection event; ``evt.index`` is used as the position
+                of the clicked item in ``gallery_data``.
+            gallery_data: ``(icon_path, "name|jvs_id")`` tuples backing the
+                gallery that fired the event.
+
+        Returns:
+            A 3-tuple for ``speaker_inp``, ``selected_icon`` and
+            ``voice_preview``. All three are no-op ``gr.update()`` values when
+            the selection cannot be resolved to a known speaker.
+        """
+        unchanged = (gr.update(), gr.update(), gr.update())
+        index = evt.index
+        # Ignore cleared selections and anything that is not a valid position.
+        if not isinstance(index, int) or not 0 <= index < len(gallery_data):
+            return unchanged
+        # The caption is "name|jvs_id"; split on the LAST "|" so a display
+        # name that itself contains "|" is kept intact.
+        display_name, separator, jvs_id = gallery_data[index][1].rpartition("|")
+        if not separator:
+            return unchanged
+        # Resolve the internal ID (jvs006 -> 6) and its metadata.
+        internal_id = JVS_TO_INTERNAL.get(jvs_id)
+        info = INTERNAL_TO_INFO.get(internal_id)
+        if info is None:
+            # Unknown speaker: leave the UI alone rather than silently pointing
+            # the slider at a different speaker than the one that was clicked.
+            return unchanged
+        return (
+            internal_id,  # speaker_inp
+            str(ICONS_PATH / f"{info['icon_id']}.webp"),  # selected_icon
+            gr.update(
+                label=f"ボイスサンプル ({display_name})",
+                value=get_voice_sample_path(jvs_id),
+            ),  # voice_preview
+        )
+    def on_male_gallery_select(evt: gr.SelectData):
+        """Selection handler for the male gallery; resolves against MALE_GALLERY."""
+        return _handle_gallery_select(evt=evt, gallery_data=MALE_GALLERY)
+    def on_female_gallery_select(evt: gr.SelectData):
+        """Selection handler for the female gallery; resolves against FEMALE_GALLERY."""
+        return _handle_gallery_select(evt=evt, gallery_data=FEMALE_GALLERY)
+    def on_slider_change(internal_id: int):
+        """Refresh the icon and voice preview after the speaker slider changes.
+
+        Returns ``(icon_path, audio_update)``. For an id with no entry in
+        INTERNAL_TO_INFO the icon and sample are cleared and the preview label
+        falls back to one built from the raw id.
+        """
+        speaker = INTERNAL_TO_INFO.get(internal_id)
+        if not speaker:
+            fallback_label = f"ボイスサンプル (話者 {internal_id})"
+            return None, gr.update(label=fallback_label, value=None)
+        icon = str(ICONS_PATH / f"{speaker['icon_id']}.webp")
+        sample = get_voice_sample_path(speaker["jvs_id"])
+        preview = gr.update(label=f"ボイスサンプル ({speaker['name']})", value=sample)
+        return icon, preview
     # --- Event Wiring ---
+    # Gallery selection events
+    male_gallery.select(
+        fn=on_male_gallery_select,
+        inputs=[],
+        outputs=[speaker_inp, selected_icon, voice_preview],
         api_visibility="private",
     )
+    female_gallery.select(
+        fn=on_female_gallery_select,
+        inputs=[],
+        outputs=[speaker_inp, selected_icon, voice_preview],
+        api_visibility="private",
+    )
+    # Slider change -> update preview
+    speaker_inp.change(
+        fn=on_slider_change,
+        inputs=[speaker_inp],
+        outputs=[selected_icon, voice_preview],
         api_visibility="private",
     )
         api_visibility="private",
     )
     # Generation
     gen_btn.click(
         fn=generate_speech,