File size: 5,559 Bytes
4d03adc
 
 
 
 
 
 
 
 
 
 
 
49c59a3
4d03adc
49c59a3
4d03adc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49c59a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d03adc
 
49c59a3
4d03adc
 
 
 
 
 
 
49c59a3
4d03adc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
49c59a3
 
 
4d03adc
49c59a3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d03adc
 
 
 
49c59a3
4d03adc
 
 
 
 
 
 
49c59a3
4d03adc
 
 
 
 
 
 
 
 
49c59a3
 
4d03adc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
"""
ui/sidebar.py
──────────────────────────────────────────────────────────────────────────────
VoiceVerse Pro — Configuration Sidebar
"""

from __future__ import annotations

import os

import streamlit as st

from modules.llm_backbone import SUPPORTED_MODELS
from modules.tts_engine import TTSBackend
from ui.state import SidebarConfig, OutputMode


def render(current_stage: int) -> SidebarConfig:
    """Draw the configuration sidebar and return the values the user selected.

    All widgets are created inside ``st.sidebar``. Besides drawing, each call
    has two side effects:

    * a non-empty token is written to ``os.environ["HUGGINGFACEHUB_API_TOKEN"]``;
    * the selected output mode is written to ``st.session_state["output_mode"]``.

    Args:
        current_stage: Pipeline stage shown in the sidebar footer. Values
            outside ``0..4`` are clamped into that range so an unexpected
            stage cannot raise ``IndexError`` (or wrap around when negative).

    Returns:
        A ``SidebarConfig`` populated from the current widget values. The
        speaker IDs keep their defaults (7306 / 1580 / 7306) unless the
        SpeechT5 backend is selected and the matching slider is shown.
    """
    with st.sidebar:
        st.markdown("### ⚙️ Configuration")
        st.divider()

        # ── Auth ──────────────────────────────────────────────────────────────
        st.markdown("**🔑 Hugging Face API Token**")
        # Strip whitespace: tokens pasted from a browser or terminal often
        # carry a trailing space/newline that would otherwise be exported.
        hf_token = st.text_input(
            "HF Token",
            value=os.getenv("HUGGINGFACEHUB_API_TOKEN", ""),
            type="password",
            label_visibility="collapsed",
            placeholder="hf_…",
        ).strip()
        if hf_token:
            # NOTE(review): os.environ is process-wide, so this token is
            # visible to everything running in this process, not only to the
            # current browser session. Confirm that is acceptable for
            # multi-user deployments.
            os.environ["HUGGINGFACEHUB_API_TOKEN"] = hf_token

        st.divider()

        # ── Output Mode ───────────────────────────────────────────────────────
        st.markdown("**🎭 Output Mode**")
        # Read current value from session state (set by main-area toggle);
        # fall back to the first option when the stored value is unknown.
        current_mode = st.session_state.get("output_mode", "Audio Transcript")
        mode_options = [m.value for m in OutputMode]
        mode_index = mode_options.index(current_mode) if current_mode in mode_options else 0
        mode_label = st.radio(
            "Output Mode",
            options=mode_options,
            index=mode_index,
            label_visibility="collapsed",
            help=(
                "Also controllable via the main toggle above the pipeline. "
                "Both controls are in sync."
            ),
        )
        # Write back to session state so main-area toggle reflects sidebar change
        st.session_state["output_mode"] = mode_label
        output_mode = OutputMode(mode_label)

        st.divider()

        # ── RAG ───────────────────────────────────────────────────────────────
        st.markdown("**🔍 RAG Parameters**")
        top_k = st.slider("Top-K Chunks", 1, 8, 4)
        chunk_size = st.slider("Chunk Size", 400, 2000, 1000, step=100)
        chunk_overlap = st.slider("Chunk Overlap", 0, 300, 100, step=50)

        st.divider()

        # ── LLM ───────────────────────────────────────────────────────────────
        st.markdown("**🧠 LLM Settings**")
        model_id = st.selectbox("Model", options=SUPPORTED_MODELS, index=0)
        temperature = st.slider("Temperature", 0.1, 1.2, 0.65, step=0.05)
        max_tokens = st.slider("Max New Tokens", 256, 2048, 1024, step=128)
        target_words = st.slider("Target Script Words", 100, 800, 400, step=50)

        st.divider()

        # ── TTS ───────────────────────────────────────────────────────────────
        st.markdown("**🔊 TTS Backend**")
        tts_label = st.selectbox(
            "TTS Engine",
            options=[b.value for b in TTSBackend],
            index=0,
            label_visibility="collapsed",
        )
        tts_backend = TTSBackend(tts_label)

        # Defaults returned when the corresponding slider is not rendered.
        speaker_id = 7306
        female_speaker_id = 1580
        male_speaker_id = 7306

        # Speaker sliders are only offered for the SpeechT5 backend; which
        # ones appear depends on the output mode.
        # NOTE(review): with step=100 the defaults 7306 / 1580 do not sit on
        # the slider grid — confirm the user can return to them after moving
        # a slider.
        if tts_backend == TTSBackend.SPEECHT5:
            if output_mode == OutputMode.TRANSCRIPT:
                speaker_id = st.slider(
                    "Speaker ID (xvector)", 0, 7500, 7306, step=100,
                    help="CMU Arctic speaker index for the narrator voice.",
                )
            else:
                st.markdown("*Podcast voices (CMU Arctic xvectors):*")
                female_speaker_id = st.slider(
                    "🎙️ HOST — Female Voice ID", 0, 7500, 1580, step=100,
                    help="Speaker embedding for the female host.",
                )
                male_speaker_id = st.slider(
                    "🎙️ GUEST — Male Voice ID", 0, 7500, 7306, step=100,
                    help="Speaker embedding for the male guest.",
                )

        st.divider()

        # One icon per stage 0..4. Clamp so an out-of-range stage neither
        # raises IndexError nor (for negatives) picks an icon from the end.
        _stage_icons = ["⭕", "🔵", "🟡", "🟠", "🟢"]
        _last_stage = len(_stage_icons) - 1
        _stage = max(0, min(current_stage, _last_stage))
        st.markdown(f"**Pipeline:** {_stage_icons[_stage]} Stage {_stage}/{_last_stage}")
        st.markdown(
            "<small style='color:#555'>VoiceVerse Pro · 2026 Stable</small>",
            unsafe_allow_html=True,
        )

    return SidebarConfig(
        hf_token=hf_token,
        output_mode=output_mode,
        top_k=top_k,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        model_id=model_id,
        temperature=temperature,
        max_tokens=max_tokens,
        target_words=target_words,
        tts_backend=tts_backend,
        speaker_id=speaker_id,
        female_speaker_id=female_speaker_id,
        male_speaker_id=male_speaker_id,
    )