File size: 9,951 Bytes
f655146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8aa73fd
f655146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8aa73fd
f655146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8aa73fd
f655146
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
06223b0
 
 
f655146
06223b0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
# app.py — Gradio Blocks entry point. UI + wiring only. ZERO model references.
"""Rupkotha (রূপকথা) — a bedtime-story app for kids.

This file orchestrates the UI and chains core functions:
    transcribe() → generate_story() → speak()
It must contain no model names, paths, or model logic — those live only in core/.

Layout: a two-panel "studio" — a Create panel (language/style, pictures, ask) and a
Story panel (text + audio + save) — over a night-sky theme. Session memory uses
gr.State, never browser storage (CLAUDE.md §11).
"""

from pathlib import Path

import gradio as gr

from core.vision_story import generate_story
from core.stt import transcribe
from core.tts import speak
from core.prompts import STYLES

# Language radio: (display label, internal code) pairs. The label is what the
# user sees; the code is the value passed on to the core functions.
_LANGUAGES = [("English", "en"), ("বাংলা", "bn")]
# Style names offered for each language code, in the order STYLES declares them.
_STYLE_CHOICES = {lang: list(styles.keys()) for lang, styles in STYLES.items()}

# Optional stylesheet shipped next to this file; build_ui() checks that it exists.
_CSS_PATH = Path(__file__).parent / "assets" / "styles.css"

HISTORY_SIZE = 3  # how many recent stories to keep (CLAUDE.md §11: last 3)


def _styles_for(language: str):
    """Build the style-dropdown refresh for a newly selected language.

    Looks up the style names for `language` in `_STYLE_CHOICES`, falling back
    to the English list for an unknown code. The returned update replaces the
    dropdown's choices and selects the first one.
    """
    english_styles = _STYLE_CHOICES["en"]
    options = _STYLE_CHOICES.get(language, english_styles)
    first_option = options[0]
    return gr.update(choices=options, value=first_option)


def _preview(files):
    """Mirror the uploaded files into the preview gallery.

    A falsy `files` (None or empty) is normalised to an empty list, which
    also hides the gallery; otherwise the gallery is shown with those files.
    """
    if not files:
        files = []
    has_pictures = bool(files)
    return gr.update(value=files, visible=has_pictures)


def _voice_to_text(audio_path, language):
    """Turn a microphone recording into text for the instruction box.

    Returns the transcript when `transcribe()` yields a non-empty result.
    Otherwise returns a no-op `gr.update()` so whatever is already typed in
    the box is left as it is.
    """
    transcript = transcribe(audio_path, language)
    if not transcript:
        # Empty or failed transcription: do not overwrite the textbox.
        return gr.update()
    return transcript


def _tell_a_story(images, instruction, language, style, child_name):
    """Chain: images + instruction → story text → motherly-voice audio.

    The core calls are expected to degrade gracefully rather than raise, so
    the UI always has a story to show even if the remote model is unreachable
    or audio synthesis fails.

    Args:
        images: Uploaded image file paths, or None when nothing was uploaded.
        instruction: The child's request; None is treated as "".
        language: Internal language code chosen in the radio.
        style: Story style chosen in the dropdown.
        child_name: Optional name to use in the story; None is treated as "".

    Returns:
        A 4-tuple ``(story, wav_path, badge, current)``. ``badge`` is the
        one-line label naming the story model and the voice, and ``current``
        is a dict with keys "story", "audio" and "badge" so the Save button
        can capture the exact result shown.
    """
    # Copy into a fresh list so the core function never sees None.
    image_paths = list(images or [])
    story, model_label = generate_story(
        image_paths=image_paths,
        instruction=instruction or "",
        language=language,
        style=style,
        child_name=child_name or "",
    )
    wav_path, tts_label = speak(story, language)
    # \u3000 is an ideographic (full-width) space, used to pad around the dot.
    badge = f"📖 {model_label}\u3000·\u3000🔊 {tts_label}"
    current = {"story": story, "audio": wav_path, "badge": badge}
    return story, wav_path, badge, current


def _history_updates(history):
    """Expand `history` into one flat list of component updates.

    For each of the HISTORY_SIZE slots, in order, three updates are emitted:
    one for the slot's group (visibility), one for its markdown (the story
    followed by the badge wrapped in a `saved-badge` span) and one for its
    audio (the entry's "audio" value, or None when that key is absent).
    Slots beyond the end of `history` are hidden and cleared.
    """
    flat = []
    for slot in range(HISTORY_SIZE):
        if slot >= len(history):
            # No saved story for this slot: hide the card and clear it.
            flat.extend(
                (
                    gr.update(visible=False),
                    gr.update(value=""),
                    gr.update(value=None),
                )
            )
            continue
        saved = history[slot]
        text = f"{saved['story']}\n\n<span class='saved-badge'>{saved['badge']}</span>"
        flat.extend(
            (
                gr.update(visible=True),
                gr.update(value=text),
                gr.update(value=saved.get("audio")),
            )
        )
    return flat


def _save_story(current, history):
    """Put the current story at the front of the session history.

    Works on a copy of `history` (None is treated as empty). When `current`
    holds a non-empty "story", it is inserted first and the list is trimmed
    to HISTORY_SIZE entries. Returns the new history followed by the flat
    per-slot updates produced by `_history_updates`.
    """
    saved = list(history) if history else []
    if current and current.get("story"):
        saved.insert(0, current)
        del saved[HISTORY_SIZE:]  # drop anything past the newest N
    outputs = [saved]
    outputs.extend(_history_updates(saved))
    return outputs


def build_ui() -> gr.Blocks:
    """Build the Rupkotha Blocks app: layout, session state and event wiring.

    The page consists of a hero banner, a "studio" row with two columns (the
    Create panel first, the Story panel second) and a row of HISTORY_SIZE
    saved-story cards that start hidden. Two gr.State values hold the story
    currently shown and the list of saved stories.

    Event wiring:
        * changing the language refreshes the style dropdown;
        * changing the uploaded files refreshes the preview gallery;
        * finishing a mic recording fills the instruction textbox;
        * the generate button runs `_tell_a_story`;
        * the save button runs `_save_story` and refreshes every saved card.

    Returns:
        The constructed gr.Blocks, not yet queued or launched.
    """
    theme = gr.themes.Soft(
        primary_hue="amber",
        secondary_hue="orange",
        neutral_hue="slate",
        radius_size="lg",
        font=[gr.themes.GoogleFont("Nunito"), "ui-sans-serif", "sans-serif"],
    )
    # Pass the stylesheet only when the file is actually present.
    css_kw = {"css_paths": [str(_CSS_PATH)]} if _CSS_PATH.exists() else {}
    with gr.Blocks(title="রূপকথা · Rupkotha", theme=theme, fill_width=True, **css_kw) as demo:
        # ── Hero ─────────────────────────────────────────────────────────
        gr.HTML(
            """
            <div id="hero">
              <div class="hero-moon">🌙</div>
              <h1>রূপকথা · Rupkotha</h1>
              <p>Show a picture, ask for a story — and hear it told in a warm
                 motherly voice.</p>
            </div>
            """
        )

        with gr.Row(elem_id="studio", equal_height=False):
            # ── Create panel ─────────────────────────────────────────────
            with gr.Column(scale=5, elem_classes="panel"):
                gr.HTML('<div class="panel-head"><span class="step">1</span>Choose</div>')
                with gr.Row():
                    language = gr.Radio(
                        choices=_LANGUAGES, value="en",
                        label="Language · ভাষা", elem_classes="seg",
                    )
                    style = gr.Dropdown(
                        choices=_STYLE_CHOICES["en"], value=_STYLE_CHOICES["en"][0],
                        label="Story style",
                    )

                gr.HTML('<div class="panel-head"><span class="step">2</span>Show your pictures</div>')
                images = gr.File(
                    file_count="multiple",
                    type="filepath",
                    file_types=["image"],
                    label="Drawings or toys — 1 to 4 pictures",
                    elem_classes="upload-box",
                )
                preview = gr.Gallery(
                    label="Your pictures",
                    columns=4,
                    height="auto",
                    object_fit="contain",   # show the whole image, don't crop/trim
                    show_label=True,
                    visible=False,
                    elem_classes="preview",
                )

                gr.HTML('<div class="panel-head"><span class="step">3</span>Ask for a story</div>')
                # NOTE(review): the leading emoji was garbled in the source;
                # a microphone is assumed here — confirm the intended glyph.
                mic = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="🎤 Speak your request (optional) — it fills the box below",
                )
                instruction = gr.Textbox(
                    label="What story do you want?",
                    placeholder="tell me a story about my cat…",
                    lines=2,
                )
                child_name = gr.Textbox(
                    label="Your name (optional)",
                    placeholder="e.g. Rupa — woven into the story",
                    lines=1,
                )
                generate_btn = gr.Button(
                    "✨ Tell me a story", variant="primary", size="lg",
                    elem_id="generate-btn",
                )

            # ── Story panel ──────────────────────────────────────────────
            with gr.Column(scale=6, elem_classes="panel story-panel"):
                gr.HTML('<div class="panel-head">📖 Your story</div>')
                story_out = gr.Textbox(
                    show_label=False,
                    lines=8,
                    max_lines=40,   # grow to fit the whole story (no inner scrollbar)
                    autoscroll=False,
                    placeholder="Your bedtime story will appear here…  ✨",
                    elem_classes="story-text",
                    container=False,
                )
                audio_out = gr.Audio(label="🔊 Listen (press play to replay)", type="filepath")
                badge_out = gr.Markdown(elem_classes="model-badge")
                save_btn = gr.Button("💾 Save this story", elem_id="save-btn")

        # ── Saved stories: last 3, each replayable (gr.State session memory) ─
        current = gr.State(None)
        history = gr.State([])
        gr.HTML('<div class="section-title">🌟 Your saved stories</div>')
        # One (group, markdown, audio) triple per history slot, in display order.
        slots = []
        with gr.Row(elem_id="history-row", equal_height=False):
            for _ in range(HISTORY_SIZE):
                with gr.Column(scale=1, min_width=240):
                    with gr.Group(visible=False, elem_classes="saved-card") as slot_group:
                        slot_md = gr.Markdown(elem_classes="saved-text")
                        slot_audio = gr.Audio(type="filepath", label="Replay")
                slots.append((slot_group, slot_md, slot_audio))

        # ── Wiring ───────────────────────────────────────────────────────
        language.change(_styles_for, inputs=language, outputs=style)

        # Show thumbnails of the uploaded pictures.
        images.change(_preview, inputs=images, outputs=preview)

        # Voice is a bonus: it fills the typed box, which stays primary (§2, §14).
        mic.stop_recording(_voice_to_text, inputs=[mic, language], outputs=instruction)

        generate_btn.click(
            _tell_a_story,
            inputs=[images, instruction, language, style, child_name],
            outputs=[story_out, audio_out, badge_out, current],
        )

        # Flatten slots for the Save outputs: history + (group, md, audio) × N.
        # The order must match what _save_story returns.
        slot_outputs = [comp for slot in slots for comp in slot]
        save_btn.click(
            _save_story,
            inputs=[current, history],
            outputs=[history, *slot_outputs],
        )
    return demo


# Module-level `demo` so Hugging Face Spaces (gradio SDK) can discover it.
# The UI is built and its queue enabled at import time.
demo = build_ui().queue()

# Start the server only when this file is executed directly as a script.
if __name__ == "__main__":
    demo.launch()