# Hugging Face Space: build-small-hackathon/rupkotha (status: Running) — file size: 9,951 bytes

# app.py — Gradio Blocks entry point. UI + wiring only. ZERO model references.
"""Rupkotha (রূপকথা) — a bedtime-story app for kids.

This file orchestrates the UI and chains core functions:
    transcribe() → generate_story() → speak()
It must contain no model names, paths, or model logic — those live only in core/.

Layout: a two-panel "studio" — a Create panel (language/style, pictures, ask) and a
Story panel (text + audio + save) — over a night-sky theme. Session memory uses
gr.State, never browser storage (CLAUDE.md §11).
"""

from pathlib import Path

import gradio as gr

from core.vision_story import generate_story
from core.stt import transcribe
from core.tts import speak
from core.prompts import STYLES

# Choices for the language radio, as (label shown to the user, code handed to core/).
_LANGUAGES = [
    ("English", "en"),
    ("বাংলা", "bn"),
]
# Style names available per language code, taken from the keys of STYLES.
_STYLE_CHOICES = {code: [*options.keys()] for code, options in STYLES.items()}

# Optional stylesheet that lives next to this file under assets/.
_CSS_PATH = Path(__file__).parent.joinpath("assets", "styles.css")

# Number of recent stories kept in the session history (CLAUDE.md §11: last 3).
HISTORY_SIZE = 3


def _styles_for(language: str):
    """Build the style-dropdown update that matches *language*.

    Unknown language codes fall back to the English style list. The first
    style of the chosen list becomes the selected value.
    """
    # Looked up unconditionally, so a missing "en" entry always surfaces.
    fallback = _STYLE_CHOICES["en"]
    available = _STYLE_CHOICES[language] if language in _STYLE_CHOICES else fallback
    first_style = available[0]
    return gr.update(choices=available, value=first_style)


def _preview(files):
    """Mirror the uploaded files into the preview gallery.

    The gallery is shown only when there is at least one file; with no
    upload it is emptied and hidden.
    """
    shown = files if files else []
    has_pictures = True if shown else False
    return gr.update(value=shown, visible=has_pictures)


def _voice_to_text(audio_path, language):
    """Transcribe a mic recording into the instruction box.

    Args:
        audio_path: Path of the recorded audio file, or a falsy value when
            there is no recording.
        language: Language code forwarded to ``transcribe``.

    Returns:
        The transcribed text with surrounding whitespace removed, or a no-op
        ``gr.update()`` when there is nothing usable — no recording, or an
        empty, whitespace-only or failed transcription. The no-op leaves
        whatever the child already typed untouched.
    """
    if not audio_path:
        # Nothing was recorded: skip the transcription call entirely.
        return gr.update()
    # A whitespace-only result is truthy as a raw string; strip it first so it
    # cannot wipe out the typed instruction.
    text = (transcribe(audio_path, language) or "").strip()
    return text if text else gr.update()


def _tell_a_story(images, instruction, language, style, child_name):
    """Run the story pipeline: pictures + request → story text → narrated audio.

    The core calls are expected to degrade gracefully rather than raise, so a
    story is always shown even when the backend or audio synthesis is
    unavailable.

    Returns:
        ``(story, wav_path, badge, current)`` where ``badge`` names the story
        and voice backends used, and ``current`` is a dict snapshot
        (``story`` / ``audio`` / ``badge``) that the Save button stores as-is.
    """
    story, model_label = generate_story(
        image_paths=list(images or []),  # fresh list; None means no pictures
        instruction=instruction if instruction else "",
        language=language,
        style=style,
        child_name=child_name if child_name else "",
    )
    wav_path, tts_label = speak(story, language)

    badge = f"📖 {model_label}　·　🔊 {tts_label}"
    return story, wav_path, badge, {"story": story, "audio": wav_path, "badge": badge}


def _history_updates(history):
    """Turn `history` into a flat list of slot updates.

    For each of the HISTORY_SIZE slots, three updates are emitted in order:
    the card group (visibility), the markdown body, and the replay audio.
    Slots beyond the end of `history` are hidden and cleared.
    """
    flat = []
    for slot in range(HISTORY_SIZE):
        if slot >= len(history):
            # Unused slot: hide the card and blank its contents.
            flat.extend(
                (
                    gr.update(visible=False),
                    gr.update(value=""),
                    gr.update(value=None),
                )
            )
            continue

        saved = history[slot]
        text = f"{saved['story']}\n\n<span class='saved-badge'>{saved['badge']}</span>"
        flat.extend(
            (
                gr.update(visible=True),
                gr.update(value=text),
                gr.update(value=saved.get("audio")),
            )
        )
    return flat


def _save_story(current, history):
    """Prepend the current story to the session history (newest first, max N).

    Args:
        current: Snapshot dict of the story on screen (``story`` / ``audio`` /
            ``badge``), or ``None`` when nothing has been generated yet.
        history: Previously saved snapshots, newest first (may be ``None``).

    Returns:
        A list whose first item is the new history, followed by the flattened
        per-slot updates from ``_history_updates``.

    Nothing is saved when there is no story text. Nothing is saved either when
    the newest entry already equals ``current``, so pressing Save repeatedly on
    the same story does not fill the slots with duplicates and evict older
    stories.
    """
    saved = list(history or [])  # copy: never mutate the incoming state value
    has_story = bool(current and current.get("story"))
    already_newest = bool(saved) and saved[0] == current
    if has_story and not already_newest:
        saved = [current, *saved][:HISTORY_SIZE]
    return [saved, *_history_updates(saved)]


def build_ui() -> gr.Blocks:
    """Assemble the Gradio Blocks app and wire its events.

    Layout, top to bottom: a hero banner, a two-column "studio" row (Create
    panel on the left, Story panel on the right), and a row of HISTORY_SIZE
    saved-story cards that start hidden. Session data (the story currently on
    screen and the saved history) is held in gr.State components.

    Returns:
        The constructed gr.Blocks. It is neither queued nor launched here.
    """
    theme = gr.themes.Soft(
        primary_hue="amber",
        secondary_hue="orange",
        neutral_hue="slate",
        radius_size="lg",
        font=[gr.themes.GoogleFont("Nunito"), "ui-sans-serif", "sans-serif"],
    )
    # Pass the stylesheet only if it exists on disk; otherwise build without custom CSS.
    css_kw = {"css_paths": [str(_CSS_PATH)]} if _CSS_PATH.exists() else {}
    with gr.Blocks(title="রূপকথা · Rupkotha", theme=theme, fill_width=True, **css_kw) as demo:
        # ── Hero ─────────────────────────────────────────────────────────
        gr.HTML(
            """
            <div id="hero">
              <div class="hero-moon">🌙</div>
              <h1>রূপকথা · Rupkotha</h1>
              <p>Show a picture, ask for a story — and hear it told in a warm
                 motherly voice.</p>
            </div>
            """
        )

        with gr.Row(elem_id="studio", equal_height=False):
            # ── Create panel ─────────────────────────────────────────────
            with gr.Column(scale=5, elem_classes="panel"):
                gr.HTML('<div class="panel-head"><span class="step">1</span>Choose</div>')
                with gr.Row():
                    # Radio choices are (label, code) pairs; the value is the code.
                    language = gr.Radio(
                        choices=_LANGUAGES, value="en",
                        label="Language · ভাষা", elem_classes="seg",
                    )
                    # Starts with the English styles; refreshed by language.change below.
                    style = gr.Dropdown(
                        choices=_STYLE_CHOICES["en"], value=_STYLE_CHOICES["en"][0],
                        label="Story style",
                    )

                gr.HTML('<div class="panel-head"><span class="step">2</span>Show your pictures</div>')
                images = gr.File(
                    file_count="multiple",
                    type="filepath",
                    file_types=["image"],
                    label="Drawings or toys — 1 to 4 pictures",
                    elem_classes="upload-box",
                )
                # Hidden until _preview receives at least one uploaded file.
                preview = gr.Gallery(
                    label="Your pictures",
                    columns=4,
                    height="auto",
                    object_fit="contain",   # show the whole image, don't crop/trim
                    show_label=True,
                    visible=False,
                    elem_classes="preview",
                )

                gr.HTML('<div class="panel-head"><span class="step">3</span>Ask for a story</div>')
                mic = gr.Audio(
                    sources=["microphone"],
                    type="filepath",
                    label="🎤 Speak your request (optional) — it fills the box below",
                )
                instruction = gr.Textbox(
                    label="What story do you want?",
                    placeholder="tell me a story about my cat…",
                    lines=2,
                )
                child_name = gr.Textbox(
                    label="Your name (optional)",
                    placeholder="e.g. Rupa — woven into the story",
                    lines=1,
                )
                generate_btn = gr.Button(
                    "✨ Tell me a story", variant="primary", size="lg",
                    elem_id="generate-btn",
                )

            # ── Story panel ──────────────────────────────────────────────
            with gr.Column(scale=6, elem_classes="panel story-panel"):
                gr.HTML('<div class="panel-head">📖 Your story</div>')
                story_out = gr.Textbox(
                    show_label=False,
                    lines=8,
                    max_lines=40,   # grow to fit the whole story (no inner scrollbar)
                    autoscroll=False,
                    placeholder="Your bedtime story will appear here…  ✨",
                    elem_classes="story-text",
                    container=False,
                )
                audio_out = gr.Audio(label="🔊 Listen (press play to replay)", type="filepath")
                badge_out = gr.Markdown(elem_classes="model-badge")
                save_btn = gr.Button("💾 Save this story", elem_id="save-btn")

        # ── Saved stories: last 3, each replayable (gr.State session memory) ─
        # `current` holds the snapshot written by _tell_a_story; `history` holds
        # the list of saved snapshots maintained by _save_story.
        current = gr.State(None)
        history = gr.State([])
        gr.HTML('<div class="section-title">🌟 Your saved stories</div>')
        # One (group, markdown, audio) triple per history slot, in display order.
        slots = []
        with gr.Row(elem_id="history-row", equal_height=False):
            for _ in range(HISTORY_SIZE):
                with gr.Column(scale=1, min_width=240):
                    # Each card starts hidden; _history_updates toggles visibility.
                    with gr.Group(visible=False, elem_classes="saved-card") as slot_group:
                        slot_md = gr.Markdown(elem_classes="saved-text")
                        slot_audio = gr.Audio(type="filepath", label="Replay")
                slots.append((slot_group, slot_md, slot_audio))

        # ── Wiring ───────────────────────────────────────────────────────
        # Swap the style choices whenever the language selection changes.
        language.change(_styles_for, inputs=language, outputs=style)

        # Show thumbnails of the uploaded pictures.
        images.change(_preview, inputs=images, outputs=preview)

        # Voice is a bonus: it fills the typed box, which stays primary (§2, §14).
        mic.stop_recording(_voice_to_text, inputs=[mic, language], outputs=instruction)

        # Generate: fills the story text, audio and badge, and stores the
        # snapshot in `current` for a later Save.
        generate_btn.click(
            _tell_a_story,
            inputs=[images, instruction, language, style, child_name],
            outputs=[story_out, audio_out, badge_out, current],
        )

        # Flatten slots for the Save outputs: history + (group, md, audio) × N.
        # The order must match the list returned by _save_story.
        slot_outputs = [comp for slot in slots for comp in slot]
        save_btn.click(
            _save_story,
            inputs=[current, history],
            outputs=[history, *slot_outputs],
        )
    return demo


# Module-level `demo` so Hugging Face Spaces (gradio SDK) can discover it.
# Built at import time, with `.queue()` applied to the Blocks from build_ui().
demo = build_ui().queue()

if __name__ == "__main__":
    # Direct execution: launch the app with default launch() arguments.
    demo.launch()