tinytutor / app.py
cwattsnogueira's picture
Enhanced AudioScriptWriterAgent prompt. (#2)
edbb725 verified
import asyncio
import io
import os
import tempfile

import google.generativeai as genai
import gradio as gr
from google.adk.agents import Agent
from google.adk.models.google_llm import Gemini
from google.adk.runners import InMemoryRunner
from google.adk.tools import google_search
from google.cloud import texttospeech
from google.genai import types
from pydub import AudioSegment
# TinyTutor App
# --- Configure API Keys ---
# Both secrets come from the environment; fail fast at import time when either
# one is missing so the app never starts half-configured.
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
if not GOOGLE_API_KEY:
    raise RuntimeError("❌ Missing GOOGLE_API_KEY environment variable.")
genai.configure(api_key=GOOGLE_API_KEY)

SERVICE_ACCOUNT_JSON = os.getenv("GCP_VI_SERVICE_ACCOUNT_JSON")
if not SERVICE_ACCOUNT_JSON:
    raise RuntimeError("❌ Missing GCP_VI_SERVICE_ACCOUNT_JSON environment variable.")

# The service-account key is a secret. Write it to a freshly created temp file
# (mkstemp makes it readable and writable by the owner only) instead of a
# fixed, default-permission file in the working directory next to the app.
_creds_fd, _creds_path = tempfile.mkstemp(
    prefix="tinytutor-tss-agent-", suffix=".json"
)
with os.fdopen(_creds_fd, "w", encoding="utf-8") as _creds_file:
    _creds_file.write(SERVICE_ACCOUNT_JSON)

# The TTS client is created without explicit credentials, so it is pointed at
# the key file through this environment variable before construction.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = _creds_path
tts_client = texttospeech.TextToSpeechClient()
# --- Retry Options ---
# HTTP retry policy handed to the Gemini model used by the Pedagogy agent.
retry_config = types.HttpRetryOptions(
    http_status_codes=[429, 500, 503, 504],
    initial_delay=1,
    exp_base=7,
    attempts=5,
)

# --- Pedagogy Agent ---
# Agent that writes the ELI5 explanation; google_search is its only tool.
_pedagogy_model = Gemini(model="gemini-2.5-flash-lite", retry_options=retry_config)
pedagogy_agent = Agent(
    name="PedagogyAgent",
    description="Explains topics in simple ELI5 style.",
    instruction="Explain the topic like I'm 5. Use google_search if needed.",
    model=_pedagogy_model,
    tools=[google_search],
)

# Runner that run_pedagogy_async uses to execute the agent.
runner = InMemoryRunner(agent=pedagogy_agent)
async def run_pedagogy_async(topic: str) -> str:
    """Run the Pedagogy agent on *topic* and return its explanation text.

    Args:
        topic: The subject the agent should explain.

    Returns:
        The text of the most recent event that carries any text, with all of
        that event's text parts concatenated.

    Raises:
        RuntimeError: If none of the returned events contains text.
    """
    events = await runner.run_debug(topic)
    # NOTE(review): run_debug looks like a debugging helper; confirm whether
    # it reuses one session across calls, which would mix topics together.

    # Walk backwards so the final answer wins over earlier events, and skip
    # events whose content or parts are missing instead of indexing blindly.
    for event in reversed(events or []):
        content = getattr(event, "content", None)
        parts = getattr(content, "parts", None) or []
        text = "".join(
            part.text
            for part in parts
            if getattr(part, "text", None) and not getattr(part, "thought", False)
        )
        if text.strip():
            return text
    raise RuntimeError("PedagogyAgent returned no text response.")
# --- ScriptWriter Agent ---
# System instruction given to the ScriptWriter model in run_scriptwriter.
# The text is sent to the model as-is, so any edit here changes the
# generated script.
SCRIPTWRITER_SYSTEM_PROMPT = """
You are a Teacher.
Your role is to take a simplified explanation created by the Pedagogy Agent and turn it into a clear, friendly teaching script suitable for a young child around the age of 5.
The script you produce will be used by a Text-to-Speech (TTS) system, so write in a way that sounds natural when spoken aloud.
Follow these steps:
1. Read the simplified explanation provided by the Pedagogy Agent.
2. Transform it into a spoken-style teaching script that:
- Uses short, clear sentences.
- Uses warm, encouraging language.
- Keeps a playful, curious tone suitable for a young child.
- Avoids complex words unless they were already explained.
- Includes gentle teacher-like transitions (“Let’s imagine…”, “Did you know…?”, “Now let’s think about…”).
- **Do NOT use sound effects or onomatopoeia (e.g., “boing,” “zoom,” “pow”).**
- **Do NOT repeat words for dramatic effect (e.g., “straight, straight, straight”).**
- Keep playfulness through ideas and imagery, not noises.
3. Add exactly 2 learning questions inside the story to spark curiosity.
- The questions must feel natural within the flow of the explanation.
- They should be simple, open-ended questions a young child can think about.
- Do NOT place both questions back-to-back.
4. Make sure the script is vivid and engaging:
- Use simple imagery.
- Ask simple rhetorical questions.
- Use examples familiar to young children.
5. Avoid:
- Any reference to agents, prompts, or system instructions.
- Visual descriptions that don't make sense in audio (“look at this picture”).
- Overly long paragraphs—keep pacing steady for TTS.
6. Output only the final teaching script, nothing else. No labels, no titles, no markdown.
"""
def run_scriptwriter(explanation: str) -> str:
    """Ask Gemini to turn an ELI5 explanation into a spoken-style script.

    Args:
        explanation: The simplified explanation to build the script from.

    Returns:
        The generated text, or the fixed string "⚠️ ScriptWriter failed."
        when no text can be read from the response.
    """
    writer = genai.GenerativeModel(
        model_name="gemini-2.5-flash",
        system_instruction=SCRIPTWRITER_SYSTEM_PROMPT,
    )
    prompt = f"Write a children's story based on this:\n{explanation}"
    settings = genai.GenerationConfig(temperature=0.9, max_output_tokens=4096)
    reply = writer.generate_content(prompt, generation_config=settings)

    # Try the convenience accessor first, then the first part of the first
    # candidate; any failure moves on to the next accessor.
    accessors = (
        lambda r: r.text,
        lambda r: r.candidates[0].content.parts[0].text,
    )
    for read_text in accessors:
        try:
            return read_text(reply)
        except Exception:
            continue
    return "⚠️ ScriptWriter failed."
# --- Audio Generator ---
def chunk_text(text, max_chars=4500):
    """Split *text* into pieces of at most *max_chars* characters.

    Surrounding whitespace is stripped first. Text that already fits is
    returned as a single-element list (so empty input yields ``[""]``).
    Longer text is split, in order of preference:

    1. just after the last ". " sentence boundary inside the limit,
    2. at the last space inside the limit, so words are not cut in half,
    3. at exactly *max_chars* characters when neither exists.

    Args:
        text: The text to split.
        max_chars: Maximum length of each returned chunk; must be >= 1.

    Returns:
        A list of chunks, none longer than *max_chars*. No empty chunk is
        produced when the text had to be split.

    Raises:
        ValueError: If *max_chars* is smaller than 1.
    """
    if max_chars < 1:
        raise ValueError("max_chars must be at least 1")

    remaining = text.strip()
    if len(remaining) <= max_chars:
        return [remaining]

    chunks = []
    while len(remaining) > max_chars:
        sentence_end = remaining.rfind(". ", 0, max_chars)
        if sentence_end != -1:
            # Keep the period with the sentence it terminates.
            split_at = sentence_end + 1
        else:
            # A space sitting exactly at index max_chars still leaves a
            # chunk of max_chars characters, hence the +1 on the search end.
            space = remaining.rfind(" ", 0, max_chars + 1)
            split_at = space if space > 0 else max_chars
        chunks.append(remaining[:split_at].rstrip())
        remaining = remaining[split_at:].strip()

    if remaining:
        chunks.append(remaining)
    return chunks
def tts_segment(text):
    """Synthesize *text* with the module-level TTS client.

    Args:
        text: Plain text to speak.

    Returns:
        The ``audio_content`` of the synthesis response, requested as MP3.
    """
    request_fields = {
        "input": texttospeech.SynthesisInput(text=text),
        "voice": texttospeech.VoiceSelectionParams(
            name="en-US-Journey-F",
            language_code="en-US",
        ),
        "audio_config": texttospeech.AudioConfig(
            audio_encoding=texttospeech.AudioEncoding.MP3,
            volume_gain_db=0.0,
            pitch=0.0,
            speaking_rate=0.94,
        ),
    }
    result = tts_client.synthesize_speech(**request_fields)
    return result.audio_content
def audio_writer(script_text: str, out="story.mp3"):
    """Synthesize *script_text* chunk by chunk and export one MP3 file.

    The result starts with a short silence and has a short pause after each
    synthesized chunk.

    Args:
        script_text: The script to speak.
        out: Path of the MP3 file to write.

    Returns:
        *out*, the path that was written.

    Raises:
        ValueError: If *script_text* contains nothing to synthesize.
    """
    # Blank chunks carry nothing to speak, so drop them before calling TTS.
    chunks = [chunk for chunk in chunk_text(script_text) if chunk.strip()]
    if not chunks:
        raise ValueError("script_text is empty; nothing to synthesize.")

    audio = AudioSegment.silent(200)
    for chunk in chunks:
        # Decode straight from memory: fixed seg_N.mp3 names in the working
        # directory would collide between concurrent requests and were never
        # cleaned up.
        mp3_bytes = tts_segment(chunk)
        audio += AudioSegment.from_mp3(io.BytesIO(mp3_bytes))
        audio += AudioSegment.silent(150)

    # export() hands back the open output file; close it so the data is
    # flushed and the handle is released immediately.
    audio.export(out, format="mp3").close()
    return out
# --- Full Pipeline ---
async def full_pipeline(topic: str):
    """Run topic -> ELI5 explanation -> teaching script -> narrated MP3.

    Args:
        topic: The subject entered by the user.

    Returns:
        A tuple ``(eli5, script, audio_path)`` in the order the Gradio
        outputs expect.

    Raises:
        gr.Error: If *topic* is empty or only whitespace.
    """
    topic = (topic or "").strip()
    if not topic:
        raise gr.Error("Please enter a topic.")

    eli5 = await run_pedagogy_async(topic)

    # run_scriptwriter and audio_writer are synchronous and spend their time
    # on network calls and encoding; run them in a worker thread so this
    # coroutine does not stall the event loop for other requests.
    script = await asyncio.to_thread(run_scriptwriter, eli5)

    # Give every request its own output file; a shared "story.mp3" would be
    # overwritten by whichever request finishes last.
    fd, audio_path = tempfile.mkstemp(prefix="tinytutor_", suffix=".mp3")
    os.close(fd)
    await asyncio.to_thread(audio_writer, script, audio_path)
    return eli5, script, audio_path
# --- Gradio App ---
# One text input (the topic) feeds full_pipeline; its three return values
# fill the output components in the order listed.
_topic_box = gr.Textbox(label="Your Topic")
_result_widgets = [
    gr.Textbox(label="ELI5 Explanation", lines=8),
    gr.Textbox(label="Generated Story Script", lines=20),
    gr.Audio(label="Generated Audio"),
]
app = gr.Interface(
    title="🎧 TinyTutor — Full Pipeline",
    fn=full_pipeline,
    inputs=_topic_box,
    outputs=_result_widgets,
)

# Start the web UI only when this file is executed as a script.
if __name__ == "__main__":
    app.launch()