# aileen3-core / demo / translation_cell.py
# Hugging Face file-view header: revision 0c163b8 ("add comments") by ndurner, 16.6 kB.
from __future__ import annotations
import asyncio
import os
import sys
from pathlib import Path
import base64
from io import BytesIO
from typing import Tuple, Optional
import fastmcp
import gradio as gr
from PIL import Image
from layout import cell
from health import GEMINI_ENV_VAR
from problem_cell import render_status_box
from media_analysis_cell import (
ANALYSIS_VIDEO_URL,
_unwrap_tool_result,
_is_done,
_poll_until_done,
MAX_POLL_ATTEMPTS,
POLL_WAIT_SECONDS,
)
from slide_utils import normalize_slide_entries
from demo_logging import get_demo_logger, get_demo_log_path
# Module-level logger for this demo cell, obtained from the demo logging helper.
log = get_demo_logger(__name__)
# String form of the path returned by get_demo_log_path().
DEMO_LOG_PATH = str(get_demo_log_path())
# Hint appended to user-facing failure messages, pointing at the demo log file.
LOG_HINT = f"See `{DEMO_LOG_PATH}` for details."
def _failure(
    message: str,
    details: str = "",
) -> Tuple[str, str, Optional[Image.Image]]:
    """Return the ``(status_html, details_md, image)`` triple for a failed run."""
    return render_status_box(message, "fail"), details, None


def _decode_image_payload(
    data: object,
    mime_type: Optional[str],
) -> Tuple[Optional[bytes], Optional[str]]:
    """Extract raw image bytes (and a MIME type) from a tool payload.

    ``data`` may be raw ``bytes``, a bare base64 string, or a ``data:`` URI.
    When ``mime_type`` is empty and ``data`` is a ``data:`` URI, the MIME type
    is taken from the URI header.

    Returns ``(image_bytes, mime_type)``. ``image_bytes`` is ``None`` when the
    payload is missing, of an unsupported type, or not valid base64.
    """
    if isinstance(data, bytes):
        return data, mime_type
    if not isinstance(data, str):
        return None, mime_type

    b64_part = data
    if data.startswith("data:"):
        # ``partition`` never raises, so ``header`` is always bound even when
        # the URI has no comma (in which case the base64 part is empty).
        header, _, b64_part = data.partition(",")
        if not mime_type and ":" in header:
            mime_type = header.split(";", 1)[0].split(":", 1)[1]
        if not b64_part:
            return None, mime_type

    try:
        return base64.b64decode(b64_part), mime_type
    except ValueError:
        # binascii.Error (bad padding / alphabet) is a ValueError subclass.
        return None, mime_type


async def _run_translation_flow(
    gemini_api_key: str,
    language: str,
    slide_index_text: str,
) -> Tuple[str, str, Optional[Image.Image]]:
    """Drive the MCP tools to translate a particular slide from the fixed video.

    The flow reuses the same retrieval and slide-extraction pipeline as the
    expectation-driven analysis cell, then calls the dedicated `translate_slide`
    MCP tool to produce a target-language slide image.

    Args:
        gemini_api_key: API key exported to the MCP server subprocess under
            ``GEMINI_ENV_VAR``.
        language: Target language; surrounding whitespace is stripped and an
            empty value is rejected.
        slide_index_text: Zero-based slide index as text; must parse as a
            non-negative integer and be within the number of detected slides.

    Returns:
        ``(status_html, details_md, image)``. ``image`` is a PIL image on
        success and ``None`` on every failure path.

    Exceptions raised by the MCP client or the helper calls are not caught
    here; they propagate to the caller.
    """
    try:
        from fastmcp import Client  # type: ignore[import-untyped]
        from fastmcp.client.transports import StdioTransport  # type: ignore[import-untyped]
    except Exception as exc:  # pragma: no cover - defensive
        return _failure(f"fastmcp is not available in this environment: {exc}")

    target_language = (language or "").strip()
    if not target_language:
        return _failure("Please provide a target language for translation.")

    slide_index_raw = (slide_index_text or "").strip()
    try:
        target_index = int(slide_index_raw)
    except (TypeError, ValueError):
        log.warning(
            "Slide translation received non-integer slide number: %r", slide_index_text
        )
        return _failure("Slide number must be an integer (e.g. 0, 1, 2).")
    if target_index < 0:
        log.warning("Slide translation received negative slide index: %d", target_index)
        return _failure("Slide number must be zero or a positive integer.")

    log.info(
        "Slide translation demo start video=%s language=%s slide_index_raw=%s",
        ANALYSIS_VIDEO_URL,
        target_language,
        slide_index_text,
    )

    # Spawn the MCP server as a subprocess with PYTHONPATH pointing at
    # the local `mcp/src` tree so this stays self-contained in the Space.
    repo_root = Path(__file__).resolve().parents[1]
    mcp_src = repo_root / "mcp" / "src"
    existing_py_path = os.environ.get("PYTHONPATH", "")
    py_path = f"{mcp_src}{os.pathsep}{existing_py_path}" if existing_py_path else str(mcp_src)
    env = os.environ.copy()
    env["PYTHONPATH"] = py_path
    env[GEMINI_ENV_VAR] = gemini_api_key
    server_entry = ["-m", "aileen3_mcp.server"]
    log.info(
        "Slide translation demo spawning MCP server: cmd=%s args=%s PYTHONPATH=%s cwd=%s language=%s",
        sys.executable,
        server_entry,
        py_path,
        repo_root,
        target_language,
    )
    transport = StdioTransport(
        command=sys.executable,
        args=server_entry,
        env=env,
        cwd=str(repo_root),
    )

    async with Client(transport) as client:
        # --- 1. Media retrieval -------------------------------------------
        retrieval = _unwrap_tool_result(
            await client.call_tool(
                "start_media_retrieval",
                {
                    "source": ANALYSIS_VIDEO_URL,
                    "prefer_audio_only": False,
                    "wait_seconds": POLL_WAIT_SECONDS,
                },
            )
        )
        if retrieval.get("is_error"):
            detail = retrieval.get("detail") or "Media retrieval failed."
            log.warning("Slide translation retrieval failed: %s", detail)
            return _failure(detail)

        reference = retrieval.get("reference")
        if not reference:
            log.warning("Slide translation missing reference for video=%s", ANALYSIS_VIDEO_URL)
            return _failure("Media retrieval did not return a reference token.")

        if not _is_done(retrieval):
            retrieval = await _poll_until_done(
                client,
                tool_name="get_media_retrieval_status",
                reference=reference,
                wait_seconds=POLL_WAIT_SECONDS,
                max_attempts=MAX_POLL_ATTEMPTS,
            )
        if retrieval.get("is_error") or not _is_done(retrieval):
            detail = retrieval.get("detail") or retrieval.get("status") or "Retrieval incomplete."
            return _failure(f"Media retrieval did not complete successfully: {detail}")

        # --- 2. Slide extraction (cached result first) --------------------
        slides_result = _unwrap_tool_result(
            await client.call_tool(
                "get_extracted_slides",
                {
                    "reference": reference,
                    "wait_seconds": 0,
                },
            )
        )
        slides = normalize_slide_entries(slides_result)
        if not slides:
            # No cached slides yet; trigger extraction explicitly.
            extraction = _unwrap_tool_result(
                await client.call_tool(
                    "start_slide_extraction",
                    {
                        "reference": reference,
                        "wait_seconds": POLL_WAIT_SECONDS,
                    },
                )
            )
            if extraction.get("is_error"):
                detail = extraction.get("detail") or "Slide extraction failed to start."
                log.error("Slide extraction failed reference=%s detail=%s", reference, detail)
                return _failure(f"Slide extraction did not complete successfully: {detail}")

            # Start from the start-call result so the variable is always bound,
            # and only poll when the start call did not already finish the job.
            if not _is_done(extraction):
                extraction = await _poll_until_done(
                    client,
                    tool_name="get_extracted_slides",
                    reference=reference,
                    wait_seconds=POLL_WAIT_SECONDS,
                    max_attempts=MAX_POLL_ATTEMPTS,
                )
            slides = normalize_slide_entries(extraction)
            if extraction.get("is_error") or not slides:
                detail = (
                    extraction.get("detail")
                    or extraction.get("status")
                    or "Slides not available."
                )
                log.warning(
                    "Slide extraction still unavailable reference=%s detail=%s", reference, detail
                )
                return _failure(f"Slide extraction did not complete successfully: {detail}")

        # Defensive guard so the range check below never sees an empty list.
        if not slides:
            log.warning("Slide translation found no slides reference=%s", reference)
            return _failure("No slides were detected for this video; nothing to translate.")

        if target_index >= len(slides):
            log.warning(
                "Slide translation index out of range reference=%s requested=%d available=%d",
                reference,
                target_index,
                len(slides),
            )
            return _failure(
                f"Slide number {target_index} is out of range; detected slides are indexed 0 to {len(slides) - 1}."
            )

        # --- 3. Translation -----------------------------------------------
        translate_result = await client.call_tool(
            "translate_slide",
            {
                "reference": reference,
                "slide_index": target_index,
                "language": target_language,
            },
        )
        log.info("translate_slide raw result type=%s", type(translate_result))

    # --- 4. Decode the returned image payload -----------------------------
    if not isinstance(translate_result, fastmcp.client.client.CallToolResult):
        # old format, possibly cached
        log.error("Failed to decode translated slide, type=%s", type(translate_result))
        return _failure(
            "Slide translation produced data, but the object type is unknown.",
            f"Result type: {type(translate_result)}",
        )

    # An empty content list, or a first item without image fields, falls
    # through to the "no image payload" branch instead of raising.
    content = translate_result.content
    first_item = content[0] if content else None
    data = getattr(first_item, "data", None)
    mime_type: Optional[str] = getattr(first_item, "mimeType", None)

    image_bytes, mime_type = _decode_image_payload(data, mime_type)
    if image_bytes is None or not mime_type:
        log.warning(
            "Slide translation returned no image payload reference=%s data_type=%s mime=%s payload_type=%s",
            reference,
            type(data),
            mime_type,
            type(translate_result),
        )
        if isinstance(data, str):
            data_preview = f"data[:64]={data[:64]!r} len={len(data)}"
        elif isinstance(data, bytes):
            data_preview = f"bytes len={len(data)}"
        else:
            data_preview = f"type={type(data)}"
        details_md = (
            f"**Source video**: {ANALYSIS_VIDEO_URL}\n"
            f"**Target language**: {target_language}\n"
            f"Translated slide index: {target_index}\n"
            f"Gemini did not return an image payload. {data_preview}. {LOG_HINT}"
        )
        return _failure(
            f"Slide translation completed but returned no image payload. {LOG_HINT}",
            details_md,
        )

    try:
        # Copy out of the context manager so the returned image does not
        # depend on the (closed) BytesIO-backed file object.
        with Image.open(BytesIO(image_bytes)) as img:
            translated_image = img.copy()
    except Exception as exc:
        log.exception("Failed to decode translated slide reference=%s: %s", reference, exc)
        details_md = (
            f"**Source video**: {ANALYSIS_VIDEO_URL}\n"
            f"**Target language**: {target_language}\n"
            f"Translated slide index: {target_index}\n"
            f"Decoding failed. {LOG_HINT}"
        )
        return _failure(
            f"Slide translation produced data, but it could not be decoded. {LOG_HINT}",
            details_md,
        )

    headline = f"✅ Translated slide #{target_index} into {target_language}."
    status_html = render_status_box(headline, "success")
    # No extra details are shown on success.
    details_md = ""
    log.info(
        "Slide translation success reference=%s language=%s slide_index=%d mime=%s",
        reference,
        target_language,
        target_index,
        mime_type,
    )
    return status_html, details_md, translated_image
def run_translation_demo(
    gemini_api_key: str | None,
    language: str,
    slide_index_text: str,
) -> Tuple[str, str, Optional[Image.Image]]:
    """Synchronous Gradio callback that runs the slide translation flow.

    Returns a ``(status_html, details_md, image)`` triple. A missing or blank
    API key short-circuits with a setup hint; any exception raised by the
    async flow is logged and reported as a failure instead of propagating.
    """
    api_key = gemini_api_key.strip() if gemini_api_key else ""

    if api_key:
        try:
            flow = _run_translation_flow(api_key, language, slide_index_text)
            return asyncio.run(flow)
        except Exception as exc:  # pragma: no cover - defensive
            log.exception("Slide translation demo failed: %s", exc)
            failure_status = render_status_box(
                f"Slide translation demo failed: {exc}", "fail"
            )
            failure_details = (
                "Something went wrong while talking to the Aileen MCP media tools. "
                "Check the Space logs for more detail and ensure that ffmpeg, yt-dlp and Gemini "
                f"are all available. {LOG_HINT}"
            )
            return failure_status, failure_details, None

    # No usable key: explain how to provide one rather than starting the flow.
    missing_key_status = render_status_box(
        "Please provide a Gemini API key in the setup cell above before running this demo.",
        "fail",
    )
    missing_key_details = (
        "The slide translation demo relies on Gemini via the Aileen MCP server. "
        "Set `GEMINI_API_KEY` in the setup cell, run the health check to verify it, "
        "then try this demo again."
    )
    return missing_key_status, missing_key_details, None
def render_translation_cell(gemini_key_input: gr.Textbox) -> None:
    """Render the notebook-style cell for slide translation.

    Builds the explanatory Markdown, a read-only box showing the fixed video
    URL, inputs for slide index and target language, and the output widgets
    (status HTML, details Markdown, translated image preview). The button is
    wired to ``run_translation_demo``.

    Args:
        gemini_key_input: Textbox holding the Gemini API key; passed as the
            first input of the click handler.
    """
    with cell("🌍 Translating slides for international briefings"):
        gr.Markdown(
            """
            ### 👩🏻‍🏫 Background
            Once an interesting talk has been analyzed, teams often want to **quickly pass on key slides to colleagues in other languages**. Instead
            of translating the whole video, Aileen 3 Core focuses on the most information-dense artefacts – slide stills – and uses Gemini to produce
            target-language variants that stay close to the original visual design.
            This fits the information-foraging mindset: first detect where the signal lives (through retrieval and analysis), then invest translation
            effort only on the pieces that are worth sharing.
            ### 💁🏻‍♀️ Demo
            In this cell we reuse the same **short, lecture-style GPT-OSS video** as in the analysis demo. The MCP server retrieves the video, extracts
            representative slides, and translates a selected slide into a language of your choice. The translated slide preview is shown so you can
            assess whether the result is good enough to forward to your team chat or notes.
            """
        )
        # Read-only display of the fixed source video used by the demo.
        gr.Textbox(
            label="YouTube video URL",
            value=ANALYSIS_VIDEO_URL,
            interactive=False,
        )
        slide_index_box = gr.Textbox(
            label="Slide number to translate (0 = first detected slide)",
            lines=1,
            value="0",
            placeholder="Integer index such as 0, 1, 2 …",
        )
        language_box = gr.Textbox(
            label="Target language for slide translation",
            lines=1,
            value="German",
            placeholder="e.g. German, English, French",
        )
        run_button = gr.Button("Translate slide", variant="primary")
        result_panel = gr.HTML(
            value=render_status_box(
                "👉 Click the button to fetch the media, extract slides, and translate a representative slide into your chosen language.",
                "placeholder",
            )
        )
        details_markdown = gr.Markdown(visible=True)
        translated_image = gr.Image(
            label="Translated slide preview",
            interactive=False,
            type="pil",
        )
        # Input order must match run_translation_demo's parameters:
        # (gemini_api_key, language, slide_index_text).
        run_button.click(
            fn=run_translation_demo,
            inputs=[gemini_key_input, language_box, slide_index_box],
            outputs=[result_panel, details_markdown, translated_image],
            queue=False,
        )