Spaces:
Running
Running
| from __future__ import annotations | |
| import asyncio | |
| import os | |
| import sys | |
| from pathlib import Path | |
| import base64 | |
| from io import BytesIO | |
| from typing import Tuple, Optional | |
| import fastmcp | |
| import gradio as gr | |
| from PIL import Image | |
| from layout import cell | |
| from health import GEMINI_ENV_VAR | |
| from problem_cell import render_status_box | |
| from media_analysis_cell import ( | |
| ANALYSIS_VIDEO_URL, | |
| _unwrap_tool_result, | |
| _is_done, | |
| _poll_until_done, | |
| MAX_POLL_ATTEMPTS, | |
| POLL_WAIT_SECONDS, | |
| ) | |
| from slide_utils import normalize_slide_entries | |
| from demo_logging import get_demo_logger, get_demo_log_path | |
# Module-wide logger plus the on-disk demo log location; LOG_HINT is appended
# to user-facing failure messages so people know where to look for details.
log = get_demo_logger(__name__)
DEMO_LOG_PATH = str(get_demo_log_path())
LOG_HINT = f"See `{DEMO_LOG_PATH}` for details."
async def _run_translation_flow(
    gemini_api_key: str,
    language: str,
    slide_index_text: str,
) -> Tuple[str, str, Optional[Image.Image]]:
    """Drive the MCP tools to translate a particular slide from the fixed video.

    The flow reuses the same retrieval and slide-extraction pipeline as the
    expectation-driven analysis cell, then calls the dedicated `translate_slide`
    MCP tool to produce a target-language slide image.

    Args:
        gemini_api_key: Gemini API key forwarded to the spawned MCP server.
        language: Target language for the translated slide (e.g. "German").
        slide_index_text: User-entered slide index; must parse as a
            non-negative integer.

    Returns:
        A ``(status_html, details_markdown, image)`` triple. ``image`` is a
        PIL image on success and ``None`` on every failure path.
    """
    try:
        # Imported lazily so this module still imports in environments
        # where fastmcp is not installed (we fail with a friendly status).
        from fastmcp import Client  # type: ignore[import-untyped]
        from fastmcp.client.transports import StdioTransport  # type: ignore[import-untyped]
    except Exception as exc:  # pragma: no cover - defensive
        status = render_status_box(f"fastmcp is not available in this environment: {exc}", "fail")
        return status, "", None

    # --- Validate user inputs before doing any expensive work. ---
    target_language = (language or "").strip()
    if not target_language:
        status = render_status_box("Please provide a target language for translation.", "fail")
        return status, "", None
    slide_index_raw = (slide_index_text or "").strip()
    try:
        target_index = int(slide_index_raw)
    except (TypeError, ValueError):
        log.warning(
            "Slide translation received non-integer slide number: %r", slide_index_text
        )
        status = render_status_box(
            "Slide number must be an integer (e.g. 0, 1, 2).",
            "fail",
        )
        return status, "", None
    if target_index < 0:
        log.warning("Slide translation received negative slide index: %d", target_index)
        status = render_status_box(
            "Slide number must be zero or a positive integer.",
            "fail",
        )
        return status, "", None

    log.info(
        "Slide translation demo start video=%s language=%s slide_index_raw=%s",
        ANALYSIS_VIDEO_URL,
        target_language,
        slide_index_text,
    )

    # Spawn the MCP server as a subprocess with PYTHONPATH pointing at
    # the local `mcp/src` tree so this stays self-contained in the Space.
    repo_root = Path(__file__).resolve().parents[1]
    mcp_src = repo_root / "mcp" / "src"
    existing_py_path = os.environ.get("PYTHONPATH", "")
    py_path = f"{mcp_src}{os.pathsep}{existing_py_path}" if existing_py_path else str(mcp_src)
    env = os.environ.copy()
    env["PYTHONPATH"] = py_path
    env[GEMINI_ENV_VAR] = gemini_api_key
    server_entry = ["-m", "aileen3_mcp.server"]
    log.info(
        "Slide translation demo spawning MCP server: cmd=%s args=%s PYTHONPATH=%s cwd=%s language=%s",
        sys.executable,
        server_entry,
        py_path,
        repo_root,
        target_language,
    )
    transport = StdioTransport(
        command=sys.executable,
        args=server_entry,
        env=env,
        cwd=str(repo_root),
    )

    async with Client(transport) as client:
        # --- Step 1: retrieve the media (download/caching handled server-side). ---
        retrieval_start = _unwrap_tool_result(
            await client.call_tool(
                "start_media_retrieval",
                {
                    "source": ANALYSIS_VIDEO_URL,
                    "prefer_audio_only": False,
                    "wait_seconds": POLL_WAIT_SECONDS,
                },
            )
        )
        if retrieval_start.get("is_error"):
            detail = retrieval_start.get("detail") or "Media retrieval failed."
            log.warning("Slide translation retrieval failed: %s", detail)
            status = render_status_box(detail, "fail")
            return status, "", None
        reference = retrieval_start.get("reference")
        if not reference:
            status = render_status_box(
                "Media retrieval did not return a reference token.", "fail"
            )
            log.warning("Slide translation missing reference for video=%s", ANALYSIS_VIDEO_URL)
            return status, "", None
        retrieval = retrieval_start
        if not _is_done(retrieval_start):
            retrieval = await _poll_until_done(
                client,
                tool_name="get_media_retrieval_status",
                reference=reference,
                wait_seconds=POLL_WAIT_SECONDS,
                max_attempts=MAX_POLL_ATTEMPTS,
            )
        if retrieval.get("is_error") or not _is_done(retrieval):
            detail = retrieval.get("detail") or retrieval.get("status") or "Retrieval incomplete."
            status = render_status_box(
                f"Media retrieval did not complete successfully: {detail}", "fail"
            )
            return status, "", None

        # --- Step 2: fetch cached slides, triggering extraction if needed. ---
        slides_result = _unwrap_tool_result(
            await client.call_tool(
                "get_extracted_slides",
                {
                    "reference": reference,
                    "wait_seconds": 0,
                },
            )
        )
        slides = normalize_slide_entries(slides_result)
        if not slides:
            # No cached slides yet; trigger extraction explicitly.
            extraction_start = _unwrap_tool_result(
                await client.call_tool(
                    "start_slide_extraction",
                    {
                        "reference": reference,
                        "wait_seconds": POLL_WAIT_SECONDS,
                    },
                )
            )
            if extraction_start.get("is_error"):
                detail = extraction_start.get("detail") or "Slide extraction failed to start."
                status = render_status_box(
                    f"Slide extraction did not complete successfully: {detail}", "fail"
                )
                log.error("Slide extraction failed reference=%s detail=%s", reference, detail)
                return status, "", None
            # BUG FIX: previously `extraction_done` was only assigned inside the
            # poll branch, so an immediately-done extraction hit an
            # UnboundLocalError below; default it to the start result.
            extraction_done = extraction_start
            if not _is_done(extraction_start):
                extraction_done = await _poll_until_done(
                    client,
                    tool_name="get_extracted_slides",
                    reference=reference,
                    wait_seconds=POLL_WAIT_SECONDS,
                    max_attempts=MAX_POLL_ATTEMPTS,
                )
            # BUG FIX: the original called an undefined `_slide_entries_from_result`;
            # the imported helper used everywhere else is `normalize_slide_entries`.
            if extraction_done.get("is_error") or not normalize_slide_entries(extraction_done):
                detail = (
                    extraction_done.get("detail")
                    or extraction_done.get("status")
                    or "Slides not available."
                )
                status = render_status_box(
                    f"Slide extraction did not complete successfully: {detail}", "fail"
                )
                log.warning("Slide extraction still unavailable reference=%s detail=%s", reference, detail)
                return status, "", None
            slides = normalize_slide_entries(extraction_done)
        if not slides:
            status = render_status_box(
                "No slides were detected for this video; nothing to translate.", "fail"
            )
            log.warning("Slide translation found no slides reference=%s", reference)
            return status, "", None
        if target_index >= len(slides):
            status = render_status_box(
                f"Slide number {target_index} is out of range; detected slides are indexed 0 to {len(slides) - 1}.",
                "fail",
            )
            log.warning(
                "Slide translation index out of range reference=%s requested=%d available=%d",
                reference,
                target_index,
                len(slides),
            )
            return status, "", None

        # --- Step 3: translate the selected slide. ---
        translate_result = await client.call_tool(
            "translate_slide",
            {
                "reference": reference,
                "slide_index": target_index,
                "language": target_language,
            },
        )
        log.info("translate_slide raw result type=%s", type(translate_result))
        data: Optional[str | bytes] = None
        mime_type: Optional[str] = None
        # Guard against an empty `content` list as well as an unexpected type.
        if (
            isinstance(translate_result, fastmcp.client.client.CallToolResult)
            and translate_result.content
        ):
            img = translate_result.content[0]
            data = img.data
            mime_type = img.mimeType
        else:
            # old format, possibly cached
            # BUG FIX: was `log.exception("…type=", type(...))` — an extra arg
            # with no %s placeholder, and exception() outside an except block.
            log.error(
                "Failed to decode translated slide, type=%s", type(translate_result)
            )
            status = render_status_box(
                "Slide translation produced data, but the object type is unknown.",
                "fail",
            )
            details_md = f"Result type: {type(translate_result)}"
            return status, details_md, None

        # --- Step 4: decode the payload into a PIL image. ---
        translated_image: Optional[Image.Image] = None
        image_bytes: Optional[bytes] = None
        if isinstance(data, str):
            if data.startswith("data:"):
                # BUG FIX: `split(",", 1)` left `header` unbound when the URI
                # had no comma; `partition` always yields a header.
                header, _, b64_part = data.partition(",")
                if not mime_type and ":" in header:
                    mime_type = header.split(";", 1)[0].split(":", 1)[1]
                if b64_part:
                    try:
                        image_bytes = base64.b64decode(b64_part)
                    except Exception:
                        image_bytes = None
            else:
                try:
                    image_bytes = base64.b64decode(data)
                except Exception:
                    image_bytes = None
        elif isinstance(data, bytes):
            image_bytes = data
        # PIL sniffs the image format itself, so a missing mimeType no longer
        # blocks decoding (previously bytes without a mime were discarded).
        if image_bytes is not None:
            try:
                with Image.open(BytesIO(image_bytes)) as img:
                    # Copy so the image survives the closed BytesIO buffer.
                    translated_image = img.copy()
            except Exception as exc:
                log.exception("Failed to decode translated slide reference=%s: %s", reference, exc)
                status = render_status_box(
                    f"Slide translation produced data, but it could not be decoded. {LOG_HINT}",
                    "fail",
                )
                details_md = (
                    f"**Source video**: {ANALYSIS_VIDEO_URL}\n"
                    f"**Target language**: {target_language}\n"
                    f"Translated slide index: {target_index}\n"
                    f"Decoding failed. {LOG_HINT}"
                )
                return status, details_md, None
        else:
            log.warning(
                "Slide translation returned no image payload reference=%s data_type=%s mime=%s payload_type=%s",
                reference,
                type(data),
                mime_type,
                type(translate_result),
            )
            if isinstance(data, str):
                data_preview = f"data[:64]={data[:64]!r} len={len(data)}"
            elif isinstance(data, bytes):
                data_preview = f"bytes len={len(data)}"
            else:
                data_preview = f"type={type(data)}"
            status = render_status_box(
                f"Slide translation completed but returned no image payload. {LOG_HINT}",
                "fail",
            )
            details_md = (
                f"**Source video**: {ANALYSIS_VIDEO_URL}\n"
                f"**Target language**: {target_language}\n"
                f"Translated slide index: {target_index}\n"
                f"Gemini did not return an image payload. {data_preview}. {LOG_HINT}"
            )
            return status, details_md, None

        # --- Success. ---
        headline = f"✅ Translated slide #{target_index} into {target_language}."
        status_html = render_status_box(headline, "success")
        # Previously this list was left empty (dead code); mirror the details
        # shown on the failure paths so success output is informative too.
        details_lines = [
            f"**Source video**: {ANALYSIS_VIDEO_URL}",
            f"**Target language**: {target_language}",
            f"Translated slide index: {target_index}",
        ]
        details_md = "\n".join(details_lines)
        log.info(
            "Slide translation success reference=%s language=%s slide_index=%d mime=%s",
            reference,
            target_language,
            target_index,
            mime_type,
        )
        return status_html, details_md, translated_image
def run_translation_demo(
    gemini_api_key: str | None,
    language: str,
    slide_index_text: str,
) -> Tuple[str, str, Optional[Image.Image]]:
    """Gradio callback entry point for the slide translation demo.

    Validates the API key, then runs the async MCP flow to completion on a
    fresh event loop. Any unexpected exception is converted into a
    user-facing failure status instead of propagating into Gradio.
    """
    api_key = (gemini_api_key or "").strip()

    # Guard clause: without a key the MCP server cannot call Gemini at all.
    if not api_key:
        missing_key_status = render_status_box(
            "Please provide a Gemini API key in the setup cell above before running this demo.",
            "fail",
        )
        missing_key_details = (
            "The slide translation demo relies on Gemini via the Aileen MCP server. "
            "Set `GEMINI_API_KEY` in the setup cell, run the health check to verify it, "
            "then try this demo again."
        )
        return missing_key_status, missing_key_details, None

    try:
        return asyncio.run(_run_translation_flow(api_key, language, slide_index_text))
    except Exception as exc:  # pragma: no cover - defensive
        log.exception("Slide translation demo failed: %s", exc)
        failure_status = render_status_box(f"Slide translation demo failed: {exc}", "fail")
        failure_details = (
            "Something went wrong while talking to the Aileen MCP media tools. "
            "Check the Space logs for more detail and ensure that ffmpeg, yt-dlp and Gemini "
            f"are all available. {LOG_HINT}"
        )
        return failure_status, failure_details, None
def render_translation_cell(gemini_key_input: gr.Textbox) -> None:
    """Render the notebook-style cell for slide translation.

    Builds the static explanation, the input widgets (slide index and target
    language), and wires the run button to `run_translation_demo`. The
    `gemini_key_input` textbox is owned by the setup cell and is only read
    here as a click-handler input.
    """
    # NOTE(review): several string literals below contain mojibake
    # (e.g. "π©π»βπ«") that look like mis-decoded emoji — confirm the
    # intended characters against the original file's encoding.
    with cell("π Translating slides for international briefings"):
        gr.Markdown(
            """
### π©π»βπ« Background
Once an interesting talk has been analyzed, teams often want to **quickly pass on key slides to colleagues in other languages**. Instead
of translating the whole video, Aileen 3 Core focuses on the most information-dense artefacts β slide stills β and uses Gemini to produce
target-language variants that stay close to the original visual design.
This fits the information-foraging mindset: first detect where the signal lives (through retrieval and analysis), then invest translation
effort only on the pieces that are worth sharing.
### ππ»ββοΈ Demo
In this cell we reuse the same **short, lecture-style GPT-OSS video** as in the analysis demo. The MCP server retrieves the video, extracts
representative slides, and translates a selected slide into a language of your choice. The translated slide preview is shown so you can
assess whether the result is good enough to forward to your team chat or notes.
"""
        )
        # Fixed demo video — read-only so the demo stays reproducible.
        gr.Textbox(
            label="YouTube video URL",
            value=ANALYSIS_VIDEO_URL,
            interactive=False,
        )
        # Free-text index; validated as a non-negative int in the callback.
        slide_index_box = gr.Textbox(
            label="Slide number to translate (0 = first detected slide)",
            lines=1,
            value="0",
            placeholder="Integer index such as 0, 1, 2 β¦",
        )
        language_box = gr.Textbox(
            label="Target language for slide translation",
            lines=1,
            value="German",
            placeholder="e.g. German, English, French",
        )
        run_button = gr.Button("Translate slide", variant="primary")
        # Placeholder status shown until the first run completes.
        result_panel = gr.HTML(
            value=render_status_box(
                "π Click the button to fetch the media, extract slides, and translate a representative slide into your chosen language.",
                "placeholder",
            )
        )
        details_markdown = gr.Markdown(visible=True)
        translated_image = gr.Image(
            label="Translated slide preview",
            interactive=False,
            type="pil",
        )
        # queue=False: run synchronously rather than through Gradio's queue.
        run_button.click(
            fn=run_translation_demo,
            inputs=[gemini_key_input, language_box, slide_index_box],
            outputs=[result_panel, details_markdown, translated_image],
            queue=False,
        )