Spaces:
Running
Running
File size: 16,553 Bytes
3fb9411 0c163b8 3fb9411 0c163b8 3fb9411 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 |
from __future__ import annotations
import asyncio
import os
import sys
from pathlib import Path
import base64
from io import BytesIO
from typing import Tuple, Optional
import fastmcp
import gradio as gr
from PIL import Image
from layout import cell
from health import GEMINI_ENV_VAR
from problem_cell import render_status_box
from media_analysis_cell import (
ANALYSIS_VIDEO_URL,
_unwrap_tool_result,
_is_done,
_poll_until_done,
MAX_POLL_ATTEMPTS,
POLL_WAIT_SECONDS,
)
from slide_utils import normalize_slide_entries
from demo_logging import get_demo_logger, get_demo_log_path
log = get_demo_logger(__name__)
DEMO_LOG_PATH = str(get_demo_log_path())
LOG_HINT = f"See `{DEMO_LOG_PATH}` for details."
async def _run_translation_flow(
    gemini_api_key: str,
    language: str,
    slide_index_text: str,
) -> Tuple[str, str, Optional[Image.Image]]:
    """Drive the MCP tools to translate a particular slide from the fixed video.

    The flow reuses the same retrieval and slide-extraction pipeline as the
    expectation-driven analysis cell, then calls the dedicated `translate_slide`
    MCP tool to produce a target-language slide image.

    Args:
        gemini_api_key: Gemini key forwarded to the spawned MCP server process.
        language: Target language for the translated slide (free-form text).
        slide_index_text: Zero-based slide index exactly as typed by the user.

    Returns:
        ``(status_html, details_markdown, image)`` matching the Gradio outputs;
        ``image`` is ``None`` whenever the flow fails.
    """
    try:
        from fastmcp import Client  # type: ignore[import-untyped]
        from fastmcp.client.transports import StdioTransport  # type: ignore[import-untyped]
    except Exception as exc:  # pragma: no cover - defensive
        status = render_status_box(f"fastmcp is not available in this environment: {exc}", "fail")
        return status, "", None

    # --- Validate user input before spawning any subprocess. ---
    target_language = (language or "").strip()
    if not target_language:
        status = render_status_box("Please provide a target language for translation.", "fail")
        return status, "", None
    slide_index_raw = (slide_index_text or "").strip()
    try:
        target_index = int(slide_index_raw)
    except (TypeError, ValueError):
        log.warning(
            "Slide translation received non-integer slide number: %r", slide_index_text
        )
        status = render_status_box(
            "Slide number must be an integer (e.g. 0, 1, 2).",
            "fail",
        )
        return status, "", None
    if target_index < 0:
        log.warning("Slide translation received negative slide index: %d", target_index)
        status = render_status_box(
            "Slide number must be zero or a positive integer.",
            "fail",
        )
        return status, "", None

    log.info(
        "Slide translation demo start video=%s language=%s slide_index_raw=%s",
        ANALYSIS_VIDEO_URL,
        target_language,
        slide_index_text,
    )

    # Spawn the MCP server as a subprocess with PYTHONPATH pointing at
    # the local `mcp/src` tree so this stays self-contained in the Space.
    repo_root = Path(__file__).resolve().parents[1]
    mcp_src = repo_root / "mcp" / "src"
    existing_py_path = os.environ.get("PYTHONPATH", "")
    py_path = f"{mcp_src}{os.pathsep}{existing_py_path}" if existing_py_path else str(mcp_src)
    env = os.environ.copy()
    env["PYTHONPATH"] = py_path
    env[GEMINI_ENV_VAR] = gemini_api_key
    server_entry = ["-m", "aileen3_mcp.server"]
    log.info(
        "Slide translation demo spawning MCP server: cmd=%s args=%s PYTHONPATH=%s cwd=%s language=%s",
        sys.executable,
        server_entry,
        py_path,
        repo_root,
        target_language,
    )
    transport = StdioTransport(
        command=sys.executable,
        args=server_entry,
        env=env,
        cwd=str(repo_root),
    )
    async with Client(transport) as client:
        # Step 1: start media retrieval, then poll until it completes.
        retrieval_start = _unwrap_tool_result(
            await client.call_tool(
                "start_media_retrieval",
                {
                    "source": ANALYSIS_VIDEO_URL,
                    "prefer_audio_only": False,
                    "wait_seconds": POLL_WAIT_SECONDS,
                },
            )
        )
        if retrieval_start.get("is_error"):
            detail = retrieval_start.get("detail") or "Media retrieval failed."
            log.warning("Slide translation retrieval failed: %s", detail)
            status = render_status_box(detail, "fail")
            return status, "", None
        reference = retrieval_start.get("reference")
        if not reference:
            status = render_status_box(
                "Media retrieval did not return a reference token.", "fail"
            )
            log.warning("Slide translation missing reference for video=%s", ANALYSIS_VIDEO_URL)
            return status, "", None
        retrieval = retrieval_start
        if not _is_done(retrieval_start):
            retrieval = await _poll_until_done(
                client,
                tool_name="get_media_retrieval_status",
                reference=reference,
                wait_seconds=POLL_WAIT_SECONDS,
                max_attempts=MAX_POLL_ATTEMPTS,
            )
        if retrieval.get("is_error") or not _is_done(retrieval):
            detail = retrieval.get("detail") or retrieval.get("status") or "Retrieval incomplete."
            status = render_status_box(
                f"Media retrieval did not complete successfully: {detail}", "fail"
            )
            return status, "", None

        # Step 2: fetch slide stills; trigger extraction if none are cached.
        slides_result = _unwrap_tool_result(
            await client.call_tool(
                "get_extracted_slides",
                {
                    "reference": reference,
                    "wait_seconds": 0,
                },
            )
        )
        slides = normalize_slide_entries(slides_result)
        if not slides:
            # No cached slides yet; trigger extraction explicitly.
            extraction_start = _unwrap_tool_result(
                await client.call_tool(
                    "start_slide_extraction",
                    {
                        "reference": reference,
                        "wait_seconds": POLL_WAIT_SECONDS,
                    },
                )
            )
            if extraction_start.get("is_error"):
                detail = extraction_start.get("detail") or "Slide extraction failed to start."
                status = render_status_box(
                    f"Slide extraction did not complete successfully: {detail}", "fail"
                )
                log.error("Slide extraction failed reference=%s detail=%s", reference, detail)
                return status, "", None
            if not _is_done(extraction_start):
                extraction_done = await _poll_until_done(
                    client,
                    tool_name="get_extracted_slides",
                    reference=reference,
                    wait_seconds=POLL_WAIT_SECONDS,
                    max_attempts=MAX_POLL_ATTEMPTS,
                )
                # BUG FIX: the original called the undefined helper
                # `_slide_entries_from_result`, raising NameError whenever
                # extraction had to be polled; `normalize_slide_entries` is the
                # imported helper used everywhere else in this flow.
                if extraction_done.get("is_error") or not normalize_slide_entries(extraction_done):
                    detail = (
                        extraction_done.get("detail")
                        or extraction_done.get("status")
                        or "Slides not available."
                    )
                    status = render_status_box(
                        f"Slide extraction did not complete successfully: {detail}", "fail"
                    )
                    log.warning("Slide extraction still unavailable reference=%s detail=%s", reference, detail)
                    return status, "", None
                slides = normalize_slide_entries(extraction_done)
        if not slides:
            status = render_status_box(
                "No slides were detected for this video; nothing to translate.", "fail"
            )
            log.warning("Slide translation found no slides reference=%s", reference)
            return status, "", None
        if target_index >= len(slides):
            status = render_status_box(
                f"Slide number {target_index} is out of range; detected slides are indexed 0 to {len(slides) - 1}.",
                "fail",
            )
            log.warning(
                "Slide translation index out of range reference=%s requested=%d available=%d",
                reference,
                target_index,
                len(slides),
            )
            return status, "", None

        # Step 3: ask the server to translate the selected slide.
        translate_result = await client.call_tool(
            "translate_slide",
            {
                "reference": reference,
                "slide_index": target_index,
                "language": target_language,
            },
        )
        log.info("translate_slide raw result type=%s", type(translate_result))
        data: str | bytes | None = None
        mime_type: Optional[str] = None
        if isinstance(translate_result, fastmcp.client.client.CallToolResult):
            # First content item carries the (possibly base64-encoded) image.
            img = translate_result.content[0]
            data = img.data
            mime_type = img.mimeType
        else:
            # Old result format, possibly cached.
            # BUG FIX: the original used `log.exception` outside an except
            # block and passed `type(...)` as an extra positional argument
            # without a format placeholder, which made the logging call itself
            # fail; also dropped the stray `f` prefix on the placeholder-less
            # status string.
            log.error(
                "Failed to decode translated slide, type=%s", type(translate_result)
            )
            status = render_status_box(
                "Slide translation produced data, but the object type is unknown.",
                "fail",
            )
            details_md = f"Result type: {type(translate_result)}"
            return status, details_md, None

        # Step 4: decode the payload (data-URI, raw base64, or raw bytes).
        translated_image: Optional[Image.Image] = None
        image_bytes: Optional[bytes] = None
        if isinstance(data, str):
            if data.startswith("data:"):
                # data-URI form: "data:<mime>;base64,<payload>".
                try:
                    header, b64_part = data.split(",", 1)
                except ValueError:
                    b64_part = ""
                if not mime_type and ":" in header:
                    mime_type = header.split(";", 1)[0].split(":", 1)[1]
                if b64_part:
                    try:
                        image_bytes = base64.b64decode(b64_part)
                    except Exception:
                        image_bytes = None
            else:
                try:
                    image_bytes = base64.b64decode(data)
                except Exception:
                    image_bytes = None
        elif isinstance(data, bytes):
            image_bytes = data
        if image_bytes is not None and mime_type:
            try:
                # Copy so the image survives closing the decoder's stream.
                with Image.open(BytesIO(image_bytes)) as img:
                    translated_image = img.copy()
            except Exception as exc:
                log.exception("Failed to decode translated slide reference=%s: %s", reference, exc)
                status = render_status_box(
                    f"Slide translation produced data, but it could not be decoded. {LOG_HINT}",
                    "fail",
                )
                details_md = (
                    f"**Source video**: {ANALYSIS_VIDEO_URL}\n"
                    f"**Target language**: {target_language}\n"
                    f"Translated slide index: {target_index}\n"
                    f"Decoding failed. {LOG_HINT}"
                )
                return status, details_md, None
        else:
            log.warning(
                "Slide translation returned no image payload reference=%s data_type=%s mime=%s payload_type=%s",
                reference,
                type(data),
                mime_type,
                type(translate_result),
            )
            if isinstance(data, str):
                data_preview = f"data[:64]={data[:64]!r} len={len(data)}"
            elif isinstance(data, bytes):
                data_preview = f"bytes len={len(data)}"
            else:
                data_preview = f"type={type(data)}"
            status = render_status_box(
                f"Slide translation completed but returned no image payload. {LOG_HINT}",
                "fail",
            )
            details_md = (
                f"**Source video**: {ANALYSIS_VIDEO_URL}\n"
                f"**Target language**: {target_language}\n"
                f"Translated slide index: {target_index}\n"
                f"Gemini did not return an image payload. {data_preview}. {LOG_HINT}"
            )
            return status, details_md, None

        # BUG FIX: the original headline literal was split across two source
        # lines (a garbled emoji followed by a raw newline inside a single-line
        # string), which is a syntax error; restored as one f-string.
        headline = f"✅ Translated slide #{target_index} into {target_language}."
        status_html = render_status_box(headline, "success")
        details_md = ""
        log.info(
            "Slide translation success reference=%s language=%s slide_index=%d mime=%s",
            reference,
            target_language,
            target_index,
            mime_type,
        )
        return status_html, details_md, translated_image
def run_translation_demo(
    gemini_api_key: str | None,
    language: str,
    slide_index_text: str,
) -> Tuple[str, str, Optional[Image.Image]]:
    """Gradio callback entry point for the slide translation demo."""
    key = (gemini_api_key or "").strip()
    if not key:
        # Without a key there is nothing to do: return guidance instead of
        # spinning up the MCP server.
        return (
            render_status_box(
                "Please provide a Gemini API key in the setup cell above before running this demo.",
                "fail",
            ),
            (
                "The slide translation demo relies on Gemini via the Aileen MCP server. "
                "Set `GEMINI_API_KEY` in the setup cell, run the health check to verify it, "
                "then try this demo again."
            ),
            None,
        )
    try:
        return asyncio.run(_run_translation_flow(key, language, slide_index_text))
    except Exception as exc:  # pragma: no cover - defensive
        # Last-resort boundary: surface the failure in the UI rather than crash.
        log.exception("Slide translation demo failed: %s", exc)
        failure_status = render_status_box(f"Slide translation demo failed: {exc}", "fail")
        failure_details = (
            "Something went wrong while talking to the Aileen MCP media tools. "
            "Check the Space logs for more detail and ensure that ffmpeg, yt-dlp and Gemini "
            f"are all available. {LOG_HINT}"
        )
        return failure_status, failure_details, None
def render_translation_cell(gemini_key_input: gr.Textbox) -> None:
    """Render the notebook-style cell for slide translation.

    Builds the static explanation, the input widgets (fixed video URL, slide
    number, target language), the action button, and the three output
    components, then wires the button click to `run_translation_demo`.

    Args:
        gemini_key_input: Shared Gemini API key textbox from the setup cell;
            forwarded as the first input of the click callback.
    """
    # NOTE(review): several emoji in the UI strings below appear
    # mojibake-garbled in this copy of the source; they are preserved verbatim
    # here to avoid changing the rendered text — confirm against the original
    # file's UTF-8 encoding.
    with cell("π Translating slides for international briefings"):
        gr.Markdown(
            """
            ### π©π»βπ« Background
            Once an interesting talk has been analyzed, teams often want to **quickly pass on key slides to colleagues in other languages**. Instead
            of translating the whole video, Aileen 3 Core focuses on the most information-dense artefacts β slide stills β and uses Gemini to produce
            target-language variants that stay close to the original visual design.
            This fits the information-foraging mindset: first detect where the signal lives (through retrieval and analysis), then invest translation
            effort only on the pieces that are worth sharing.
            ### ππ»ββοΈ Demo
            In this cell we reuse the same **short, lecture-style GPT-OSS video** as in the analysis demo. The MCP server retrieves the video, extracts
            representative slides, and translates a selected slide into a language of your choice. The translated slide preview is shown so you can
            assess whether the result is good enough to forward to your team chat or notes.
            """
        )
        # Fixed source video shown read-only so the demo stays reproducible.
        gr.Textbox(
            label="YouTube video URL",
            value=ANALYSIS_VIDEO_URL,
            interactive=False,
        )
        slide_index_box = gr.Textbox(
            label="Slide number to translate (0 = first detected slide)",
            lines=1,
            value="0",
            placeholder="Integer index such as 0, 1, 2 β¦",
        )
        language_box = gr.Textbox(
            label="Target language for slide translation",
            lines=1,
            value="German",
            placeholder="e.g. German, English, French",
        )
        run_button = gr.Button("Translate slide", variant="primary")
        # Status banner starts as a neutral placeholder until the first run.
        result_panel = gr.HTML(
            value=render_status_box(
                "π Click the button to fetch the media, extract slides, and translate a representative slide into your chosen language.",
                "placeholder",
            )
        )
        details_markdown = gr.Markdown(visible=True)
        translated_image = gr.Image(
            label="Translated slide preview",
            interactive=False,
            type="pil",
        )
        # queue=False runs the callback directly instead of via the shared queue.
        run_button.click(
            fn=run_translation_demo,
            inputs=[gemini_key_input, language_box, slide_index_box],
            outputs=[result_panel, details_markdown, translated_image],
            queue=False,
        )
|