ndurner committed on
Commit
982628c
·
1 Parent(s): fbc8645

transcript caching

Browse files
Files changed (2) hide show
  1. Dockerfile +6 -0
  2. demo/problem_cell.py +32 -0
Dockerfile CHANGED
@@ -18,6 +18,12 @@ RUN apt-get update && \
18
  rm -rf /tmp/deno.zip && \
19
  rm -rf /var/lib/apt/lists/*
20
 
 
 
 
 
 
 
21
  COPY demo/requirements.txt demo/requirements.txt
22
  COPY mcp mcp
23
  RUN pip install --upgrade pip && \
 
18
  rm -rf /tmp/deno.zip && \
19
  rm -rf /var/lib/apt/lists/*
20
 
21
+ # populate cache; this works around HuggingFace egress restrictions
22
+ RUN curl -fsSL "https://ndurner.de/download/aileen3/aileen3-cache.zip" -o /tmp/aileen3-cache.zip && \
23
+ unzip -q /tmp/aileen3-cache.zip -d /tmp && \
24
+ mv /tmp/aileen3-cache /root/.cache/aileen3 && \
25
+ rm -rf /tmp/aileen3-cache.zip
26
+
27
  COPY demo/requirements.txt demo/requirements.txt
28
  COPY mcp mcp
29
  RUN pip install --upgrade pip && \
demo/problem_cell.py CHANGED
@@ -1,5 +1,8 @@
1
  from __future__ import annotations
2
 
 
 
 
3
  import tempfile
4
  from pathlib import Path
5
  from urllib.parse import parse_qs, urlparse
@@ -22,6 +25,13 @@ HERE = Path(__file__).parent
22
  ASSETS_DIR = HERE / "assets"
23
  DIGITALGIPFEL_IMG = ASSETS_DIR / "digitalgipfel.jpeg"
24
 
 
 
 
 
 
 
 
25
 
26
  def render_status_box(message: str, tone: str = "placeholder") -> str:
27
  tone_class = {
@@ -44,11 +54,26 @@ def _extract_video_id(video_url: str) -> str | None:
44
 
45
 
46
  def _fetch_transcript(video_url: str) -> tuple[str | None, str | None]:
 
 
47
  if YoutubeDL is None: # pragma: no cover - dependency should always be present
48
  return None, "yt-dlp is not installed in this environment."
49
  video_id = _extract_video_id(video_url)
50
  if not video_id:
51
  return None, "That does not look like a valid YouTube URL with a video id."
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  with tempfile.TemporaryDirectory() as tmpdir:
53
  output_template = str(Path(tmpdir) / "%(id)s.%(ext)s")
54
  ydl_opts = {
@@ -84,6 +109,13 @@ def _fetch_transcript(video_url: str) -> tuple[str | None, str | None]:
84
  readable = " ".join(text_chunks).strip()
85
  if not readable:
86
  return None, "Transcript was empty. Try again or choose another video."
 
 
 
 
 
 
 
87
  return readable, None
88
 
89
 
 
1
  from __future__ import annotations
2
 
3
+ import hashlib
4
+ import json
5
+ import os
6
  import tempfile
7
  from pathlib import Path
8
  from urllib.parse import parse_qs, urlparse
 
25
  ASSETS_DIR = HERE / "assets"
26
  DIGITALGIPFEL_IMG = ASSETS_DIR / "digitalgipfel.jpeg"
27
 
28
+ BASE_CACHE = Path(os.environ.get("AILEEN3_CACHE_DIR", Path.home() / ".cache" / "aileen3"))
29
+ TRANSCRIPTION_CACHE = BASE_CACHE / "transcription"
30
+
31
+
32
+ def _transcription_cache_path(reference: str) -> Path:
33
+ return TRANSCRIPTION_CACHE / f"{reference}.json"
34
+
35
 
36
  def render_status_box(message: str, tone: str = "placeholder") -> str:
37
  tone_class = {
 
54
 
55
 
56
  def _fetch_transcript(video_url: str) -> tuple[str | None, str | None]:
57
+ TRANSCRIPTION_CACHE.mkdir(parents=True, exist_ok=True)
58
+
59
  if YoutubeDL is None: # pragma: no cover - dependency should always be present
60
  return None, "yt-dlp is not installed in this environment."
61
  video_id = _extract_video_id(video_url)
62
  if not video_id:
63
  return None, "That does not look like a valid YouTube URL with a video id."
64
+
65
+ # Align cache layout with `media_tools`: transcription cache under BASE_CACHE/transcription
66
+ # using a stable reference derived from the YouTube video id when available.
67
+ reference = f"youtube_{hashlib.sha256(video_id.encode('utf-8')).hexdigest()[:32]}"
68
+ cache_path = _transcription_cache_path(reference)
69
+ if cache_path.exists():
70
+ try:
71
+ cached = json.loads(cache_path.read_text(encoding="utf-8"))
72
+ except Exception:
73
+ cached = None
74
+ if isinstance(cached, str) and cached.strip():
75
+ return cached, None
76
+
77
  with tempfile.TemporaryDirectory() as tmpdir:
78
  output_template = str(Path(tmpdir) / "%(id)s.%(ext)s")
79
  ydl_opts = {
 
109
  readable = " ".join(text_chunks).strip()
110
  if not readable:
111
  return None, "Transcript was empty. Try again or choose another video."
112
+
113
+ try:
114
+ cache_path.write_text(json.dumps(readable), encoding="utf-8")
115
+ except Exception:
116
+ # Cache failures should not block the happy path.
117
+ pass
118
+
119
  return readable, None
120
 
121