Spaces:

Rsnarsna
/

transcript

Sleeping

App Files Files Community

rsnarsna committed on May 31

Commit

0bcc65e

1 Parent(s): eb0770c

feat: Enhance transcript extraction with multi-tier fallback; add YouTube API support and update UI to display extraction method

Browse files

Files changed (7) hide show

Google_oauth_token.json +1 -1
Google_oauth_token1.json +1 -0
app.py +64 -55
gemini_transcript.py +402 -30
index.html +6 -1
oauth_states.json +1 -0
requirements.txt +2 -1

Google_oauth_token.json CHANGED Viewed

@@ -1 +1 @@

- {"token": "ya29.a0AQvPyIPFC0StrVIncchRY249KeicYmMTdPM-ICSViACoeC7axklt6_pagQJFmnXAaxlQAb1pUi7DR0O0D3VGSYA3XipKuxFUZ0F8OGjpkbcqyrs3sWwEz-FrFDk6Qz_0vSl0Vf0JYVyin-w9pEOui5lLrGaz1kY8ZQOfTuq7dzS5A5rm0WhCOMZwoW3XvFG2BqEk1GMaCgYKAX4SARcSFQHGX2MiUQek-j_qAtSy-2Rt5wEdbA0206", "refresh_token": "1//~~0gu9~~-~~I_aiMWIFCgYIARAAGBASNwF~~-~~L9IrcIS5BPy6qGVLHH1C0EyEF612MzfTDW3_unszWw60sjA4c64i1jKfshLxfxvNnn2i1X0~~", "token_uri": "https://oauth2.googleapis.com/token", "client_id": "769133159215-9gbq0l5v49kmclfcq7vbq7tutck0aphd.apps.googleusercontent.com", "client_secret": "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", "scopes": ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/gmail.send", "https://www.googleapis.com/auth/drive.file"], "universe_domain": "googleapis.com", "account": "", "expiry": "2026-05-~~31T11~~:38:~~17.225953Z~~"}

+ {"token": "ya29.a0AQvPyIO26RTYjrTK11YqxleX0yEhb_vlrw0TChQxwxTP2GWBonDMQonUUdknaad1vpWBNMhMOrD0Mbw9pNon3W20odwEFIyiPcXX0DRC07hrmbPIiUN4R9hlbl5H_gZdBMfa6oHoBIAb358uMxWCtVoawWEuKAm_XZrZhIsEG8xlXSLY5e_Mi50nu77y09IYASOHe2QaCgYKAacSARcSFQHGX2MiZB9Z5G4jtAvKppNhrfPtKA0206", "refresh_token": "1//0gyGUH_G9f9CbCgYIARAAGBASNwF-L9Irnwc3yS0FAs7ocMc8Vtmxu-C3GbrkS_deBoCRToBbEBl0vkRHEjVWmIHw2EZ6RpjFGX8", "token_uri": "https://oauth2.googleapis.com/token", "client_id": "769133159215-9gbq0l5v49kmclfcq7vbq7tutck0aphd.apps.googleusercontent.com", "client_secret": "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", "scopes": ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/gmail.send", "https://www.googleapis.com/auth/drive.file"], "universe_domain": "googleapis.com", "account": "", "expiry": "2026-05-31T12:33:45Z"}

Google_oauth_token1.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"token": "ya29.a0AQvPyIPFC0StrVIncchRY249KeicYmMTdPM-ICSViACoeC7axklt6_pagQJFmnXAaxlQAb1pUi7DR0O0D3VGSYA3XipKuxFUZ0F8OGjpkbcqyrs3sWwEz-FrFDk6Qz_0vSl0Vf0JYVyin-w9pEOui5lLrGaz1kY8ZQOfTuq7dzS5A5rm0WhCOMZwoW3XvFG2BqEk1GMaCgYKAX4SARcSFQHGX2MiUQek-j_qAtSy-2Rt5wEdbA0206", "refresh_token": "1//0gu9-I_aiMWIFCgYIARAAGBASNwF-L9IrcIS5BPy6qGVLHH1C0EyEF612MzfTDW3_unszWw60sjA4c64i1jKfshLxfxvNnn2i1X0", "token_uri": "https://oauth2.googleapis.com/token", "client_id": "769133159215-9gbq0l5v49kmclfcq7vbq7tutck0aphd.apps.googleusercontent.com", "client_secret": "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", "scopes": ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/gmail.send", "https://www.googleapis.com/auth/drive.file"], "universe_domain": "googleapis.com", "account": "", "expiry": "2026-05-31T11:38:17.225953Z"}

app.py CHANGED Viewed

@@ -43,6 +43,7 @@ SCOPES = [
     "https://www.googleapis.com/auth/spreadsheets",
     "https://www.googleapis.com/auth/gmail.send",
     "https://www.googleapis.com/auth/drive.file",
 ]
 SHEETS_HEADERS = [
@@ -51,15 +52,16 @@ SHEETS_HEADERS = [
     "Video Title",           # C
     "YouTube URL",           # D
     "Model Used",            # E
-    "Status",                # F
-    "Summary Drive Link",    # G
-    "Q&A Drive Link",        # H
-    "Transcript Drive Link", # I
-    "Email Sent To",         # J
-    "Email Status",          # K
-    "Email Message ID",      # L
-    "Completed At",          # M
-    "Error",                 # N
 ]
@@ -493,15 +495,16 @@ def _create_sheet_record(
             "",          # C — Video Title
             youtube_url, # D — YouTube URL
             "",          # E — Model Used
-            "initiated", # F — Status
-            "",          # G — Summary Link
-            "",          # H — Q&A Link
-            "",          # I — Transcript Link
-            email_to,    # J — Email Sent To
-            "",          # K — Email Status
-            "",          # L — Email Message ID
-            "",          # M — Completed At
-            "",          # N — Error
         ]
         append_sheet(DEFAULT_SPREADSHEET_ID, "Sheet1!A1", [row], creds=creds)
     except Exception as exc:
@@ -511,16 +514,17 @@ def _create_sheet_record(
 def _update_sheet_record(
     job_id: str,
     creds: Credentials,
-    video_title: str     = "",
-    model_used: str      = "",
-    status: str          = "",
-    summary_link: str    = "",
-    qa_link: str         = "",
-    transcript_link: str = "",
-    email_status: str    = "",
-    email_msg_id: str    = "",
-    completed_at: str    = "",
-    error: str           = "",
 ) -> None:
     """Find job row by job_id and overwrite with updated values."""
     if not DEFAULT_SPREADSHEET_ID:
@@ -533,10 +537,10 @@ def _update_sheet_record(
         existing     = read_sheet(
             DEFAULT_SPREADSHEET_ID,
-            f"Sheet1!A{row_num}:N{row_num}",
             creds=creds,
         )
-        existing_row = existing[0] if existing else [""] * 14
         def _v(new: str, idx: int) -> str:
             return new if new != "" else (
@@ -544,25 +548,26 @@ def _update_sheet_record(
             )
         updated_row = [
-            _v("",              0),  # A — Timestamp       (immutable)
-            job_id,                  # B — Job ID          (immutable)
-            _v(video_title,     2),  # C — Video Title
-            _v("",              3),  # D — YouTube URL     (immutable)
-            _v(model_used,      4),  # E — Model Used
-            _v(status,          5),  # F — Status
-            _v(summary_link,    6),  # G — Summary Link
-            _v(qa_link,         7),  # H — Q&A Link
-            _v(transcript_link, 8),  # I — Transcript Link
-            _v("",              9),  # J — Email Sent To   (immutable)
-            _v(email_status,   10),  # K — Email Status
-            _v(email_msg_id,   11),  # L — Email Message ID
-            _v(completed_at,   12),  # M — Completed At
-            _v(error,          13),  # N — Error
         ]
         write_sheet(
             DEFAULT_SPREADSHEET_ID,
-            f"Sheet1!A{row_num}:N{row_num}",
             [updated_row],
             creds=creds,
         )
@@ -899,12 +904,14 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
             pipeline   = TranscriptSummaryPipeline(
                 youtube_url=youtube_url,
                 languages=["en", "en-US", "en-GB"],
             )
-            transcript = pipeline.fetcher.run()
             _set_step(job_id, "fetch_transcript", "done")
             _update_sheet_record(
                 job_id, creds,
                 video_title=pipeline.video_title,
                 status="transcript_ready",
             )
         except Exception as exc:
@@ -981,15 +988,16 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
 Your YouTube video has been processed successfully.
-🎥 Title      : {video_title}
-🔗 Video URL  : {youtube_url}
-📄 Summary    : {summary_link}
-❓ Q&A         : {qa_link}
-📝 Transcript : {transcript_link}
 ────────────────────────────────
-Model Used : {model_used}
 ────────────────────────────────
 Regards,
@@ -1034,9 +1042,10 @@ Google Integration API
             status="completed",
             completed_at=completed_at,
             result={
-                "video_title": video_title,
-                "youtube_url": youtube_url,
-                "model_used":  model_used,
                 "drive": {
                     "folder_id":  folder_id,
                     "summary": {

     "https://www.googleapis.com/auth/spreadsheets",
     "https://www.googleapis.com/auth/gmail.send",
     "https://www.googleapis.com/auth/drive.file",
+    "https://www.googleapis.com/auth/youtube.force-ssl",
 ]
 SHEETS_HEADERS = [
     "Video Title",           # C
     "YouTube URL",           # D
     "Model Used",            # E
+    "Transcript Method",     # F
+    "Status",                # G
+    "Summary Drive Link",    # H
+    "Q&A Drive Link",        # I
+    "Transcript Drive Link", # J
+    "Email Sent To",         # K
+    "Email Status",          # L
+    "Email Message ID",      # M
+    "Completed At",          # N
+    "Error",                 # O
 ]
             "",          # C — Video Title
             youtube_url, # D — YouTube URL
             "",          # E — Model Used
+            "",          # F — Transcript Method
+            "initiated", # G — Status
+            "",          # H — Summary Link
+            "",          # I — Q&A Link
+            "",          # J — Transcript Link
+            email_to,    # K — Email Sent To
+            "",          # L — Email Status
+            "",          # M — Email Message ID
+            "",          # N — Completed At
+            "",          # O — Error
         ]
         append_sheet(DEFAULT_SPREADSHEET_ID, "Sheet1!A1", [row], creds=creds)
     except Exception as exc:
 def _update_sheet_record(
     job_id: str,
     creds: Credentials,
+    video_title: str        = "",
+    model_used: str         = "",
+    extraction_method: str  = "",
+    status: str             = "",
+    summary_link: str       = "",
+    qa_link: str            = "",
+    transcript_link: str    = "",
+    email_status: str       = "",
+    email_msg_id: str       = "",
+    completed_at: str       = "",
+    error: str              = "",
 ) -> None:
     """Find job row by job_id and overwrite with updated values."""
     if not DEFAULT_SPREADSHEET_ID:
         existing     = read_sheet(
             DEFAULT_SPREADSHEET_ID,
+            f"Sheet1!A{row_num}:O{row_num}",
             creds=creds,
         )
+        existing_row = existing[0] if existing else [""] * 15
         def _v(new: str, idx: int) -> str:
             return new if new != "" else (
             )
         updated_row = [
+            _v("",                 0),  # A — Timestamp          (immutable)
+            job_id,                     # B — Job ID             (immutable)
+            _v(video_title,        2),  # C — Video Title
+            _v("",                 3),  # D — YouTube URL        (immutable)
+            _v(model_used,         4),  # E — Model Used
+            _v(extraction_method,  5),  # F — Transcript Method
+            _v(status,             6),  # G — Status
+            _v(summary_link,       7),  # H — Summary Link
+            _v(qa_link,            8),  # I — Q&A Link
+            _v(transcript_link,    9),  # J — Transcript Link
+            _v("",                10),  # K — Email Sent To      (immutable)
+            _v(email_status,      11),  # L — Email Status
+            _v(email_msg_id,      12),  # M — Email Message ID
+            _v(completed_at,      13),  # N — Completed At
+            _v(error,             14),  # O — Error
         ]
         write_sheet(
             DEFAULT_SPREADSHEET_ID,
+            f"Sheet1!A{row_num}:O{row_num}",
             [updated_row],
             creds=creds,
         )
             pipeline   = TranscriptSummaryPipeline(
                 youtube_url=youtube_url,
                 languages=["en", "en-US", "en-GB"],
+                google_creds=creds,
             )
+            transcript, extraction_method = pipeline.fetcher.run()
             _set_step(job_id, "fetch_transcript", "done")
             _update_sheet_record(
                 job_id, creds,
                 video_title=pipeline.video_title,
+                extraction_method=extraction_method,
                 status="transcript_ready",
             )
         except Exception as exc:
 Your YouTube video has been processed successfully.
+🎥 Title              : {video_title}
+🔗 Video URL          : {youtube_url}
+📄 Summary            : {summary_link}
+❓ Q&A                 : {qa_link}
+📝 Transcript         : {transcript_link}
 ────────────────────────────────
+Model Used            : {model_used}
+Transcript Extraction : {extraction_method}
 ────────────────────────────────
 Regards,
             status="completed",
             completed_at=completed_at,
             result={
+                "video_title":       video_title,
+                "youtube_url":       youtube_url,
+                "model_used":        model_used,
+                "extraction_method": extraction_method,
                 "drive": {
                     "folder_id":  folder_id,
                     "summary": {

gemini_transcript.py CHANGED Viewed

@@ -127,6 +127,79 @@ def _format_duration(seconds: int) -> str:
     return f"{h}h {m}m" if m else f"{h}h"
 def fetch_video_title(video_id: str) -> str:
     """Fetch YouTube video title via oembed — no API key needed."""
     try:
@@ -151,8 +224,11 @@ def fetch_video_title(video_id: str) -> str:
 class YouTubeTranscriptFetcher:
     """
-    Fetches a YouTube transcript and returns it as a plain string.
-    No files are written to disk.
     """
     def __init__(
@@ -160,12 +236,14 @@ class YouTubeTranscriptFetcher:
         youtube_url: str,
         languages: Optional[List[str]] = None,
         polling_config: dict           = None,
     ):
         self.youtube_url    = youtube_url
         self.languages      = languages or ["en", "en-US", "en-GB"]
         self.polling_config = polling_config or POLLING_CONFIG
         self.video_id       = self._extract_video_id(youtube_url)
         self.api            = YouTubeTranscriptApi()
     @staticmethod
     def _extract_video_id(url: str) -> str:
@@ -186,10 +264,64 @@ class YouTubeTranscriptFetcher:
         transcript = self.api.fetch(self.video_id, languages=self.languages)
         return " ".join(item.text for item in transcript)
-    def run(self) -> str:
         """
-        Fetch transcript with polling retry.
-        Returns transcript as a string — nothing is written to disk.
         """
         logger.info("Video ID          : %s", self.video_id)
         logger.info("Polling attempts  : %d", len(self.polling_config))
@@ -209,34 +341,24 @@ class YouTubeTranscriptFetcher:
                 time.sleep(wait_before)
             logger.info(
-                "[%d/%d] %s — fetching transcript now...",
                 idx, len(attempts), description,
             )
             try:
-                text = self._fetch_once()
                 logger.info(
-                    "[%d/%d] ✅ Transcript fetched — %d characters",
-                    idx, len(attempts), len(text),
                 )
-                return text
-            except TranscriptsDisabled as e:
-                logger.warning("[%d/%d] Transcripts disabled: %s", idx, len(attempts), e)
-                raise  # no point retrying
-            except VideoUnavailable as e:
-                logger.warning("[%d/%d] Video unavailable: %s", idx, len(attempts), e)
-            except NoTranscriptFound as e:
-                logger.warning("[%d/%d] No transcript yet: %s", idx, len(attempts), e)
             except KeyboardInterrupt:
                 logger.warning("Interrupted by user.")
                 raise
             except Exception as e:
-                logger.exception("[%d/%d] Unexpected error: %s", idx, len(attempts), e)
             if idx < len(attempts):
                 next_cfg = attempts[idx][1]
@@ -255,6 +377,249 @@ class YouTubeTranscriptFetcher:
         )
 # ============================================================================
 # GEMINI SUMMARIZER
 # ============================================================================
@@ -375,6 +740,7 @@ class TranscriptSummaryPipeline:
     """
     Orchestrates fetch → summarize.
     All data flows in memory — no disk I/O.
     """
     def __init__(
@@ -382,12 +748,14 @@ class TranscriptSummaryPipeline:
         youtube_url: str,
         languages: Optional[List[str]] = None,
         polling_config: dict           = None,
     ):
         self.youtube_url = youtube_url
         self.fetcher     = YouTubeTranscriptFetcher(
             youtube_url=youtube_url,
             languages=languages,
             polling_config=polling_config,
         )
         self.summarizer  = GeminiSummarizer()
         self.video_id    = self.fetcher.video_id
@@ -397,18 +765,22 @@ class TranscriptSummaryPipeline:
         logger.info("=== Pipeline started ===")
         logger.info("Video title : %s", self.video_title)
-        transcript         = self.fetcher.run()
-        summary, qa, model = self.summarizer.run(transcript)
-        logger.info("=== Pipeline complete | model: %s ===", model)
         return {
-            "video_id":    self.video_id,
-            "video_title": self.video_title,
-            "model_used":  model,
-            "summary":     summary,
-            "qa":          qa,
-            "transcript":  transcript,
         }

     return f"{h}h {m}m" if m else f"{h}h"
+# ============================================================================
+# SUBTITLE PARSERS
+# ============================================================================
+def _parse_vtt(content: str) -> str:
+    """
+    Parse WebVTT subtitle content into clean plain text.
+    Drops the file header, cue identifiers, timing lines, cue-setting
+    lines and NOTE/STYLE/REGION markers, strips inline tags, decodes
+    character references (&amp; etc.) and deduplicates consecutive
+    identical lines (VTT scrolling captions repeat text).
+    Args:
+        content: Raw WebVTT document text.
+    Returns:
+        Caption text joined with single spaces ("" if nothing is left).
+    """
+    import html  # function-scope import; only this parser needs it
+    # Cue timing line. The hours field is optional in WebVTT, so both
+    # "00:00:01.000 --> ..." and "00:01.000 --> ..." must be recognised.
+    timing_re = re.compile(r"^(?:\d{2,}:)?\d{2}:\d{2}[.,]\d{3}\s*-->")
+    lines = [raw.strip() for raw in content.lstrip("\ufeff").splitlines()]
+    first_cue = next(
+        (i for i, ln in enumerate(lines) if timing_re.match(ln)), None
+    )
+    if first_cue is not None:
+        # Everything ahead of the first cue is header material (WEBVTT,
+        # Kind:/Language:, Style: plus its CSS, the first cue identifier),
+        # never caption text — drop it wholesale.
+        lines = lines[first_cue:]
+    text_lines: list[str] = []
+    prev_line = ""
+    for idx, stripped in enumerate(lines):
+        # Skip empty lines
+        if not stripped:
+            continue
+        # Skip timestamp lines (00:00:00.000 --> 00:00:05.000)
+        if timing_re.match(stripped):
+            continue
+        # Cue-less input: fall back to filtering header lines one by one
+        if first_cue is None and (
+            stripped.startswith("WEBVTT")
+            or re.match(r"^(Kind:|Language:|Style:)", stripped, re.IGNORECASE)
+        ):
+            continue
+        # Block keywords are upper-case and stand alone, so ordinary captions
+        # such as "Note that ..." or "Styles change" are kept.
+        if re.match(r"^(NOTE|STYLE|REGION)(\s|$)", stripped):
+            continue
+        # Skip position/alignment metadata
+        if re.match(r"^(position:|align:|line:|size:)", stripped, re.IGNORECASE):
+            continue
+        # A digits-only line is a cue identifier only when the timing line
+        # follows it directly; otherwise it is spoken text (e.g. "2024").
+        if stripped.isdigit():
+            following = lines[idx + 1] if idx + 1 < len(lines) else ""
+            if timing_re.match(following):
+                continue
+        # Strip inline tags like <c>, </c>, <00:00:01.234>, then decode
+        # character references (tags first, so escaped "&lt;" survives).
+        cleaned = html.unescape(re.sub(r"<[^>]+>", "", stripped)).strip()
+        if not cleaned:
+            continue
+        # Deduplicate consecutive identical lines
+        if cleaned != prev_line:
+            text_lines.append(cleaned)
+            prev_line = cleaned
+    return " ".join(text_lines)
+def _parse_srt(content: str) -> str:
+    """
+    Parse SRT subtitle content into clean plain text.
+    Strips sequence numbers, timing lines and HTML-style tags.
+    Args:
+        content: Raw SubRip document text (a leading BOM is tolerated).
+    Returns:
+        Caption text joined with single spaces ("" if nothing is left).
+    """
+    timing_re = re.compile(r"^\d{2,}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2,}:\d{2}")
+    # A BOM is not whitespace for str.strip(); left in place it would make
+    # the first sequence number fail isdigit() and leak into the text.
+    lines = [raw.strip() for raw in content.lstrip("\ufeff").splitlines()]
+    text_lines: list[str] = []
+    for idx, stripped in enumerate(lines):
+        if not stripped:
+            continue
+        # Skip timing lines
+        if timing_re.match(stripped):
+            continue
+        # A digits-only line is a sequence number only when the timing line
+        # follows it directly; otherwise it is caption text (e.g. "42").
+        if stripped.isdigit():
+            following = lines[idx + 1] if idx + 1 < len(lines) else ""
+            if timing_re.match(following):
+                continue
+        # Strip HTML-style tags
+        cleaned = re.sub(r"<[^>]+>", "", stripped).strip()
+        if cleaned:
+            text_lines.append(cleaned)
+    return " ".join(text_lines)
 def fetch_video_title(video_id: str) -> str:
     """Fetch YouTube video title via oembed — no API key needed."""
     try:
 class YouTubeTranscriptFetcher:
     """
+    Fetches a YouTube transcript using a multi-tier fallback strategy:
+      Tier 1: youtube_transcript_api (fast, works for most public videos)
+      Tier 2: yt-dlp (robust, handles auto-generated + manual subs)
+      Tier 3: YouTube Data API v3 (only for videos the user owns)
+    Returns (transcript_text, extraction_method) tuple.
     """
     def __init__(
         youtube_url: str,
         languages: Optional[List[str]] = None,
         polling_config: dict           = None,
+        google_creds                   = None,
     ):
         self.youtube_url    = youtube_url
         self.languages      = languages or ["en", "en-US", "en-GB"]
         self.polling_config = polling_config or POLLING_CONFIG
         self.video_id       = self._extract_video_id(youtube_url)
         self.api            = YouTubeTranscriptApi()
+        self.google_creds   = google_creds
     @staticmethod
     def _extract_video_id(url: str) -> str:
         transcript = self.api.fetch(self.video_id, languages=self.languages)
         return " ".join(item.text for item in transcript)
+    def _try_all_tiers(self) -> tuple[str, str]:
+        """
+        Walk the extraction tiers in priority order and stop at the first hit.
+        Order: youtube_transcript_api, then yt-dlp, then the YouTube Data
+        API v3 (attempted only when OAuth credentials were supplied).
+        Returns:
+            (transcript_text, method_used) for the first tier that succeeds.
+        Raises:
+            RuntimeError: no tier produced a transcript; the message lists
+                the reason recorded for each tier.
+        """
+        failures: list[str] = []
+        # ── Tier 1: youtube_transcript_api ──
+        try:
+            transcript = self._fetch_once()
+            logger.info("[Tier 1] ✅ youtube_transcript_api succeeded — %d chars", len(transcript))
+            return transcript, "youtube_transcript_api"
+        except TranscriptsDisabled as exc:
+            # Not treated as fatal: the later tiers may still find
+            # auto-generated subtitles for the same video.
+            failures.append(f"Tier1(TranscriptsDisabled): {exc}")
+            logger.warning("[Tier 1] Transcripts disabled, trying fallbacks...")
+        except Exception as exc:
+            failures.append(f"Tier1: {exc}")
+            logger.warning("[Tier 1] Failed: %s", exc)
+        # ── Tier 2: yt-dlp ──
+        try:
+            ytdlp_fetcher = YtDlpTranscriptFetcher(
+                self.video_id, languages=self.languages
+            )
+            transcript = ytdlp_fetcher.fetch()
+            logger.info("[Tier 2] ✅ yt-dlp succeeded — %d chars", len(transcript))
+            return transcript, "yt-dlp"
+        except Exception as exc:
+            failures.append(f"Tier2(yt-dlp): {exc}")
+            logger.warning("[Tier 2] Failed: %s", exc)
+        # ── Tier 3: YouTube Data API v3 (owned videos only) ──
+        if not self.google_creds:
+            failures.append("Tier3: Skipped (no OAuth credentials)")
+            logger.info("[Tier 3] Skipped — no Google OAuth credentials provided.")
+        else:
+            try:
+                api_fetcher = YouTubeApiTranscriptFetcher(
+                    self.video_id, self.google_creds, languages=self.languages
+                )
+                transcript = api_fetcher.fetch()
+                logger.info("[Tier 3] ✅ YouTube Data API v3 succeeded — %d chars", len(transcript))
+                return transcript, "youtube_data_api_v3"
+            except Exception as exc:
+                failures.append(f"Tier3(YT-API): {exc}")
+                logger.warning("[Tier 3] Failed: %s", exc)
+        # Nothing worked: surface every recorded reason in one message.
+        details = "; ".join(failures)
+        raise RuntimeError(
+            f"All transcript tiers failed for video {self.video_id}. "
+            f"Details: {details}"
+        )
+    def run(self) -> tuple[str, str]:
         """
+        Fetch transcript with polling retry and multi-tier fallback.
+        On each polling attempt, all tiers are tried before waiting.
+        Returns (transcript_text, extraction_method).
         """
         logger.info("Video ID          : %s", self.video_id)
         logger.info("Polling attempts  : %d", len(self.polling_config))
                 time.sleep(wait_before)
             logger.info(
+                "[%d/%d] %s — trying all transcript tiers...",
                 idx, len(attempts), description,
             )
             try:
+                text, method = self._try_all_tiers()
                 logger.info(
+                    "[%d/%d] ✅ Transcript fetched via %s — %d characters",
+                    idx, len(attempts), method, len(text),
                 )
+                return text, method
             except KeyboardInterrupt:
                 logger.warning("Interrupted by user.")
                 raise
             except Exception as e:
+                logger.warning("[%d/%d] All tiers failed: %s", idx, len(attempts), e)
             if idx < len(attempts):
                 next_cfg = attempts[idx][1]
         )
+# ============================================================================
+# TIER 2 — yt-dlp PYTHON API SUBTITLE FETCHER
+# ============================================================================
+class YtDlpTranscriptFetcher:
+    """
+    Tier 2 fallback — uses yt-dlp's Python API (no subprocess).
+    Extracts subtitle URLs from video metadata via extract_info(),
+    then fetches content in-memory via HTTP.
+    Handles both manual and auto-generated captions.
+    """
+    # Formats tried in this order for a matching language.
+    PREFERRED_FORMATS = ["vtt", "srt", "srv1", "srv2", "srv3", "ttml"]
+    def __init__(self, video_id: str, languages: Optional[List[str]] = None):
+        self.video_id  = video_id
+        self.languages = languages or ["en", "en-US", "en-GB"]
+    def _find_subtitle_url(self, manual_subs: dict, auto_subs: dict) -> tuple[str, str]:
+        """
+        Search manual subtitles first, then auto-generated, for a
+        matching language + preferred format.
+        Returns (url, format_ext).
+        Raises RuntimeError when no track with a URL exists for any of
+        the requested languages.
+        """
+        for subs_dict in (manual_subs, auto_subs):
+            if not subs_dict:
+                continue
+            for lang in self.languages:
+                tracks = subs_dict.get(lang)  # list of {ext, url, ...}
+                if not tracks:
+                    continue
+                # Try preferred formats in order
+                for fmt in self.PREFERRED_FORMATS:
+                    for track in tracks:
+                        if track.get("ext") == fmt and track.get("url"):
+                            return track["url"], fmt
+                # No preferred format matched — use first available with URL
+                for track in tracks:
+                    if track.get("url"):
+                        return track["url"], track.get("ext", "vtt")
+        raise RuntimeError(
+            f"No subtitles found in yt-dlp metadata for "
+            f"languages {self.languages} (video: {self.video_id})"
+        )
+    @staticmethod
+    def _markup_to_text(raw_content: str) -> str:
+        """
+        Reduce an XML/HTML subtitle document (srv1/srv2/srv3/ttml) to text.
+        Tags are replaced by a space rather than deleted so that adjacent
+        elements (<text>a</text><text>b</text>) do not fuse into one word;
+        character references are decoded and whitespace is collapsed.
+        """
+        import html  # function-scope import; only this helper needs it
+        spaced = re.sub(r"<[^>]+>", " ", raw_content)
+        return re.sub(r"\s+", " ", html.unescape(spaced)).strip()
+    def fetch(self) -> str:
+        """
+        Extract subtitle URL from video metadata, fetch content
+        in-memory via HTTP, parse and return as plain text.
+        No files are written to disk.
+        Raises RuntimeError when metadata extraction, the HTTP download,
+        or parsing yields nothing usable.
+        """
+        import yt_dlp
+        import requests as _requests
+        logger.info("[yt-dlp] Attempting in-memory subtitle extraction for %s", self.video_id)
+        url = f"https://www.youtube.com/watch?v={self.video_id}"
+        ydl_opts = {
+            "skip_download":    True,
+            "quiet":            True,
+            "no_warnings":      True,
+            "noplaylist":       True,
+            "extract_flat":     False,
+        }
+        try:
+            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+                info = ydl.extract_info(url, download=False)
+        except Exception as e:
+            # Chain explicitly so the yt-dlp traceback stays attached.
+            raise RuntimeError(f"yt-dlp extract_info failed: {e}") from e
+        if not info:
+            raise RuntimeError("yt-dlp returned empty info dict.")
+        manual_subs = info.get("subtitles") or {}
+        auto_subs   = info.get("automatic_captions") or {}
+        logger.info(
+            "[yt-dlp] Found %d manual sub tracks, %d auto-caption tracks",
+            len(manual_subs), len(auto_subs),
+        )
+        sub_url, sub_fmt = self._find_subtitle_url(manual_subs, auto_subs)
+        logger.info("[yt-dlp] Fetching subtitle content (format=%s)", sub_fmt)
+        # Fetch subtitle content in-memory
+        try:
+            resp = _requests.get(sub_url, timeout=30)
+            resp.raise_for_status()
+            raw_content = resp.text
+        except Exception as e:
+            raise RuntimeError(f"Failed to fetch subtitle from URL: {e}") from e
+        if not raw_content.strip():
+            raise RuntimeError("Subtitle URL returned empty content.")
+        # Parse based on format
+        if sub_fmt == "vtt":
+            text = _parse_vtt(raw_content)
+        elif sub_fmt == "srt":
+            text = _parse_srt(raw_content)
+        else:
+            # For srv1/srv2/srv3/ttml — generic markup stripping as fallback
+            text = self._markup_to_text(raw_content)
+        if not text.strip():
+            raise RuntimeError(
+                f"yt-dlp subtitle content was empty after parsing (format={sub_fmt})."
+            )
+        logger.info(
+            "[yt-dlp] ✅ Transcript extracted — %d characters (format=%s)",
+            len(text), sub_fmt,
+        )
+        return text
+# ============================================================================
+# TIER 3 — YOUTUBE DATA API v3 CAPTIONS FETCHER
+# ============================================================================
+class YouTubeApiTranscriptFetcher:
+    """
+    Fallback fetcher using the official YouTube Data API v3.
+    ⚠️  Only works for videos the authenticated user OWNS.
+    Requires OAuth credentials with youtube.force-ssl scope.
+    """
+    def __init__(self, video_id: str, credentials, languages: Optional[List[str]] = None):
+        self.video_id    = video_id
+        self.credentials = credentials
+        self.languages   = languages or ["en", "en-US", "en-GB"]
+    def _select_caption(self, items: list) -> Optional[tuple[str, str, bool]]:
+        """
+        Pick the caption track to download from a captions.list() result.
+        A manual (non-ASR) track in any preferred language wins over an
+        auto-generated one; within each pass, languages are tried in the
+        order given by self.languages.
+        Returns (caption_id, language, is_manual), or None when no track
+        matches any preferred language.
+        """
+        for manual_only in (True, False):
+            for lang in self.languages:
+                for item in items:
+                    snippet = item.get("snippet", {})
+                    if snippet.get("language", "") != lang:
+                        continue
+                    if manual_only and snippet.get("trackKind") == "ASR":
+                        continue
+                    return item["id"], lang, manual_only
+        return None
+    def fetch(self) -> str:
+        """
+        List caption tracks, find a matching language, and download.
+        Returns plain text transcript.
+        Raises RuntimeError when credentials or the client library are
+        missing, no suitable track exists, the download is empty, or the
+        API answers 403; other API errors propagate unchanged.
+        """
+        if self.credentials is None:
+            raise RuntimeError("No OAuth credentials provided for YouTube API.")
+        logger.info("[YT-API] Attempting captions download for %s", self.video_id)
+        try:
+            from googleapiclient.discovery import build as yt_build
+            youtube = yt_build(
+                "youtube", "v3",
+                credentials=self.credentials,
+                cache_discovery=False,
+            )
+            # Step 1: List caption tracks
+            captions_response = youtube.captions().list(
+                part="snippet",
+                videoId=self.video_id,
+            ).execute()
+            items = captions_response.get("items", [])
+            if not items:
+                raise RuntimeError(
+                    f"No caption tracks found for video {self.video_id}"
+                )
+            # Step 2: Find best matching caption track
+            selection = self._select_caption(items)
+            if selection is None:
+                available = [i.get("snippet", {}).get("language") for i in items]
+                raise RuntimeError(
+                    f"No caption track matches languages {self.languages}. "
+                    f"Available: {available}"
+                )
+            caption_id, lang, is_manual = selection
+            if is_manual:
+                logger.info(
+                    "[YT-API] Found manual caption: lang=%s, id=%s",
+                    lang, caption_id,
+                )
+            else:
+                logger.info(
+                    "[YT-API] Using caption (any kind): lang=%s, id=%s",
+                    lang, caption_id,
+                )
+            # Step 3: Download caption content as SRT
+            caption_content = youtube.captions().download(
+                id=caption_id,
+                tfmt="srt",
+            ).execute()
+            # Response may be bytes or string; utf-8-sig also drops a BOM
+            if isinstance(caption_content, bytes):
+                caption_content = caption_content.decode("utf-8-sig")
+            text = _parse_srt(caption_content)
+            if not text.strip():
+                raise RuntimeError("YouTube API caption download returned empty text.")
+            logger.info(
+                "[YT-API] ✅ Transcript extracted — %d characters", len(text)
+            )
+            return text
+        except ImportError as e:
+            raise RuntimeError(
+                "google-api-python-client is not installed. "
+                "Cannot use YouTube Data API v3 fallback."
+            ) from e
+        except RuntimeError:
+            # Errors raised above already carry the right message. Letting
+            # them fall into the 403 text check below would mislabel them
+            # whenever the video id or language list happens to contain "403".
+            raise
+        except Exception as e:
+            err_str = str(e)
+            if "403" in err_str or "Forbidden" in err_str:
+                raise RuntimeError(
+                    f"YouTube API returned 403 Forbidden — you can only "
+                    f"download captions for videos you own. Error: {err_str}"
+                ) from e
+            raise
 # ============================================================================
 # GEMINI SUMMARIZER
 # ============================================================================
     """
     Orchestrates fetch → summarize.
     All data flows in memory — no disk I/O.
+    Supports multi-tier fallback for transcript extraction.
     """
     def __init__(
         youtube_url: str,
         languages: Optional[List[str]] = None,
         polling_config: dict           = None,
+        google_creds                   = None,
     ):
         self.youtube_url = youtube_url
         self.fetcher     = YouTubeTranscriptFetcher(
             youtube_url=youtube_url,
             languages=languages,
             polling_config=polling_config,
+            google_creds=google_creds,
         )
         self.summarizer  = GeminiSummarizer()
         self.video_id    = self.fetcher.video_id
         logger.info("=== Pipeline started ===")
         logger.info("Video title : %s", self.video_title)
+        transcript, extraction_method = self.fetcher.run()
+        summary, qa, model           = self.summarizer.run(transcript)
+        logger.info(
+            "=== Pipeline complete | model: %s | extraction: %s ===",
+            model, extraction_method,
+        )
         return {
+            "video_id":          self.video_id,
+            "video_title":       self.video_title,
+            "model_used":        model,
+            "extraction_method": extraction_method,
+            "summary":           summary,
+            "qa":                qa,
+            "transcript":        transcript,
         }

index.html CHANGED Viewed

@@ -360,6 +360,8 @@
       ['ti-help-circle', 'Q&A',        drive.qa?.web_view_link],
       ['ti-align-left',  'Transcript', drive.transcript?.web_view_link],
     ];
     box.innerHTML = links
       .filter(([,, u]) => u)
       .map(([icon, label, u]) => `
@@ -368,7 +370,10 @@
           <a href="${u}" target="_blank">${label} <i class="ti ti-external-link" style="font-size:11px"></i></a>
         </div>`)
       .join('') +
-      `<p class="result-note"><i class="ti ti-mail" style="font-size:13px;vertical-align:-2px"></i> Results also sent to your email</p>`;
   }
   /* ── Init ── */

       ['ti-help-circle', 'Q&A',        drive.qa?.web_view_link],
       ['ti-align-left',  'Transcript', drive.transcript?.web_view_link],
     ];
+    const method = result.extraction_method || '';
+    const methodLabel = method ? `<span style="display:inline-block;font-size:11px;padding:2px 8px;border-radius:12px;background:#f0f4ff;color:#3b5998;border:1px solid #c4d3f0;margin-left:4px">${method}</span>` : '';
     box.innerHTML = links
       .filter(([,, u]) => u)
       .map(([icon, label, u]) => `
           <a href="${u}" target="_blank">${label} <i class="ti ti-external-link" style="font-size:11px"></i></a>
         </div>`)
       .join('') +
+      `<div class="result-note" style="display:flex;align-items:center;gap:6px;flex-wrap:wrap;">
+        <span><i class="ti ti-mail" style="font-size:13px;vertical-align:-2px"></i> Results also sent to your email</span>
+        ${methodLabel ? '<span style="color:#999">·</span> Extracted via ' + methodLabel : ''}
+      </div>`;
   }
   /* ── Init ── */

oauth_states.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {}

requirements.txt CHANGED Viewed

@@ -6,4 +6,5 @@ google-auth-oauthlib
 requests
 youtube_transcript_api
 google-generativeai
-google-genai

 requests
 youtube_transcript_api
 google-generativeai
+google-genai
+yt-dlp