rsnarsna committed on
Commit
0bcc65e
·
1 Parent(s): eb0770c

feat: Enhance transcript extraction with multi-tier fallback; add YouTube API support and update UI to display extraction method

Browse files
Google_oauth_token.json CHANGED
@@ -1 +1 @@
1
- {"token": "ya29.a0AQvPyIPFC0StrVIncchRY249KeicYmMTdPM-ICSViACoeC7axklt6_pagQJFmnXAaxlQAb1pUi7DR0O0D3VGSYA3XipKuxFUZ0F8OGjpkbcqyrs3sWwEz-FrFDk6Qz_0vSl0Vf0JYVyin-w9pEOui5lLrGaz1kY8ZQOfTuq7dzS5A5rm0WhCOMZwoW3XvFG2BqEk1GMaCgYKAX4SARcSFQHGX2MiUQek-j_qAtSy-2Rt5wEdbA0206", "refresh_token": "1//0gu9-I_aiMWIFCgYIARAAGBASNwF-L9IrcIS5BPy6qGVLHH1C0EyEF612MzfTDW3_unszWw60sjA4c64i1jKfshLxfxvNnn2i1X0", "token_uri": "https://oauth2.googleapis.com/token", "client_id": "769133159215-9gbq0l5v49kmclfcq7vbq7tutck0aphd.apps.googleusercontent.com", "client_secret": "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", "scopes": ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/gmail.send", "https://www.googleapis.com/auth/drive.file"], "universe_domain": "googleapis.com", "account": "", "expiry": "2026-05-31T11:38:17.225953Z"}
 
1
+ {"token": "ya29.a0AQvPyIO26RTYjrTK11YqxleX0yEhb_vlrw0TChQxwxTP2GWBonDMQonUUdknaad1vpWBNMhMOrD0Mbw9pNon3W20odwEFIyiPcXX0DRC07hrmbPIiUN4R9hlbl5H_gZdBMfa6oHoBIAb358uMxWCtVoawWEuKAm_XZrZhIsEG8xlXSLY5e_Mi50nu77y09IYASOHe2QaCgYKAacSARcSFQHGX2MiZB9Z5G4jtAvKppNhrfPtKA0206", "refresh_token": "1//0gyGUH_G9f9CbCgYIARAAGBASNwF-L9Irnwc3yS0FAs7ocMc8Vtmxu-C3GbrkS_deBoCRToBbEBl0vkRHEjVWmIHw2EZ6RpjFGX8", "token_uri": "https://oauth2.googleapis.com/token", "client_id": "769133159215-9gbq0l5v49kmclfcq7vbq7tutck0aphd.apps.googleusercontent.com", "client_secret": "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", "scopes": ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/gmail.send", "https://www.googleapis.com/auth/drive.file"], "universe_domain": "googleapis.com", "account": "", "expiry": "2026-05-31T12:33:45Z"}
Google_oauth_token1.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"token": "ya29.a0AQvPyIPFC0StrVIncchRY249KeicYmMTdPM-ICSViACoeC7axklt6_pagQJFmnXAaxlQAb1pUi7DR0O0D3VGSYA3XipKuxFUZ0F8OGjpkbcqyrs3sWwEz-FrFDk6Qz_0vSl0Vf0JYVyin-w9pEOui5lLrGaz1kY8ZQOfTuq7dzS5A5rm0WhCOMZwoW3XvFG2BqEk1GMaCgYKAX4SARcSFQHGX2MiUQek-j_qAtSy-2Rt5wEdbA0206", "refresh_token": "1//0gu9-I_aiMWIFCgYIARAAGBASNwF-L9IrcIS5BPy6qGVLHH1C0EyEF612MzfTDW3_unszWw60sjA4c64i1jKfshLxfxvNnn2i1X0", "token_uri": "https://oauth2.googleapis.com/token", "client_id": "769133159215-9gbq0l5v49kmclfcq7vbq7tutck0aphd.apps.googleusercontent.com", "client_secret": "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", "scopes": ["https://www.googleapis.com/auth/spreadsheets", "https://www.googleapis.com/auth/gmail.send", "https://www.googleapis.com/auth/drive.file"], "universe_domain": "googleapis.com", "account": "", "expiry": "2026-05-31T11:38:17.225953Z"}
app.py CHANGED
@@ -43,6 +43,7 @@ SCOPES = [
43
  "https://www.googleapis.com/auth/spreadsheets",
44
  "https://www.googleapis.com/auth/gmail.send",
45
  "https://www.googleapis.com/auth/drive.file",
 
46
  ]
47
 
48
  SHEETS_HEADERS = [
@@ -51,15 +52,16 @@ SHEETS_HEADERS = [
51
  "Video Title", # C
52
  "YouTube URL", # D
53
  "Model Used", # E
54
- "Status", # F
55
- "Summary Drive Link", # G
56
- "Q&A Drive Link", # H
57
- "Transcript Drive Link", # I
58
- "Email Sent To", # J
59
- "Email Status", # K
60
- "Email Message ID", # L
61
- "Completed At", # M
62
- "Error", # N
 
63
  ]
64
 
65
 
@@ -493,15 +495,16 @@ def _create_sheet_record(
493
  "", # C β€” Video Title
494
  youtube_url, # D β€” YouTube URL
495
  "", # E β€” Model Used
496
- "initiated", # F β€” Status
497
- "", # G β€” Summary Link
498
- "", # H β€” Q&A Link
499
- "", # I β€” Transcript Link
500
- email_to, # J β€” Email Sent To
501
- "", # K β€” Email Status
502
- "", # L β€” Email Message ID
503
- "", # M β€” Completed At
504
- "", # N β€” Error
 
505
  ]
506
  append_sheet(DEFAULT_SPREADSHEET_ID, "Sheet1!A1", [row], creds=creds)
507
  except Exception as exc:
@@ -511,16 +514,17 @@ def _create_sheet_record(
511
  def _update_sheet_record(
512
  job_id: str,
513
  creds: Credentials,
514
- video_title: str = "",
515
- model_used: str = "",
516
- status: str = "",
517
- summary_link: str = "",
518
- qa_link: str = "",
519
- transcript_link: str = "",
520
- email_status: str = "",
521
- email_msg_id: str = "",
522
- completed_at: str = "",
523
- error: str = "",
 
524
  ) -> None:
525
  """Find job row by job_id and overwrite with updated values."""
526
  if not DEFAULT_SPREADSHEET_ID:
@@ -533,10 +537,10 @@ def _update_sheet_record(
533
 
534
  existing = read_sheet(
535
  DEFAULT_SPREADSHEET_ID,
536
- f"Sheet1!A{row_num}:N{row_num}",
537
  creds=creds,
538
  )
539
- existing_row = existing[0] if existing else [""] * 14
540
 
541
  def _v(new: str, idx: int) -> str:
542
  return new if new != "" else (
@@ -544,25 +548,26 @@ def _update_sheet_record(
544
  )
545
 
546
  updated_row = [
547
- _v("", 0), # A β€” Timestamp (immutable)
548
- job_id, # B β€” Job ID (immutable)
549
- _v(video_title, 2), # C β€” Video Title
550
- _v("", 3), # D β€” YouTube URL (immutable)
551
- _v(model_used, 4), # E β€” Model Used
552
- _v(status, 5), # F β€” Status
553
- _v(summary_link, 6), # G β€” Summary Link
554
- _v(qa_link, 7), # H β€” Q&A Link
555
- _v(transcript_link, 8), # I β€” Transcript Link
556
- _v("", 9), # J β€” Email Sent To (immutable)
557
- _v(email_status, 10), # K β€” Email Status
558
- _v(email_msg_id, 11), # L β€” Email Message ID
559
- _v(completed_at, 12), # M β€” Completed At
560
- _v(error, 13), # N β€” Error
 
561
  ]
562
 
563
  write_sheet(
564
  DEFAULT_SPREADSHEET_ID,
565
- f"Sheet1!A{row_num}:N{row_num}",
566
  [updated_row],
567
  creds=creds,
568
  )
@@ -899,12 +904,14 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
899
  pipeline = TranscriptSummaryPipeline(
900
  youtube_url=youtube_url,
901
  languages=["en", "en-US", "en-GB"],
 
902
  )
903
- transcript = pipeline.fetcher.run()
904
  _set_step(job_id, "fetch_transcript", "done")
905
  _update_sheet_record(
906
  job_id, creds,
907
  video_title=pipeline.video_title,
 
908
  status="transcript_ready",
909
  )
910
  except Exception as exc:
@@ -981,15 +988,16 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
981
 
982
  Your YouTube video has been processed successfully.
983
 
984
- πŸŽ₯ Title : {video_title}
985
- πŸ”— Video URL : {youtube_url}
986
 
987
- πŸ“„ Summary : {summary_link}
988
- ❓ Q&A : {qa_link}
989
- πŸ“ Transcript : {transcript_link}
990
 
991
  ────────────────────────────────
992
- Model Used : {model_used}
 
993
  ────────────────────────────────
994
 
995
  Regards,
@@ -1034,9 +1042,10 @@ Google Integration API
1034
  status="completed",
1035
  completed_at=completed_at,
1036
  result={
1037
- "video_title": video_title,
1038
- "youtube_url": youtube_url,
1039
- "model_used": model_used,
 
1040
  "drive": {
1041
  "folder_id": folder_id,
1042
  "summary": {
 
43
  "https://www.googleapis.com/auth/spreadsheets",
44
  "https://www.googleapis.com/auth/gmail.send",
45
  "https://www.googleapis.com/auth/drive.file",
46
+ "https://www.googleapis.com/auth/youtube.force-ssl",
47
  ]
48
 
49
  SHEETS_HEADERS = [
 
52
  "Video Title", # C
53
  "YouTube URL", # D
54
  "Model Used", # E
55
+ "Transcript Method", # F
56
+ "Status", # G
57
+ "Summary Drive Link", # H
58
+ "Q&A Drive Link", # I
59
+ "Transcript Drive Link", # J
60
+ "Email Sent To", # K
61
+ "Email Status", # L
62
+ "Email Message ID", # M
63
+ "Completed At", # N
64
+ "Error", # O
65
  ]
66
 
67
 
 
495
  "", # C β€” Video Title
496
  youtube_url, # D β€” YouTube URL
497
  "", # E β€” Model Used
498
+ "", # F β€” Transcript Method
499
+ "initiated", # G β€” Status
500
+ "", # H β€” Summary Link
501
+ "", # I β€” Q&A Link
502
+ "", # J β€” Transcript Link
503
+ email_to, # K β€” Email Sent To
504
+ "", # L β€” Email Status
505
+ "", # M β€” Email Message ID
506
+ "", # N β€” Completed At
507
+ "", # O β€” Error
508
  ]
509
  append_sheet(DEFAULT_SPREADSHEET_ID, "Sheet1!A1", [row], creds=creds)
510
  except Exception as exc:
 
514
  def _update_sheet_record(
515
  job_id: str,
516
  creds: Credentials,
517
+ video_title: str = "",
518
+ model_used: str = "",
519
+ extraction_method: str = "",
520
+ status: str = "",
521
+ summary_link: str = "",
522
+ qa_link: str = "",
523
+ transcript_link: str = "",
524
+ email_status: str = "",
525
+ email_msg_id: str = "",
526
+ completed_at: str = "",
527
+ error: str = "",
528
  ) -> None:
529
  """Find job row by job_id and overwrite with updated values."""
530
  if not DEFAULT_SPREADSHEET_ID:
 
537
 
538
  existing = read_sheet(
539
  DEFAULT_SPREADSHEET_ID,
540
+ f"Sheet1!A{row_num}:O{row_num}",
541
  creds=creds,
542
  )
543
+ existing_row = existing[0] if existing else [""] * 15
544
 
545
  def _v(new: str, idx: int) -> str:
546
  return new if new != "" else (
 
548
  )
549
 
550
  updated_row = [
551
+ _v("", 0), # A β€” Timestamp (immutable)
552
+ job_id, # B β€” Job ID (immutable)
553
+ _v(video_title, 2), # C β€” Video Title
554
+ _v("", 3), # D β€” YouTube URL (immutable)
555
+ _v(model_used, 4), # E β€” Model Used
556
+ _v(extraction_method, 5), # F β€” Transcript Method
557
+ _v(status, 6), # G β€” Status
558
+ _v(summary_link, 7), # H β€” Summary Link
559
+ _v(qa_link, 8), # I β€” Q&A Link
560
+ _v(transcript_link, 9), # J β€” Transcript Link
561
+ _v("", 10), # K β€” Email Sent To (immutable)
562
+ _v(email_status, 11), # L β€” Email Status
563
+ _v(email_msg_id, 12), # M β€” Email Message ID
564
+ _v(completed_at, 13), # N β€” Completed At
565
+ _v(error, 14), # O β€” Error
566
  ]
567
 
568
  write_sheet(
569
  DEFAULT_SPREADSHEET_ID,
570
+ f"Sheet1!A{row_num}:O{row_num}",
571
  [updated_row],
572
  creds=creds,
573
  )
 
904
  pipeline = TranscriptSummaryPipeline(
905
  youtube_url=youtube_url,
906
  languages=["en", "en-US", "en-GB"],
907
+ google_creds=creds,
908
  )
909
+ transcript, extraction_method = pipeline.fetcher.run()
910
  _set_step(job_id, "fetch_transcript", "done")
911
  _update_sheet_record(
912
  job_id, creds,
913
  video_title=pipeline.video_title,
914
+ extraction_method=extraction_method,
915
  status="transcript_ready",
916
  )
917
  except Exception as exc:
 
988
 
989
  Your YouTube video has been processed successfully.
990
 
991
+ πŸŽ₯ Title : {video_title}
992
+ πŸ”— Video URL : {youtube_url}
993
 
994
+ πŸ“„ Summary : {summary_link}
995
+ ❓ Q&A : {qa_link}
996
+ πŸ“ Transcript : {transcript_link}
997
 
998
  ────────────────────────────────
999
+ Model Used : {model_used}
1000
+ Transcript Extraction : {extraction_method}
1001
  ────────────────────────────────
1002
 
1003
  Regards,
 
1042
  status="completed",
1043
  completed_at=completed_at,
1044
  result={
1045
+ "video_title": video_title,
1046
+ "youtube_url": youtube_url,
1047
+ "model_used": model_used,
1048
+ "extraction_method": extraction_method,
1049
  "drive": {
1050
  "folder_id": folder_id,
1051
  "summary": {
gemini_transcript.py CHANGED
@@ -127,6 +127,79 @@ def _format_duration(seconds: int) -> str:
127
  return f"{h}h {m}m" if m else f"{h}h"
128
 
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
130
  def fetch_video_title(video_id: str) -> str:
131
  """Fetch YouTube video title via oembed β€” no API key needed."""
132
  try:
@@ -151,8 +224,11 @@ def fetch_video_title(video_id: str) -> str:
151
 
152
  class YouTubeTranscriptFetcher:
153
  """
154
- Fetches a YouTube transcript and returns it as a plain string.
155
- No files are written to disk.
 
 
 
156
  """
157
 
158
  def __init__(
@@ -160,12 +236,14 @@ class YouTubeTranscriptFetcher:
160
  youtube_url: str,
161
  languages: Optional[List[str]] = None,
162
  polling_config: dict = None,
 
163
  ):
164
  self.youtube_url = youtube_url
165
  self.languages = languages or ["en", "en-US", "en-GB"]
166
  self.polling_config = polling_config or POLLING_CONFIG
167
  self.video_id = self._extract_video_id(youtube_url)
168
  self.api = YouTubeTranscriptApi()
 
169
 
170
  @staticmethod
171
  def _extract_video_id(url: str) -> str:
@@ -186,10 +264,64 @@ class YouTubeTranscriptFetcher:
186
  transcript = self.api.fetch(self.video_id, languages=self.languages)
187
  return " ".join(item.text for item in transcript)
188
 
189
- def run(self) -> str:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  """
191
- Fetch transcript with polling retry.
192
- Returns transcript as a string β€” nothing is written to disk.
 
193
  """
194
  logger.info("Video ID : %s", self.video_id)
195
  logger.info("Polling attempts : %d", len(self.polling_config))
@@ -209,34 +341,24 @@ class YouTubeTranscriptFetcher:
209
  time.sleep(wait_before)
210
 
211
  logger.info(
212
- "[%d/%d] %s β€” fetching transcript now...",
213
  idx, len(attempts), description,
214
  )
215
 
216
  try:
217
- text = self._fetch_once()
218
  logger.info(
219
- "[%d/%d] βœ… Transcript fetched β€” %d characters",
220
- idx, len(attempts), len(text),
221
  )
222
- return text
223
-
224
- except TranscriptsDisabled as e:
225
- logger.warning("[%d/%d] Transcripts disabled: %s", idx, len(attempts), e)
226
- raise # no point retrying
227
-
228
- except VideoUnavailable as e:
229
- logger.warning("[%d/%d] Video unavailable: %s", idx, len(attempts), e)
230
-
231
- except NoTranscriptFound as e:
232
- logger.warning("[%d/%d] No transcript yet: %s", idx, len(attempts), e)
233
 
234
  except KeyboardInterrupt:
235
  logger.warning("Interrupted by user.")
236
  raise
237
 
238
  except Exception as e:
239
- logger.exception("[%d/%d] Unexpected error: %s", idx, len(attempts), e)
240
 
241
  if idx < len(attempts):
242
  next_cfg = attempts[idx][1]
@@ -255,6 +377,249 @@ class YouTubeTranscriptFetcher:
255
  )
256
 
257
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  # ============================================================================
259
  # GEMINI SUMMARIZER
260
  # ============================================================================
@@ -375,6 +740,7 @@ class TranscriptSummaryPipeline:
375
  """
376
  Orchestrates fetch β†’ summarize.
377
  All data flows in memory β€” no disk I/O.
 
378
  """
379
 
380
  def __init__(
@@ -382,12 +748,14 @@ class TranscriptSummaryPipeline:
382
  youtube_url: str,
383
  languages: Optional[List[str]] = None,
384
  polling_config: dict = None,
 
385
  ):
386
  self.youtube_url = youtube_url
387
  self.fetcher = YouTubeTranscriptFetcher(
388
  youtube_url=youtube_url,
389
  languages=languages,
390
  polling_config=polling_config,
 
391
  )
392
  self.summarizer = GeminiSummarizer()
393
  self.video_id = self.fetcher.video_id
@@ -397,18 +765,22 @@ class TranscriptSummaryPipeline:
397
  logger.info("=== Pipeline started ===")
398
  logger.info("Video title : %s", self.video_title)
399
 
400
- transcript = self.fetcher.run()
401
- summary, qa, model = self.summarizer.run(transcript)
402
 
403
- logger.info("=== Pipeline complete | model: %s ===", model)
 
 
 
404
 
405
  return {
406
- "video_id": self.video_id,
407
- "video_title": self.video_title,
408
- "model_used": model,
409
- "summary": summary,
410
- "qa": qa,
411
- "transcript": transcript,
 
412
  }
413
 
414
 
 
127
  return f"{h}h {m}m" if m else f"{h}h"
128
 
129
 
130
+ # ============================================================================
131
+ # SUBTITLE PARSERS
132
+ # ============================================================================
133
+
134
+ def _parse_vtt(content: str) -> str:
135
+ """
136
+ Parse WebVTT subtitle content into clean plain text.
137
+ Strips headers, timestamps, position metadata, and deduplicates
138
+ consecutive identical lines (VTT scrolling captions repeat text).
139
+ """
140
+ lines = content.splitlines()
141
+ text_lines: list[str] = []
142
+ prev_line = ""
143
+
144
+ for line in lines:
145
+ stripped = line.strip()
146
+ # Skip empty lines
147
+ if not stripped:
148
+ continue
149
+ # Skip VTT header
150
+ if stripped.startswith("WEBVTT"):
151
+ continue
152
+ # Skip metadata lines (Kind:, Language:, Style, NOTE, etc.)
153
+ if re.match(r"^(Kind:|Language:|Style|NOTE)", stripped, re.IGNORECASE):
154
+ continue
155
+ # Skip timestamp lines (00:00:00.000 --> 00:00:05.000)
156
+ if re.match(r"^\d{2}:\d{2}[:\.]\d{2}[\.:]\d{3}\s*-->\s*\d{2}:\d{2}", stripped):
157
+ continue
158
+ # Skip position/alignment metadata
159
+ if re.match(r"^(position:|align:|line:|size:)", stripped, re.IGNORECASE):
160
+ continue
161
+ # Skip sequence numbers (pure digits)
162
+ if stripped.isdigit():
163
+ continue
164
+ # Strip inline tags like <c>, </c>, <00:00:01.234>
165
+ cleaned = re.sub(r"<[^>]+>", "", stripped)
166
+ cleaned = cleaned.strip()
167
+ if not cleaned:
168
+ continue
169
+ # Deduplicate consecutive identical lines
170
+ if cleaned != prev_line:
171
+ text_lines.append(cleaned)
172
+ prev_line = cleaned
173
+
174
+ return " ".join(text_lines)
175
+
176
+
177
+ def _parse_srt(content: str) -> str:
178
+ """
179
+ Parse SRT subtitle content into clean plain text.
180
+ Strips sequence numbers and timing lines.
181
+ """
182
+ lines = content.splitlines()
183
+ text_lines: list[str] = []
184
+
185
+ for line in lines:
186
+ stripped = line.strip()
187
+ if not stripped:
188
+ continue
189
+ # Skip sequence numbers
190
+ if stripped.isdigit():
191
+ continue
192
+ # Skip timing lines
193
+ if re.match(r"^\d{2}:\d{2}:\d{2}[,.]\d{3}\s*-->\s*\d{2}:\d{2}", stripped):
194
+ continue
195
+ # Strip HTML-style tags
196
+ cleaned = re.sub(r"<[^>]+>", "", stripped).strip()
197
+ if cleaned:
198
+ text_lines.append(cleaned)
199
+
200
+ return " ".join(text_lines)
201
+
202
+
203
  def fetch_video_title(video_id: str) -> str:
204
  """Fetch YouTube video title via oembed β€” no API key needed."""
205
  try:
 
224
 
225
  class YouTubeTranscriptFetcher:
226
  """
227
+ Fetches a YouTube transcript using a multi-tier fallback strategy:
228
+ Tier 1: youtube_transcript_api (fast, works for most public videos)
229
+ Tier 2: yt-dlp (robust, handles auto-generated + manual subs)
230
+ Tier 3: YouTube Data API v3 (only for videos the user owns)
231
+ Returns (transcript_text, extraction_method) tuple.
232
  """
233
 
234
  def __init__(
 
236
  youtube_url: str,
237
  languages: Optional[List[str]] = None,
238
  polling_config: dict = None,
239
+ google_creds = None,
240
  ):
241
  self.youtube_url = youtube_url
242
  self.languages = languages or ["en", "en-US", "en-GB"]
243
  self.polling_config = polling_config or POLLING_CONFIG
244
  self.video_id = self._extract_video_id(youtube_url)
245
  self.api = YouTubeTranscriptApi()
246
+ self.google_creds = google_creds
247
 
248
  @staticmethod
249
  def _extract_video_id(url: str) -> str:
 
264
  transcript = self.api.fetch(self.video_id, languages=self.languages)
265
  return " ".join(item.text for item in transcript)
266
 
267
def _try_all_tiers(self) -> tuple[str, str]:
    """
    Try all transcript extraction tiers in order.

    Tier 1 is ``self._fetch_once()`` (youtube_transcript_api), tier 2 is
    ``YtDlpTranscriptFetcher`` and tier 3 is ``YouTubeApiTranscriptFetcher``,
    which is attempted only when ``self.google_creds`` is set. A tier counts
    as failed when it raises or, for tier 1, when it yields only whitespace;
    the other tiers already reject empty text themselves.

    Returns:
        ``(transcript_text, method_used)`` from the first tier that succeeds.

    Raises:
        RuntimeError: If every tier fails; the message lists each tier's error.
    """
    errors: list[str] = []

    # ── Tier 1: youtube_transcript_api ──
    try:
        text = self._fetch_once()
        if not text or not text.strip():
            # Treat an empty result like a failure so the fallbacks still run.
            raise RuntimeError("youtube_transcript_api returned an empty transcript.")
        logger.info("[Tier 1] ✅ youtube_transcript_api succeeded — %d chars", len(text))
        return text, "youtube_transcript_api"
    except TranscriptsDisabled as e:
        # Still try yt-dlp — sometimes auto-subs exist even when
        # the captions toggle is "disabled" for the transcript API
        errors.append(f"Tier1(TranscriptsDisabled): {e}")
        logger.warning("[Tier 1] Transcripts disabled, trying fallbacks...")
    except Exception as e:
        errors.append(f"Tier1: {e}")
        logger.warning("[Tier 1] Failed: %s", e)

    # ── Tier 2: yt-dlp ──
    try:
        text = YtDlpTranscriptFetcher(
            self.video_id, languages=self.languages
        ).fetch()
        logger.info("[Tier 2] ✅ yt-dlp succeeded — %d chars", len(text))
        return text, "yt-dlp"
    except Exception as e:
        errors.append(f"Tier2(yt-dlp): {e}")
        logger.warning("[Tier 2] Failed: %s", e)

    # ── Tier 3: YouTube Data API v3 (owned videos only) ──
    if self.google_creds:
        try:
            text = YouTubeApiTranscriptFetcher(
                self.video_id, self.google_creds, languages=self.languages
            ).fetch()
            logger.info("[Tier 3] ✅ YouTube Data API v3 succeeded — %d chars", len(text))
            return text, "youtube_data_api_v3"
        except Exception as e:
            errors.append(f"Tier3(YT-API): {e}")
            logger.warning("[Tier 3] Failed: %s", e)
    else:
        errors.append("Tier3: Skipped (no OAuth credentials)")
        logger.info("[Tier 3] Skipped — no Google OAuth credentials provided.")

    raise RuntimeError(
        f"All transcript tiers failed for video {self.video_id}. "
        f"Details: {'; '.join(errors)}"
    )
319
+
320
+ def run(self) -> tuple[str, str]:
321
  """
322
+ Fetch transcript with polling retry and multi-tier fallback.
323
+ On each polling attempt, all tiers are tried before waiting.
324
+ Returns (transcript_text, extraction_method).
325
  """
326
  logger.info("Video ID : %s", self.video_id)
327
  logger.info("Polling attempts : %d", len(self.polling_config))
 
341
  time.sleep(wait_before)
342
 
343
  logger.info(
344
+ "[%d/%d] %s β€” trying all transcript tiers...",
345
  idx, len(attempts), description,
346
  )
347
 
348
  try:
349
+ text, method = self._try_all_tiers()
350
  logger.info(
351
+ "[%d/%d] βœ… Transcript fetched via %s β€” %d characters",
352
+ idx, len(attempts), method, len(text),
353
  )
354
+ return text, method
 
 
 
 
 
 
 
 
 
 
355
 
356
  except KeyboardInterrupt:
357
  logger.warning("Interrupted by user.")
358
  raise
359
 
360
  except Exception as e:
361
+ logger.warning("[%d/%d] All tiers failed: %s", idx, len(attempts), e)
362
 
363
  if idx < len(attempts):
364
  next_cfg = attempts[idx][1]
 
377
  )
378
 
379
 
380
+ # ============================================================================
381
+ # TIER 2 β€” yt-dlp PYTHON API SUBTITLE FETCHER
382
+ # ============================================================================
383
+
384
class YtDlpTranscriptFetcher:
    """
    Tier 2 fallback — uses yt-dlp's Python API (no subprocess).

    Extracts subtitle URLs from video metadata via ``extract_info()``,
    then fetches the chosen track in memory over HTTP and converts it to
    plain text. Manual subtitles are preferred over auto-generated captions.
    """

    # Formats are tried in this order for each language.
    PREFERRED_FORMATS = ["vtt", "srt", "srv1", "srv2", "srv3", "ttml"]

    def __init__(self, video_id: str, languages: Optional[List[str]] = None):
        self.video_id = video_id
        self.languages = languages or ["en", "en-US", "en-GB"]

    def _find_subtitle_url(self, manual_subs: dict, auto_subs: dict) -> tuple[str, str]:
        """
        Pick a subtitle track URL for the preferred languages.

        Manual subtitles are searched first, then auto-generated captions.
        Within a language, ``PREFERRED_FORMATS`` are tried in order; if none
        matches, the first track that has a URL is used.

        Returns:
            ``(url, format_ext)``; ``format_ext`` defaults to ``"vtt"`` when
            the chosen track carries no ``ext``.

        Raises:
            RuntimeError: If no track with a URL exists for any language.
        """
        for subs_dict in (manual_subs, auto_subs):
            if not subs_dict:
                continue
            for lang in self.languages:
                tracks = subs_dict.get(lang)  # list of {ext, url, ...}
                if not tracks:
                    continue
                with_url = [t for t in tracks if t.get("url")]
                # Try preferred formats in order
                for fmt in self.PREFERRED_FORMATS:
                    for track in with_url:
                        if track.get("ext") == fmt:
                            return track["url"], fmt
                # No preferred format matched — use first available with URL
                if with_url:
                    return with_url[0]["url"], with_url[0].get("ext", "vtt")

        raise RuntimeError(
            f"No subtitles found in yt-dlp metadata for "
            f"languages {self.languages} (video: {self.video_id})"
        )

    @staticmethod
    def _strip_markup(raw: str) -> str:
        """
        Reduce an XML-based subtitle document (srv1/srv2/srv3/ttml) to text.

        Tags are replaced by a space rather than deleted so that adjacent
        elements do not fuse into one word, then entities are decoded and
        whitespace is collapsed.
        """
        import html  # local import: only this fallback needs it

        text = re.sub(r"<[^>]+>", " ", raw)
        # Two passes: caption text inside the XML may itself be HTML-escaped
        # (e.g. "&amp;#39;"); the second pass is a no-op when it is not.
        # NOTE(review): double escaping observed for srv1 — confirm for ttml.
        text = html.unescape(html.unescape(text))
        return re.sub(r"\s+", " ", text).strip()

    def fetch(self) -> str:
        """
        Extract a subtitle URL from video metadata, download the track in
        memory and return it as plain text. No files are written to disk.

        Raises:
            RuntimeError: If metadata extraction fails, no matching subtitle
                track exists, the download fails, or the parsed text is empty.
        """
        import yt_dlp
        import requests as _requests

        logger.info("[yt-dlp] Attempting in-memory subtitle extraction for %s", self.video_id)

        url = f"https://www.youtube.com/watch?v={self.video_id}"

        ydl_opts = {
            "skip_download": True,
            "quiet": True,
            "no_warnings": True,
            "noplaylist": True,
            "extract_flat": False,
        }

        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                info = ydl.extract_info(url, download=False)
        except Exception as e:
            raise RuntimeError(f"yt-dlp extract_info failed: {e}") from e

        if not info:
            raise RuntimeError("yt-dlp returned empty info dict.")

        manual_subs = info.get("subtitles") or {}
        auto_subs = info.get("automatic_captions") or {}

        logger.info(
            "[yt-dlp] Found %d manual sub tracks, %d auto-caption tracks",
            len(manual_subs), len(auto_subs),
        )

        sub_url, sub_fmt = self._find_subtitle_url(manual_subs, auto_subs)

        logger.info("[yt-dlp] Fetching subtitle content (format=%s)", sub_fmt)

        # Fetch subtitle content in-memory
        try:
            resp = _requests.get(sub_url, timeout=30)
            resp.raise_for_status()
            raw_content = resp.text
        except Exception as e:
            raise RuntimeError(f"Failed to fetch subtitle from URL: {e}") from e

        if not raw_content.strip():
            raise RuntimeError("Subtitle URL returned empty content.")

        # Parse based on format
        if sub_fmt == "vtt":
            text = _parse_vtt(raw_content)
        elif sub_fmt == "srt":
            text = _parse_srt(raw_content)
        else:
            # For srv1/srv2/srv3/ttml — generic markup stripping as fallback
            text = self._strip_markup(raw_content)

        if not text.strip():
            raise RuntimeError(
                f"yt-dlp subtitle content was empty after parsing (format={sub_fmt})."
            )

        logger.info(
            "[yt-dlp] ✅ Transcript extracted — %d characters (format=%s)",
            len(text), sub_fmt,
        )
        return text
501
+
502
+
503
+ # ============================================================================
504
+ # TIER 3 β€” YOUTUBE DATA API v3 CAPTIONS FETCHER
505
+ # ============================================================================
506
+
507
class YouTubeApiTranscriptFetcher:
    """
    Fallback fetcher using the official YouTube Data API v3.

    ⚠️ Only works for videos the authenticated user OWNS.
    Requires OAuth credentials with youtube.force-ssl scope.
    """

    def __init__(self, video_id: str, credentials, languages: Optional[List[str]] = None):
        self.video_id = video_id
        self.credentials = credentials
        self.languages = languages or ["en", "en-US", "en-GB"]

    def _select_caption(self, items: list) -> Optional[tuple[str, str, bool]]:
        """
        Choose the best caption track from a ``captions.list`` response.

        The first pass accepts only non-ASR (manual) tracks; the second
        accepts any track. Within a pass, ``self.languages`` order wins.

        Returns:
            ``(caption_id, language, is_manual)`` or ``None`` if no track
            matches any preferred language.
        """
        for manual_only in (True, False):
            for lang in self.languages:
                for item in items:
                    snippet = item.get("snippet", {})
                    if snippet.get("language", "") != lang:
                        continue
                    if manual_only and snippet.get("trackKind") == "ASR":
                        continue
                    return item["id"], lang, manual_only
        return None

    @staticmethod
    def _is_forbidden(exc: Exception) -> bool:
        """
        Report whether ``exc`` represents an HTTP 403 response.

        Uses the ``resp.status`` attribute when the exception carries one;
        otherwise falls back to matching the message text.
        """
        status = getattr(getattr(exc, "resp", None), "status", None)
        if status is not None:
            return str(status) == "403"
        message = str(exc)
        return "403" in message or "Forbidden" in message

    def fetch(self) -> str:
        """
        List caption tracks, find a matching language, and download it.

        Returns:
            The plain-text transcript parsed from the SRT download.

        Raises:
            RuntimeError: If credentials are missing, the Google API client
                is not installed, no suitable caption track exists, the API
                answers 403, or the downloaded captions are empty.
            Exception: Any other API error is re-raised unchanged.
        """
        if self.credentials is None:
            raise RuntimeError("No OAuth credentials provided for YouTube API.")

        logger.info("[YT-API] Attempting captions download for %s", self.video_id)

        try:
            from googleapiclient.discovery import build as yt_build
        except ImportError as e:
            raise RuntimeError(
                "google-api-python-client is not installed. "
                "Cannot use YouTube Data API v3 fallback."
            ) from e

        try:
            youtube = yt_build(
                "youtube", "v3",
                credentials=self.credentials,
                cache_discovery=False,
            )

            # Step 1: List caption tracks
            captions_response = youtube.captions().list(
                part="snippet",
                videoId=self.video_id,
            ).execute()

            items = captions_response.get("items", [])
            if not items:
                raise RuntimeError(
                    f"No caption tracks found for video {self.video_id}"
                )

            # Step 2: Find best matching caption track
            selection = self._select_caption(items)
            if selection is None:
                available = [i.get("snippet", {}).get("language") for i in items]
                raise RuntimeError(
                    f"No caption track matches languages {self.languages}. "
                    f"Available: {available}"
                )

            caption_id, lang, is_manual = selection
            if is_manual:
                logger.info(
                    "[YT-API] Found manual caption: lang=%s, id=%s",
                    lang, caption_id,
                )
            else:
                logger.info(
                    "[YT-API] Using caption (any kind): lang=%s, id=%s",
                    lang, caption_id,
                )

            # Step 3: Download caption content as SRT
            caption_content = youtube.captions().download(
                id=caption_id,
                tfmt="srt",
            ).execute()

            # Response may be bytes or string; "utf-8-sig" also drops a BOM.
            if isinstance(caption_content, bytes):
                caption_content = caption_content.decode("utf-8-sig")

            text = _parse_srt(caption_content)

            if not text.strip():
                raise RuntimeError("YouTube API caption download returned empty text.")

            logger.info(
                "[YT-API] ✅ Transcript extracted — %d characters", len(text)
            )
            return text

        except RuntimeError:
            # Our own diagnostics above: never reinterpret them as HTTP 403
            # just because the message (e.g. a video ID) contains "403".
            raise
        except Exception as e:
            if self._is_forbidden(e):
                err_str = str(e)
                raise RuntimeError(
                    f"YouTube API returned 403 Forbidden — you can only "
                    f"download captions for videos you own. Error: {err_str}"
                ) from e
            raise
621
+
622
+
623
  # ============================================================================
624
  # GEMINI SUMMARIZER
625
  # ============================================================================
 
740
  """
741
  Orchestrates fetch β†’ summarize.
742
  All data flows in memory β€” no disk I/O.
743
+ Supports multi-tier fallback for transcript extraction.
744
  """
745
 
746
  def __init__(
 
748
  youtube_url: str,
749
  languages: Optional[List[str]] = None,
750
  polling_config: dict = None,
751
+ google_creds = None,
752
  ):
753
  self.youtube_url = youtube_url
754
  self.fetcher = YouTubeTranscriptFetcher(
755
  youtube_url=youtube_url,
756
  languages=languages,
757
  polling_config=polling_config,
758
+ google_creds=google_creds,
759
  )
760
  self.summarizer = GeminiSummarizer()
761
  self.video_id = self.fetcher.video_id
 
765
  logger.info("=== Pipeline started ===")
766
  logger.info("Video title : %s", self.video_title)
767
 
768
+ transcript, extraction_method = self.fetcher.run()
769
+ summary, qa, model = self.summarizer.run(transcript)
770
 
771
+ logger.info(
772
+ "=== Pipeline complete | model: %s | extraction: %s ===",
773
+ model, extraction_method,
774
+ )
775
 
776
  return {
777
+ "video_id": self.video_id,
778
+ "video_title": self.video_title,
779
+ "model_used": model,
780
+ "extraction_method": extraction_method,
781
+ "summary": summary,
782
+ "qa": qa,
783
+ "transcript": transcript,
784
  }
785
 
786
 
index.html CHANGED
@@ -360,6 +360,8 @@
360
  ['ti-help-circle', 'Q&A', drive.qa?.web_view_link],
361
  ['ti-align-left', 'Transcript', drive.transcript?.web_view_link],
362
  ];
 
 
363
  box.innerHTML = links
364
  .filter(([,, u]) => u)
365
  .map(([icon, label, u]) => `
@@ -368,7 +370,10 @@
368
  <a href="${u}" target="_blank">${label} <i class="ti ti-external-link" style="font-size:11px"></i></a>
369
  </div>`)
370
  .join('') +
371
- `<p class="result-note"><i class="ti ti-mail" style="font-size:13px;vertical-align:-2px"></i> Results also sent to your email</p>`;
 
 
 
372
  }
373
 
374
  /* ── Init ── */
 
360
  ['ti-help-circle', 'Q&A', drive.qa?.web_view_link],
361
  ['ti-align-left', 'Transcript', drive.transcript?.web_view_link],
362
  ];
363
+ const method = result.extraction_method || '';
364
+ const methodLabel = method ? `<span style="display:inline-block;font-size:11px;padding:2px 8px;border-radius:12px;background:#f0f4ff;color:#3b5998;border:1px solid #c4d3f0;margin-left:4px">${method}</span>` : '';
365
  box.innerHTML = links
366
  .filter(([,, u]) => u)
367
  .map(([icon, label, u]) => `
 
370
  <a href="${u}" target="_blank">${label} <i class="ti ti-external-link" style="font-size:11px"></i></a>
371
  </div>`)
372
  .join('') +
373
+ `<div class="result-note" style="display:flex;align-items:center;gap:6px;flex-wrap:wrap;">
374
+ <span><i class="ti ti-mail" style="font-size:13px;vertical-align:-2px"></i> Results also sent to your email</span>
375
+ ${methodLabel ? '<span style="color:#999">Β·</span> Extracted via ' + methodLabel : ''}
376
+ </div>`;
377
  }
378
 
379
  /* ── Init ── */
oauth_states.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
requirements.txt CHANGED
@@ -6,4 +6,5 @@ google-auth-oauthlib
6
  requests
7
  youtube_transcript_api
8
  google-generativeai
9
- google-genai
 
 
6
  requests
7
  youtube_transcript_api
8
  google-generativeai
9
+ google-genai
10
+ yt-dlp