rsnarsna committed on
Commit
f59712d
·
1 Parent(s): 24cacb3

refactor: Clean up imports and improve file path handling in app.py and gemini_transcript.py; update requirements.txt for new dependencies

Browse files
Files changed (3) hide show
  1. app.py +20 -21
  2. gemini_transcript.py +16 -12
  3. requirements.txt +3 -1
app.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import os
2
  import json
3
  import base64
@@ -11,9 +13,8 @@ from pathlib import Path
11
  from email.mime.text import MIMEText
12
  from datetime import datetime, timezone
13
 
14
- from fastapi.responses import FileResponse
15
  from fastapi import FastAPI, Request, HTTPException
16
- from fastapi.responses import RedirectResponse, HTMLResponse
17
  from pydantic import BaseModel
18
 
19
  from google_auth_oauthlib.flow import Flow
@@ -32,12 +33,12 @@ from gemini_transcript import TranscriptSummaryPipeline
32
  os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
33
 
34
  BASE_DIR = Path(__file__).resolve().parent
35
- CLIENT_SECRETS = os.getenv("CLIENT_SECRETS", str(BASE_DIR / "client_secret.json"))
36
- TOKEN_PATH = os.getenv("GOOGLE_OAUTH_TOKEN_PATH", str(BASE_DIR / "Google_oauth_token.json"))
37
- REDIRECT_URI = os.getenv("REDIRECT_URI", "http://localhost:8000/auth/callback")
38
  STATE_FILE = BASE_DIR / "oauth_states.json"
39
- DEFAULT_SPREADSHEET_ID = os.getenv("DEFAULT_SPREADSHEET_ID", "1XA3vW_guHBT-ktkYvhktmUqcquECBe8exGZAoSQS3Ag")
40
- DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID", "1hI6dNXysR_2p9gHkDpsI-iwMExmy2hhR")
41
 
42
  SCOPES = [
43
  "https://www.googleapis.com/auth/spreadsheets",
@@ -45,10 +46,10 @@ SCOPES = [
45
  "https://www.googleapis.com/auth/drive.file",
46
  ]
47
 
48
- BASE_DIR = Path(".")
49
- TRANSCRIPT_FILE = BASE_DIR / "output" / "transcript.txt"
50
- SUMMARY_FILE = BASE_DIR / "output" / "summary.txt"
51
- QA_FILE = BASE_DIR / "output" / "qa.txt"
52
 
53
  SHEETS_HEADERS = [
54
  "Timestamp", # A
@@ -274,8 +275,8 @@ def upload_file_to_drive(
274
  ) -> dict:
275
  if creds is None:
276
  creds = require_credentials()
277
- svc = _drive(creds)
278
- meta = {"name": os.path.basename(filepath)}
279
  if folder_id:
280
  meta["parents"] = [folder_id]
281
  media = MediaFileUpload(filepath, resumable=True)
@@ -556,7 +557,6 @@ def _update_sheet_record(
556
  print(f"[WARN] Row for job {job_id} not found in sheet.")
557
  return
558
 
559
- # Read existing to preserve immutable columns
560
  existing = read_sheet(
561
  DEFAULT_SPREADSHEET_ID,
562
  f"Sheet1!A{row_num}:N{row_num}",
@@ -613,7 +613,7 @@ def on_startup():
613
 
614
  @app.get("/")
615
  def root():
616
- return FileResponse("index.html")
617
 
618
 
619
  @app.get("/health")
@@ -641,9 +641,9 @@ def health():
641
 
642
  @app.get("/auth/start")
643
  def auth_start():
644
- flow = create_flow()
645
- verifier = secrets.token_urlsafe(64)
646
- challenge = (
647
  base64.urlsafe_b64encode(hashlib.sha256(verifier.encode()).digest())
648
  .rstrip(b"=").decode()
649
  )
@@ -904,7 +904,6 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
904
  creds = load_credentials()
905
  timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
906
 
907
- # Create initial sheet row immediately
908
  _create_sheet_record(
909
  job_id=job_id,
910
  timestamp=timestamp,
@@ -1063,11 +1062,11 @@ Google Integration API
1063
  "model_used": model_used,
1064
  "drive": {
1065
  "folder_id": folder_id,
1066
- "summary": {
1067
  "web_view_link": summary_drive.get("webViewLink"),
1068
  "direct_download_link": summary_link,
1069
  },
1070
- "qa": {
1071
  "web_view_link": qa_drive.get("webViewLink"),
1072
  "direct_download_link": qa_link,
1073
  },
 
1
+ #!/usr/bin/env python3
2
+
3
  import os
4
  import json
5
  import base64
 
13
  from email.mime.text import MIMEText
14
  from datetime import datetime, timezone
15
 
 
16
  from fastapi import FastAPI, Request, HTTPException
17
+ from fastapi.responses import RedirectResponse, HTMLResponse, FileResponse
18
  from pydantic import BaseModel
19
 
20
  from google_auth_oauthlib.flow import Flow
 
33
  os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
34
 
35
  BASE_DIR = Path(__file__).resolve().parent
36
+ CLIENT_SECRETS = os.getenv("CLIENT_SECRETS", str(BASE_DIR / "client_secret.json"))
37
+ TOKEN_PATH = os.getenv("GOOGLE_OAUTH_TOKEN_PATH", str(BASE_DIR / "Google_oauth_token.json"))
38
+ REDIRECT_URI = os.getenv("REDIRECT_URI", "http://localhost:8000/auth/callback")
39
  STATE_FILE = BASE_DIR / "oauth_states.json"
40
+ DEFAULT_SPREADSHEET_ID = os.getenv("DEFAULT_SPREADSHEET_ID", "1XA3vW_guHBT-ktkYvhktmUqcquECBe8exGZAoSQS3Ag")
41
+ DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID", "1hI6dNXysR_2p9gHkDpsI-iwMExmy2hhR")
42
 
43
  SCOPES = [
44
  "https://www.googleapis.com/auth/spreadsheets",
 
46
  "https://www.googleapis.com/auth/drive.file",
47
  ]
48
 
49
+ OUTPUT_DIR = Path(".") / "output"
50
+ TRANSCRIPT_FILE = OUTPUT_DIR / "transcript.txt"
51
+ SUMMARY_FILE = OUTPUT_DIR / "summary.txt"
52
+ QA_FILE = OUTPUT_DIR / "qa.txt"
53
 
54
  SHEETS_HEADERS = [
55
  "Timestamp", # A
 
275
  ) -> dict:
276
  if creds is None:
277
  creds = require_credentials()
278
+ svc = _drive(creds)
279
+ meta = {"name": os.path.basename(filepath)}
280
  if folder_id:
281
  meta["parents"] = [folder_id]
282
  media = MediaFileUpload(filepath, resumable=True)
 
557
  print(f"[WARN] Row for job {job_id} not found in sheet.")
558
  return
559
 
 
560
  existing = read_sheet(
561
  DEFAULT_SPREADSHEET_ID,
562
  f"Sheet1!A{row_num}:N{row_num}",
 
613
 
614
  @app.get("/")
615
  def root():
616
+ return FileResponse("index.html")
617
 
618
 
619
  @app.get("/health")
 
641
 
642
  @app.get("/auth/start")
643
  def auth_start():
644
+ flow = create_flow()
645
+ verifier = secrets.token_urlsafe(64)
646
+ challenge = (
647
  base64.urlsafe_b64encode(hashlib.sha256(verifier.encode()).digest())
648
  .rstrip(b"=").decode()
649
  )
 
904
  creds = load_credentials()
905
  timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
906
 
 
907
  _create_sheet_record(
908
  job_id=job_id,
909
  timestamp=timestamp,
 
1062
  "model_used": model_used,
1063
  "drive": {
1064
  "folder_id": folder_id,
1065
+ "summary": {
1066
  "web_view_link": summary_drive.get("webViewLink"),
1067
  "direct_download_link": summary_link,
1068
  },
1069
+ "qa": {
1070
  "web_view_link": qa_drive.get("webViewLink"),
1071
  "direct_download_link": qa_link,
1072
  },
gemini_transcript.py CHANGED
@@ -11,7 +11,9 @@ import time
11
  from pathlib import Path
12
  from typing import Optional, List
13
  from urllib.parse import urlparse, parse_qs
14
- # from google import genai
 
 
15
 
16
  from youtube_transcript_api import (
17
  YouTubeTranscriptApi,
@@ -267,10 +269,9 @@ class YouTubeTranscriptFetcher:
267
  class GeminiSummarizer:
268
  """Sends transcript to Gemini with model fallback + per-model retry."""
269
 
270
- # Retry config
271
- MAX_RETRIES = 5
272
- BASE_WAIT = 10 # seconds
273
- MAX_WAIT = 120 # seconds cap
274
 
275
  # Errors → retry same model with backoff
276
  RETRYABLE = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
@@ -308,7 +309,9 @@ class GeminiSummarizer:
308
  response = self.client.models.generate_content(
309
  model=model,
310
  contents=transcript,
311
- config={"system_instruction": SYSTEM_PROMPT},
 
 
312
  )
313
  logger.info(
314
  "βœ… Response received from: %s (attempt %d)",
@@ -325,21 +328,19 @@ class GeminiSummarizer:
325
  " [%d/%d] %s β€” quota/not-found, skipping to next model.",
326
  attempt, self.MAX_RETRIES, model,
327
  )
328
- break # skip to next model
329
 
330
  elif any(k in err for k in self.RETRYABLE):
331
  if attempt < self.MAX_RETRIES:
332
  logger.warning(
333
- " [%d/%d] %s β€” transient error. "
334
- "Retrying in %ds...",
335
  attempt, self.MAX_RETRIES, model, wait,
336
  )
337
  time.sleep(wait)
338
  wait = min(wait * 2, self.MAX_WAIT)
339
  else:
340
  logger.warning(
341
- " [%d/%d] %s β€” max retries reached, "
342
- "trying next model.",
343
  attempt, self.MAX_RETRIES, model,
344
  )
345
 
@@ -368,6 +369,9 @@ class GeminiSummarizer:
368
  full, model_used = self._call_api(transcript)
369
  summary, qa = self._split(full)
370
 
 
 
 
371
  self.summary_file.write_text(summary, encoding="utf-8")
372
  self.qa_file.write_text(qa, encoding="utf-8")
373
 
@@ -425,7 +429,7 @@ class TranscriptSummaryPipeline:
425
 
426
  def main():
427
  if len(sys.argv) < 2:
428
- print("Usage: python gemini.py <youtube_url>", file=sys.stderr)
429
  sys.exit(1)
430
 
431
  pipeline = TranscriptSummaryPipeline(
 
11
  from pathlib import Path
12
  from typing import Optional, List
13
  from urllib.parse import urlparse, parse_qs
14
+
15
+ from google import genai # pip install google-genai
16
+ from google.genai import types
17
 
18
  from youtube_transcript_api import (
19
  YouTubeTranscriptApi,
 
269
  class GeminiSummarizer:
270
  """Sends transcript to Gemini with model fallback + per-model retry."""
271
 
272
+ MAX_RETRIES = 5
273
+ BASE_WAIT = 10 # seconds
274
+ MAX_WAIT = 120 # seconds cap
 
275
 
276
  # Errors → retry same model with backoff
277
  RETRYABLE = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
 
309
  response = self.client.models.generate_content(
310
  model=model,
311
  contents=transcript,
312
+ config=types.GenerateContentConfig(
313
+ system_instruction=SYSTEM_PROMPT,
314
+ ),
315
  )
316
  logger.info(
317
  "βœ… Response received from: %s (attempt %d)",
 
328
  " [%d/%d] %s β€” quota/not-found, skipping to next model.",
329
  attempt, self.MAX_RETRIES, model,
330
  )
331
+ break
332
 
333
  elif any(k in err for k in self.RETRYABLE):
334
  if attempt < self.MAX_RETRIES:
335
  logger.warning(
336
+ " [%d/%d] %s β€” transient error. Retrying in %ds...",
 
337
  attempt, self.MAX_RETRIES, model, wait,
338
  )
339
  time.sleep(wait)
340
  wait = min(wait * 2, self.MAX_WAIT)
341
  else:
342
  logger.warning(
343
+ " [%d/%d] %s β€” max retries reached, trying next model.",
 
344
  attempt, self.MAX_RETRIES, model,
345
  )
346
 
 
369
  full, model_used = self._call_api(transcript)
370
  summary, qa = self._split(full)
371
 
372
+ self.summary_file.parent.mkdir(parents=True, exist_ok=True)
373
+ self.qa_file.parent.mkdir(parents=True, exist_ok=True)
374
+
375
  self.summary_file.write_text(summary, encoding="utf-8")
376
  self.qa_file.write_text(qa, encoding="utf-8")
377
 
 
429
 
430
  def main():
431
  if len(sys.argv) < 2:
432
+ print("Usage: python gemini_transcript.py <youtube_url>", file=sys.stderr)
433
  sys.exit(1)
434
 
435
  pipeline = TranscriptSummaryPipeline(
requirements.txt CHANGED
@@ -4,4 +4,6 @@ google-api-python-client
4
  google-auth-httplib2
5
  google-auth-oauthlib
6
  requests
7
- youtube_transcript_api
 
 
 
4
  google-auth-httplib2
5
  google-auth-oauthlib
6
  requests
7
+ youtube_transcript_api
8
+ google-generativeai
9
+ google-genai