rsnarsna committed on
Commit
7b71216
·
1 Parent(s): ee74594

refactor: Remove unused variables and functions in app.py and gemini_transcript.py; streamline file handling and improve memory usage

Browse files
Files changed (2) hide show
  1. app.py +31 -54
  2. gemini_transcript.py +32 -43
app.py CHANGED
@@ -5,7 +5,6 @@ import json
5
  import base64
6
  import hashlib
7
  import secrets
8
- import shutil
9
  import tempfile
10
  import threading
11
 
@@ -33,12 +32,12 @@ from gemini_transcript import TranscriptSummaryPipeline
33
  os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
34
 
35
  BASE_DIR = Path(__file__).resolve().parent
36
- CLIENT_SECRETS = os.getenv("CLIENT_SECRETS", str(BASE_DIR / "client_secret.json"))
37
- TOKEN_PATH = os.getenv("GOOGLE_OAUTH_TOKEN_PATH", str(BASE_DIR / "Google_oauth_token.json"))
38
- REDIRECT_URI = os.getenv("REDIRECT_URI", "http://localhost:8000/auth/callback")
39
  STATE_FILE = BASE_DIR / "oauth_states.json"
40
- DEFAULT_SPREADSHEET_ID = os.getenv("DEFAULT_SPREADSHEET_ID", "1XA3vW_guHBT-ktkYvhktmUqcquECBe8exGZAoSQS3Ag")
41
- DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID", "1hI6dNXysR_2p9gHkDpsI-iwMExmy2hhR")
42
 
43
  SCOPES = [
44
  "https://www.googleapis.com/auth/spreadsheets",
@@ -46,11 +45,6 @@ SCOPES = [
46
  "https://www.googleapis.com/auth/drive.file",
47
  ]
48
 
49
- OUTPUT_DIR = Path(".") / "output"
50
- TRANSCRIPT_FILE = OUTPUT_DIR / "transcript.txt"
51
- SUMMARY_FILE = OUTPUT_DIR / "summary.txt"
52
- QA_FILE = OUTPUT_DIR / "qa.txt"
53
-
54
  SHEETS_HEADERS = [
55
  "Timestamp", # A
56
  "Job ID", # B
@@ -267,30 +261,6 @@ def create_drive_folder(
267
  return folder["id"]
268
 
269
 
270
- def upload_file_to_drive(
271
- filepath: str,
272
- creds: Credentials = None,
273
- folder_id: str | None = None,
274
- make_public: bool = True,
275
- ) -> dict:
276
- if creds is None:
277
- creds = require_credentials()
278
- svc = _drive(creds)
279
- meta = {"name": os.path.basename(filepath)}
280
- if folder_id:
281
- meta["parents"] = [folder_id]
282
- media = MediaFileUpload(filepath, resumable=True)
283
- file = svc.files().create(
284
- body=meta, media_body=media,
285
- fields="id,name,webViewLink,webContentLink,mimeType,size",
286
- ).execute()
287
- file_id = file["id"]
288
- if make_public:
289
- file = _make_public(svc, file_id)
290
- file["direct_download_link"] = _direct_link(file_id)
291
- return file
292
-
293
-
294
  def create_file_on_drive(
295
  filename: str,
296
  content: str,
@@ -299,6 +269,10 @@ def create_file_on_drive(
299
  folder_id: str | None = None,
300
  make_public: bool = True,
301
  ) -> dict:
 
 
 
 
302
  if creds is None:
303
  creds = require_credentials()
304
  svc = _drive(creds)
@@ -873,26 +847,29 @@ def list_jobs():
873
  # PIPELINE BACKGROUND WORKER
874
  # ============================================================================
875
 
876
- def _upload_with_title(
877
- local_file: Path,
878
- drive_name: str,
879
  step_key: str,
880
  job_id: str,
881
  folder_id: str,
882
  creds: Credentials,
883
  ) -> dict:
884
- """Copy file with video-title name, upload to Drive, clean up."""
 
 
 
 
885
  _set_step(job_id, step_key, "running")
886
  try:
887
- tmp_path = local_file.parent / drive_name
888
- if tmp_path.exists():
889
- tmp_path.unlink()
890
- shutil.copy2(local_file, tmp_path)
891
- result = upload_file_to_drive(
892
- str(tmp_path), creds=creds,
893
- folder_id=folder_id, make_public=True,
894
  )
895
- tmp_path.unlink()
896
  _set_step(job_id, step_key, "done")
897
  return result
898
  except Exception as exc:
@@ -965,20 +942,20 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
965
  _set_step(job_id, "create_drive_folder", "failed")
966
  raise RuntimeError(f"Drive folder creation failed: {exc}")
967
 
968
- # ── STEP 4–6: Upload files ───────────────────────────────────────
969
  _update_job(job_id, status="uploading_drive")
970
  _update_sheet_record(job_id, creds, status="uploading_to_drive")
971
 
972
- summary_drive = _upload_with_title(
973
- SUMMARY_FILE, f"{video_title}__summary.txt", "upload_summary",
974
  job_id, folder_id, creds,
975
  )
976
- qa_drive = _upload_with_title(
977
- QA_FILE, f"{video_title}__qa.txt", "upload_qa",
978
  job_id, folder_id, creds,
979
  )
980
- transcript_drive = _upload_with_title(
981
- TRANSCRIPT_FILE, f"{video_title}__transcript.txt", "upload_transcript",
982
  job_id, folder_id, creds,
983
  )
984
 
 
5
  import base64
6
  import hashlib
7
  import secrets
 
8
  import tempfile
9
  import threading
10
 
 
32
  os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
33
 
34
  BASE_DIR = Path(__file__).resolve().parent
35
+ CLIENT_SECRETS = os.getenv("CLIENT_SECRETS", str(BASE_DIR / "client_secret.json"))
36
+ TOKEN_PATH = os.getenv("GOOGLE_OAUTH_TOKEN_PATH", str(BASE_DIR / "Google_oauth_token.json"))
37
+ REDIRECT_URI = os.getenv("REDIRECT_URI", "http://localhost:8000/auth/callback")
38
  STATE_FILE = BASE_DIR / "oauth_states.json"
39
+ DEFAULT_SPREADSHEET_ID = os.getenv("DEFAULT_SPREADSHEET_ID", "1XA3vW_guHBT-ktkYvhktmUqcquECBe8exGZAoSQS3Ag")
40
+ DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID", "1hI6dNXysR_2p9gHkDpsI-iwMExmy2hhR")
41
 
42
  SCOPES = [
43
  "https://www.googleapis.com/auth/spreadsheets",
 
45
  "https://www.googleapis.com/auth/drive.file",
46
  ]
47
 
 
 
 
 
 
48
  SHEETS_HEADERS = [
49
  "Timestamp", # A
50
  "Job ID", # B
 
261
  return folder["id"]
262
 
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  def create_file_on_drive(
265
  filename: str,
266
  content: str,
 
269
  folder_id: str | None = None,
270
  make_public: bool = True,
271
  ) -> dict:
272
+ """
273
+ Upload a string as a Drive file via a temporary file.
274
+ No permanent local files are created.
275
+ """
276
  if creds is None:
277
  creds = require_credentials()
278
  svc = _drive(creds)
 
847
  # PIPELINE BACKGROUND WORKER
848
  # ============================================================================
849
 
850
+ def _upload_content(
851
+ content: str,
852
+ filename: str,
853
  step_key: str,
854
  job_id: str,
855
  folder_id: str,
856
  creds: Credentials,
857
  ) -> dict:
858
+ """
859
+ Upload a string directly to Drive as a text file.
860
+ Uses create_file_on_drive which handles its own temp file internally.
861
+ No permanent local files are created.
862
+ """
863
  _set_step(job_id, step_key, "running")
864
  try:
865
+ result = create_file_on_drive(
866
+ filename=filename,
867
+ content=content,
868
+ mimetype="text/plain",
869
+ creds=creds,
870
+ folder_id=folder_id,
871
+ make_public=True,
872
  )
 
873
  _set_step(job_id, step_key, "done")
874
  return result
875
  except Exception as exc:
 
942
  _set_step(job_id, "create_drive_folder", "failed")
943
  raise RuntimeError(f"Drive folder creation failed: {exc}")
944
 
945
+ # ── STEP 4–6: Upload content strings directly to Drive ───────────
946
  _update_job(job_id, status="uploading_drive")
947
  _update_sheet_record(job_id, creds, status="uploading_to_drive")
948
 
949
+ summary_drive = _upload_content(
950
+ summary, f"{video_title}__summary.txt", "upload_summary",
951
  job_id, folder_id, creds,
952
  )
953
+ qa_drive = _upload_content(
954
+ qa, f"{video_title}__qa.txt", "upload_qa",
955
  job_id, folder_id, creds,
956
  )
957
+ transcript_drive = _upload_content(
958
+ transcript, f"{video_title}__transcript.txt", "upload_transcript",
959
  job_id, folder_id, creds,
960
  )
961
 
gemini_transcript.py CHANGED
@@ -8,11 +8,10 @@ import json
8
  import logging
9
  import time
10
 
11
- from pathlib import Path
12
  from typing import Optional, List
13
  from urllib.parse import urlparse, parse_qs
14
 
15
- from google import genai # pip install google-genai
16
  from google.genai import types
17
 
18
  from youtube_transcript_api import (
@@ -27,12 +26,7 @@ from youtube_transcript_api import (
27
  # CONFIG
28
  # ============================================================================
29
 
30
- BASE_DIR = Path(".")
31
- TRANSCRIPT_FILE = BASE_DIR / "output" / "transcript.txt"
32
- SUMMARY_FILE = BASE_DIR / "output" / "summary.txt"
33
- QA_FILE = BASE_DIR / "output" / "qa.txt"
34
-
35
- GEMINI_API_KEY = "AIzaSyCNz5wQAyJ65kNRkwr0-1A-_Z6-lQzdcyc"
36
 
37
  GEMINI_MODELS = [
38
  "gemini-2.5-flash",
@@ -156,17 +150,18 @@ def fetch_video_title(video_id: str) -> str:
156
  # ============================================================================
157
 
158
  class YouTubeTranscriptFetcher:
159
- """Fetches YouTube transcript with polling retry for new uploads."""
 
 
 
160
 
161
  def __init__(
162
  self,
163
  youtube_url: str,
164
- output_file: Path = TRANSCRIPT_FILE,
165
  languages: Optional[List[str]] = None,
166
  polling_config: dict = None,
167
  ):
168
  self.youtube_url = youtube_url
169
- self.output_file = Path(output_file)
170
  self.languages = languages or ["en", "en-US", "en-GB"]
171
  self.polling_config = polling_config or POLLING_CONFIG
172
  self.video_id = self._extract_video_id(youtube_url)
@@ -191,14 +186,13 @@ class YouTubeTranscriptFetcher:
191
  transcript = self.api.fetch(self.video_id, languages=self.languages)
192
  return " ".join(item.text for item in transcript)
193
 
194
- def _save(self, text: str) -> None:
195
- self.output_file.parent.mkdir(parents=True, exist_ok=True)
196
- self.output_file.write_text(text, encoding="utf-8")
197
-
198
  def run(self) -> str:
199
- logger.info("Video ID : %s", self.video_id)
200
- logger.info("Output file : %s", self.output_file)
201
- logger.info("Total polling attempts: %d", len(self.polling_config))
 
 
 
202
 
203
  attempts = list(self.polling_config.items())
204
 
@@ -221,7 +215,6 @@ class YouTubeTranscriptFetcher:
221
 
222
  try:
223
  text = self._fetch_once()
224
- self._save(text)
225
  logger.info(
226
  "[%d/%d] ✅ Transcript fetched — %d characters",
227
  idx, len(attempts), len(text),
@@ -267,34 +260,32 @@ class YouTubeTranscriptFetcher:
267
  # ============================================================================
268
 
269
  class GeminiSummarizer:
270
- """Sends transcript to Gemini with model fallback + per-model retry."""
 
 
 
271
 
272
  MAX_RETRIES = 5
273
  BASE_WAIT = 10 # seconds
274
  MAX_WAIT = 120 # seconds cap
275
 
276
- # Errors → retry same model with backoff
277
  RETRYABLE = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
278
  # Errors → skip to next model immediately
279
  SKIP_TO_NEXT = ["429", "RESOURCE_EXHAUSTED", "quota", "404", "NOT_FOUND"]
280
 
281
  def __init__(
282
  self,
283
- api_key: str = GEMINI_API_KEY,
284
- models: list = None,
285
- summary_file: Path = SUMMARY_FILE,
286
- qa_file: Path = QA_FILE,
287
  ):
288
- self.client = genai.Client(api_key=api_key)
289
- self.models = models or GEMINI_MODELS
290
- self.summary_file = Path(summary_file)
291
- self.qa_file = Path(qa_file)
292
 
293
  def _call_api(self, transcript: str) -> tuple[str, str]:
294
  """
295
- Try each model in order.
296
- Per model: retry up to MAX_RETRIES on transient errors with backoff.
297
- Returns (response_text, model_used).
298
  """
299
  overall_last_error = None
300
 
@@ -366,18 +357,13 @@ class GeminiSummarizer:
366
  return full_text.strip(), ""
367
 
368
  def run(self, transcript: str) -> tuple[str, str, str]:
 
 
 
 
369
  full, model_used = self._call_api(transcript)
370
  summary, qa = self._split(full)
371
-
372
- self.summary_file.parent.mkdir(parents=True, exist_ok=True)
373
- self.qa_file.parent.mkdir(parents=True, exist_ok=True)
374
-
375
- self.summary_file.write_text(summary, encoding="utf-8")
376
- self.qa_file.write_text(qa, encoding="utf-8")
377
-
378
- logger.info("Summary saved → %s", self.summary_file)
379
- logger.info("Q&A saved → %s", self.qa_file)
380
-
381
  return summary, qa, model_used
382
 
383
 
@@ -386,6 +372,10 @@ class GeminiSummarizer:
386
  # ============================================================================
387
 
388
  class TranscriptSummaryPipeline:
 
 
 
 
389
 
390
  def __init__(
391
  self,
@@ -396,7 +386,6 @@ class TranscriptSummaryPipeline:
396
  self.youtube_url = youtube_url
397
  self.fetcher = YouTubeTranscriptFetcher(
398
  youtube_url=youtube_url,
399
- output_file=TRANSCRIPT_FILE,
400
  languages=languages,
401
  polling_config=polling_config,
402
  )
 
8
  import logging
9
  import time
10
 
 
11
  from typing import Optional, List
12
  from urllib.parse import urlparse, parse_qs
13
 
14
+ from google import genai # pip install google-genai
15
  from google.genai import types
16
 
17
  from youtube_transcript_api import (
 
26
  # CONFIG
27
  # ============================================================================
28
 
29
+ GEMINI_API_KEY = "AIzaSyCNz5wQAyJ65kNRkwr0-1A-_Z6-lQzdcyc"
 
 
 
 
 
30
 
31
  GEMINI_MODELS = [
32
  "gemini-2.5-flash",
 
150
  # ============================================================================
151
 
152
  class YouTubeTranscriptFetcher:
153
+ """
154
+ Fetches a YouTube transcript and returns it as a plain string.
155
+ No files are written to disk.
156
+ """
157
 
158
  def __init__(
159
  self,
160
  youtube_url: str,
 
161
  languages: Optional[List[str]] = None,
162
  polling_config: dict = None,
163
  ):
164
  self.youtube_url = youtube_url
 
165
  self.languages = languages or ["en", "en-US", "en-GB"]
166
  self.polling_config = polling_config or POLLING_CONFIG
167
  self.video_id = self._extract_video_id(youtube_url)
 
186
  transcript = self.api.fetch(self.video_id, languages=self.languages)
187
  return " ".join(item.text for item in transcript)
188
 
 
 
 
 
189
  def run(self) -> str:
190
+ """
191
+ Fetch transcript with polling retry.
192
+ Returns transcript as a string — nothing is written to disk.
193
+ """
194
+ logger.info("Video ID : %s", self.video_id)
195
+ logger.info("Polling attempts : %d", len(self.polling_config))
196
 
197
  attempts = list(self.polling_config.items())
198
 
 
215
 
216
  try:
217
  text = self._fetch_once()
 
218
  logger.info(
219
  "[%d/%d] ✅ Transcript fetched — %d characters",
220
  idx, len(attempts), len(text),
 
260
  # ============================================================================
261
 
262
  class GeminiSummarizer:
263
+ """
264
+ Sends transcript to Gemini and returns (summary, qa, model_used).
265
+ No files are written to disk.
266
+ """
267
 
268
  MAX_RETRIES = 5
269
  BASE_WAIT = 10 # seconds
270
  MAX_WAIT = 120 # seconds cap
271
 
272
+ # Errors → retry same model with exponential backoff
273
  RETRYABLE = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
274
  # Errors → skip to next model immediately
275
  SKIP_TO_NEXT = ["429", "RESOURCE_EXHAUSTED", "quota", "404", "NOT_FOUND"]
276
 
277
  def __init__(
278
  self,
279
+ api_key: str = GEMINI_API_KEY,
280
+ models: list = None,
 
 
281
  ):
282
+ self.client = genai.Client(api_key=api_key)
283
+ self.models = models or GEMINI_MODELS
 
 
284
 
285
  def _call_api(self, transcript: str) -> tuple[str, str]:
286
  """
287
+ Try each model in order with per-model retry + backoff.
288
+ Returns (full_response_text, model_used).
 
289
  """
290
  overall_last_error = None
291
 
 
357
  return full_text.strip(), ""
358
 
359
  def run(self, transcript: str) -> tuple[str, str, str]:
360
+ """
361
+ Summarize transcript.
362
+ Returns (summary, qa, model_used) — nothing is written to disk.
363
+ """
364
  full, model_used = self._call_api(transcript)
365
  summary, qa = self._split(full)
366
+ logger.info("✅ Summarization complete — model: %s", model_used)
 
 
 
 
 
 
 
 
 
367
  return summary, qa, model_used
368
 
369
 
 
372
  # ============================================================================
373
 
374
  class TranscriptSummaryPipeline:
375
+ """
376
+ Orchestrates fetch → summarize.
377
+ All data flows in memory — no disk I/O.
378
+ """
379
 
380
  def __init__(
381
  self,
 
386
  self.youtube_url = youtube_url
387
  self.fetcher = YouTubeTranscriptFetcher(
388
  youtube_url=youtube_url,
 
389
  languages=languages,
390
  polling_config=polling_config,
391
  )