Spaces:

Rsnarsna
/

transcript

Sleeping

App Files Files Community

rsnarsna committed about 1 month ago

Commit

7b71216

1 Parent(s): ee74594

refactor: Remove unused variables and functions in app.py and gemini_transcript.py; streamline file handling and improve memory usage

Browse files

Files changed (2) hide show

app.py +31 -54
gemini_transcript.py +32 -43

app.py CHANGED Viewed

@@ -5,7 +5,6 @@ import json
 import base64
 import hashlib
 import secrets
-import shutil
 import tempfile
 import threading
@@ -33,12 +32,12 @@ from gemini_transcript import TranscriptSummaryPipeline
 os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
 BASE_DIR                = Path(__file__).resolve().parent
-CLIENT_SECRETS          = os.getenv("CLIENT_SECRETS",           str(BASE_DIR / "client_secret.json"))
-TOKEN_PATH              = os.getenv("GOOGLE_OAUTH_TOKEN_PATH",   str(BASE_DIR / "Google_oauth_token.json"))
-REDIRECT_URI            = os.getenv("REDIRECT_URI",              "http://localhost:8000/auth/callback")
 STATE_FILE              = BASE_DIR / "oauth_states.json"
-DEFAULT_SPREADSHEET_ID  = os.getenv("DEFAULT_SPREADSHEET_ID",   "1XA3vW_guHBT-ktkYvhktmUqcquECBe8exGZAoSQS3Ag")
-DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID",  "1hI6dNXysR_2p9gHkDpsI-iwMExmy2hhR")
 SCOPES = [
     "https://www.googleapis.com/auth/spreadsheets",
@@ -46,11 +45,6 @@ SCOPES = [
     "https://www.googleapis.com/auth/drive.file",
 ]
-OUTPUT_DIR      = Path(".") / "output"
-TRANSCRIPT_FILE = OUTPUT_DIR / "transcript.txt"
-SUMMARY_FILE    = OUTPUT_DIR / "summary.txt"
-QA_FILE         = OUTPUT_DIR / "qa.txt"
 SHEETS_HEADERS = [
     "Timestamp",             # A
     "Job ID",                # B
@@ -267,30 +261,6 @@ def create_drive_folder(
     return folder["id"]
-def upload_file_to_drive(
-    filepath: str,
-    creds: Credentials    = None,
-    folder_id: str | None = None,
-    make_public: bool     = True,
-) -> dict:
-    if creds is None:
-        creds = require_credentials()
-    svc  = _drive(creds)
-    meta = {"name": os.path.basename(filepath)}
-    if folder_id:
-        meta["parents"] = [folder_id]
-    media   = MediaFileUpload(filepath, resumable=True)
-    file    = svc.files().create(
-        body=meta, media_body=media,
-        fields="id,name,webViewLink,webContentLink,mimeType,size",
-    ).execute()
-    file_id = file["id"]
-    if make_public:
-        file = _make_public(svc, file_id)
-    file["direct_download_link"] = _direct_link(file_id)
-    return file
 def create_file_on_drive(
     filename: str,
     content: str,
@@ -299,6 +269,10 @@ def create_file_on_drive(
     folder_id: str | None = None,
     make_public: bool     = True,
 ) -> dict:
     if creds is None:
         creds = require_credentials()
     svc  = _drive(creds)
@@ -873,26 +847,29 @@ def list_jobs():
 # PIPELINE BACKGROUND WORKER
 # ============================================================================
-def _upload_with_title(
-    local_file: Path,
-    drive_name: str,
     step_key: str,
     job_id: str,
     folder_id: str,
     creds: Credentials,
 ) -> dict:
-    """Copy file with video-title name, upload to Drive, clean up."""
     _set_step(job_id, step_key, "running")
     try:
-        tmp_path = local_file.parent / drive_name
-        if tmp_path.exists():
-            tmp_path.unlink()
-        shutil.copy2(local_file, tmp_path)
-        result = upload_file_to_drive(
-            str(tmp_path), creds=creds,
-            folder_id=folder_id, make_public=True,
         )
-        tmp_path.unlink()
         _set_step(job_id, step_key, "done")
         return result
     except Exception as exc:
@@ -965,20 +942,20 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
             _set_step(job_id, "create_drive_folder", "failed")
             raise RuntimeError(f"Drive folder creation failed: {exc}")
-        # ── STEP 4–6: Upload files ───────────────────────────────────────
         _update_job(job_id, status="uploading_drive")
         _update_sheet_record(job_id, creds, status="uploading_to_drive")
-        summary_drive    = _upload_with_title(
-            SUMMARY_FILE,    f"{video_title}__summary.txt",    "upload_summary",
             job_id, folder_id, creds,
         )
-        qa_drive         = _upload_with_title(
-            QA_FILE,         f"{video_title}__qa.txt",         "upload_qa",
             job_id, folder_id, creds,
         )
-        transcript_drive = _upload_with_title(
-            TRANSCRIPT_FILE, f"{video_title}__transcript.txt", "upload_transcript",
             job_id, folder_id, creds,
         )

 import base64
 import hashlib
 import secrets
 import tempfile
 import threading
 os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
 BASE_DIR                = Path(__file__).resolve().parent
+CLIENT_SECRETS          = os.getenv("CLIENT_SECRETS",          str(BASE_DIR / "client_secret.json"))
+TOKEN_PATH              = os.getenv("GOOGLE_OAUTH_TOKEN_PATH",  str(BASE_DIR / "Google_oauth_token.json"))
+REDIRECT_URI            = os.getenv("REDIRECT_URI",             "http://localhost:8000/auth/callback")
 STATE_FILE              = BASE_DIR / "oauth_states.json"
+DEFAULT_SPREADSHEET_ID  = os.getenv("DEFAULT_SPREADSHEET_ID",  "1XA3vW_guHBT-ktkYvhktmUqcquECBe8exGZAoSQS3Ag")
+DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID", "1hI6dNXysR_2p9gHkDpsI-iwMExmy2hhR")
 SCOPES = [
     "https://www.googleapis.com/auth/spreadsheets",
     "https://www.googleapis.com/auth/drive.file",
 ]
 SHEETS_HEADERS = [
     "Timestamp",             # A
     "Job ID",                # B
     return folder["id"]
 def create_file_on_drive(
     filename: str,
     content: str,
     folder_id: str | None = None,
     make_public: bool     = True,
 ) -> dict:
+    """
+    Upload a string as a Drive file via a temporary file.
+    No permanent local files are created.
+    """
     if creds is None:
         creds = require_credentials()
     svc  = _drive(creds)
 # PIPELINE BACKGROUND WORKER
 # ============================================================================
+def _upload_content(
+    content: str,
+    filename: str,
     step_key: str,
     job_id: str,
     folder_id: str,
     creds: Credentials,
 ) -> dict:
+    """
+    Upload a string directly to Drive as a text file.
+    Uses create_file_on_drive which handles its own temp file internally.
+    No permanent local files are created.
+    """
     _set_step(job_id, step_key, "running")
     try:
+        result = create_file_on_drive(
+            filename=filename,
+            content=content,
+            mimetype="text/plain",
+            creds=creds,
+            folder_id=folder_id,
+            make_public=True,
         )
         _set_step(job_id, step_key, "done")
         return result
     except Exception as exc:
             _set_step(job_id, "create_drive_folder", "failed")
             raise RuntimeError(f"Drive folder creation failed: {exc}")
+        # ── STEP 4–6: Upload content strings directly to Drive ───────────
         _update_job(job_id, status="uploading_drive")
         _update_sheet_record(job_id, creds, status="uploading_to_drive")
+        summary_drive    = _upload_content(
+            summary,    f"{video_title}__summary.txt",    "upload_summary",
             job_id, folder_id, creds,
         )
+        qa_drive         = _upload_content(
+            qa,         f"{video_title}__qa.txt",         "upload_qa",
             job_id, folder_id, creds,
         )
+        transcript_drive = _upload_content(
+            transcript, f"{video_title}__transcript.txt", "upload_transcript",
             job_id, folder_id, creds,
         )

gemini_transcript.py CHANGED Viewed

@@ -8,11 +8,10 @@ import json
 import logging
 import time
-from pathlib import Path
 from typing import Optional, List
 from urllib.parse import urlparse, parse_qs
-from google import genai                          # pip install google-genai
 from google.genai import types
 from youtube_transcript_api import (
@@ -27,12 +26,7 @@ from youtube_transcript_api import (
 # CONFIG
 # ============================================================================
-BASE_DIR        = Path(".")
-TRANSCRIPT_FILE = BASE_DIR / "output" / "transcript.txt"
-SUMMARY_FILE    = BASE_DIR / "output" / "summary.txt"
-QA_FILE         = BASE_DIR / "output" / "qa.txt"
-GEMINI_API_KEY  = "AIzaSyCNz5wQAyJ65kNRkwr0-1A-_Z6-lQzdcyc"
 GEMINI_MODELS = [
     "gemini-2.5-flash",
@@ -156,17 +150,18 @@ def fetch_video_title(video_id: str) -> str:
 # ============================================================================
 class YouTubeTranscriptFetcher:
-    """Fetches YouTube transcript with polling retry for new uploads."""
     def __init__(
         self,
         youtube_url: str,
-        output_file: Path              = TRANSCRIPT_FILE,
         languages: Optional[List[str]] = None,
         polling_config: dict           = None,
     ):
         self.youtube_url    = youtube_url
-        self.output_file    = Path(output_file)
         self.languages      = languages or ["en", "en-US", "en-GB"]
         self.polling_config = polling_config or POLLING_CONFIG
         self.video_id       = self._extract_video_id(youtube_url)
@@ -191,14 +186,13 @@ class YouTubeTranscriptFetcher:
         transcript = self.api.fetch(self.video_id, languages=self.languages)
         return " ".join(item.text for item in transcript)
-    def _save(self, text: str) -> None:
-        self.output_file.parent.mkdir(parents=True, exist_ok=True)
-        self.output_file.write_text(text, encoding="utf-8")
     def run(self) -> str:
-        logger.info("Video ID    : %s", self.video_id)
-        logger.info("Output file : %s", self.output_file)
-        logger.info("Total polling attempts: %d", len(self.polling_config))
         attempts = list(self.polling_config.items())
@@ -221,7 +215,6 @@ class YouTubeTranscriptFetcher:
             try:
                 text = self._fetch_once()
-                self._save(text)
                 logger.info(
                     "[%d/%d] ✅ Transcript fetched — %d characters",
                     idx, len(attempts), len(text),
@@ -267,34 +260,32 @@ class YouTubeTranscriptFetcher:
 # ============================================================================
 class GeminiSummarizer:
-    """Sends transcript to Gemini with model fallback + per-model retry."""
     MAX_RETRIES = 5
     BASE_WAIT   = 10    # seconds
     MAX_WAIT    = 120   # seconds cap
-    # Errors → retry same model with backoff
     RETRYABLE    = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
     # Errors → skip to next model immediately
     SKIP_TO_NEXT = ["429", "RESOURCE_EXHAUSTED", "quota", "404", "NOT_FOUND"]
     def __init__(
         self,
-        api_key: str       = GEMINI_API_KEY,
-        models: list       = None,
-        summary_file: Path = SUMMARY_FILE,
-        qa_file: Path      = QA_FILE,
     ):
-        self.client       = genai.Client(api_key=api_key)
-        self.models       = models or GEMINI_MODELS
-        self.summary_file = Path(summary_file)
-        self.qa_file      = Path(qa_file)
     def _call_api(self, transcript: str) -> tuple[str, str]:
         """
-        Try each model in order.
-        Per model: retry up to MAX_RETRIES on transient errors with backoff.
-        Returns (response_text, model_used).
         """
         overall_last_error = None
@@ -366,18 +357,13 @@ class GeminiSummarizer:
         return full_text.strip(), ""
     def run(self, transcript: str) -> tuple[str, str, str]:
         full, model_used = self._call_api(transcript)
         summary, qa      = self._split(full)
-        self.summary_file.parent.mkdir(parents=True, exist_ok=True)
-        self.qa_file.parent.mkdir(parents=True, exist_ok=True)
-        self.summary_file.write_text(summary, encoding="utf-8")
-        self.qa_file.write_text(qa,           encoding="utf-8")
-        logger.info("Summary saved → %s", self.summary_file)
-        logger.info("Q&A saved    → %s", self.qa_file)
         return summary, qa, model_used
@@ -386,6 +372,10 @@ class GeminiSummarizer:
 # ============================================================================
 class TranscriptSummaryPipeline:
     def __init__(
         self,
@@ -396,7 +386,6 @@ class TranscriptSummaryPipeline:
         self.youtube_url = youtube_url
         self.fetcher     = YouTubeTranscriptFetcher(
             youtube_url=youtube_url,
-            output_file=TRANSCRIPT_FILE,
             languages=languages,
             polling_config=polling_config,
         )

 import logging
 import time
 from typing import Optional, List
 from urllib.parse import urlparse, parse_qs
+from google import genai                 # pip install google-genai
 from google.genai import types
 from youtube_transcript_api import (
 # CONFIG
 # ============================================================================
+GEMINI_API_KEY = "AIzaSyCNz5wQAyJ65kNRkwr0-1A-_Z6-lQzdcyc"
 GEMINI_MODELS = [
     "gemini-2.5-flash",
 # ============================================================================
 class YouTubeTranscriptFetcher:
+    """
+    Fetches a YouTube transcript and returns it as a plain string.
+    No files are written to disk.
+    """
     def __init__(
         self,
         youtube_url: str,
         languages: Optional[List[str]] = None,
         polling_config: dict           = None,
     ):
         self.youtube_url    = youtube_url
         self.languages      = languages or ["en", "en-US", "en-GB"]
         self.polling_config = polling_config or POLLING_CONFIG
         self.video_id       = self._extract_video_id(youtube_url)
         transcript = self.api.fetch(self.video_id, languages=self.languages)
         return " ".join(item.text for item in transcript)
     def run(self) -> str:
+        """
+        Fetch transcript with polling retry.
+        Returns transcript as a string — nothing is written to disk.
+        """
+        logger.info("Video ID          : %s", self.video_id)
+        logger.info("Polling attempts  : %d", len(self.polling_config))
         attempts = list(self.polling_config.items())
             try:
                 text = self._fetch_once()
                 logger.info(
                     "[%d/%d] ✅ Transcript fetched — %d characters",
                     idx, len(attempts), len(text),
 # ============================================================================
 class GeminiSummarizer:
+    """
+    Sends transcript to Gemini and returns (summary, qa, model_used).
+    No files are written to disk.
+    """
     MAX_RETRIES = 5
     BASE_WAIT   = 10    # seconds
     MAX_WAIT    = 120   # seconds cap
+    # Errors → retry same model with exponential backoff
     RETRYABLE    = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
     # Errors → skip to next model immediately
     SKIP_TO_NEXT = ["429", "RESOURCE_EXHAUSTED", "quota", "404", "NOT_FOUND"]
     def __init__(
         self,
+        api_key: str = GEMINI_API_KEY,
+        models: list = None,
     ):
+        self.client = genai.Client(api_key=api_key)
+        self.models = models or GEMINI_MODELS
     def _call_api(self, transcript: str) -> tuple[str, str]:
         """
+        Try each model in order with per-model retry + backoff.
+        Returns (full_response_text, model_used).
         """
         overall_last_error = None
         return full_text.strip(), ""
     def run(self, transcript: str) -> tuple[str, str, str]:
+        """
+        Summarize transcript.
+        Returns (summary, qa, model_used) — nothing is written to disk.
+        """
         full, model_used = self._call_api(transcript)
         summary, qa      = self._split(full)
+        logger.info("✅ Summarization complete — model: %s", model_used)
         return summary, qa, model_used
 # ============================================================================
 class TranscriptSummaryPipeline:
+    """
+    Orchestrates fetch → summarize.
+    All data flows in memory — no disk I/O.
+    """
     def __init__(
         self,
         self.youtube_url = youtube_url
         self.fetcher     = YouTubeTranscriptFetcher(
             youtube_url=youtube_url,
             languages=languages,
             polling_config=polling_config,
         )