Spaces:
Sleeping
Sleeping
rsnarsna committed on
Commit ·
7b71216
1
Parent(s): ee74594
refactor: Remove unused variables and functions in app.py and gemini_transcript.py; streamline file handling and improve memory usage
Browse files- app.py +31 -54
- gemini_transcript.py +32 -43
app.py
CHANGED
|
@@ -5,7 +5,6 @@ import json
|
|
| 5 |
import base64
|
| 6 |
import hashlib
|
| 7 |
import secrets
|
| 8 |
-
import shutil
|
| 9 |
import tempfile
|
| 10 |
import threading
|
| 11 |
|
|
@@ -33,12 +32,12 @@ from gemini_transcript import TranscriptSummaryPipeline
|
|
| 33 |
os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
|
| 34 |
|
| 35 |
BASE_DIR = Path(__file__).resolve().parent
|
| 36 |
-
CLIENT_SECRETS = os.getenv("CLIENT_SECRETS",
|
| 37 |
-
TOKEN_PATH = os.getenv("GOOGLE_OAUTH_TOKEN_PATH",
|
| 38 |
-
REDIRECT_URI = os.getenv("REDIRECT_URI",
|
| 39 |
STATE_FILE = BASE_DIR / "oauth_states.json"
|
| 40 |
-
DEFAULT_SPREADSHEET_ID = os.getenv("DEFAULT_SPREADSHEET_ID",
|
| 41 |
-
DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID",
|
| 42 |
|
| 43 |
SCOPES = [
|
| 44 |
"https://www.googleapis.com/auth/spreadsheets",
|
|
@@ -46,11 +45,6 @@ SCOPES = [
|
|
| 46 |
"https://www.googleapis.com/auth/drive.file",
|
| 47 |
]
|
| 48 |
|
| 49 |
-
OUTPUT_DIR = Path(".") / "output"
|
| 50 |
-
TRANSCRIPT_FILE = OUTPUT_DIR / "transcript.txt"
|
| 51 |
-
SUMMARY_FILE = OUTPUT_DIR / "summary.txt"
|
| 52 |
-
QA_FILE = OUTPUT_DIR / "qa.txt"
|
| 53 |
-
|
| 54 |
SHEETS_HEADERS = [
|
| 55 |
"Timestamp", # A
|
| 56 |
"Job ID", # B
|
|
@@ -267,30 +261,6 @@ def create_drive_folder(
|
|
| 267 |
return folder["id"]
|
| 268 |
|
| 269 |
|
| 270 |
-
def upload_file_to_drive(
|
| 271 |
-
filepath: str,
|
| 272 |
-
creds: Credentials = None,
|
| 273 |
-
folder_id: str | None = None,
|
| 274 |
-
make_public: bool = True,
|
| 275 |
-
) -> dict:
|
| 276 |
-
if creds is None:
|
| 277 |
-
creds = require_credentials()
|
| 278 |
-
svc = _drive(creds)
|
| 279 |
-
meta = {"name": os.path.basename(filepath)}
|
| 280 |
-
if folder_id:
|
| 281 |
-
meta["parents"] = [folder_id]
|
| 282 |
-
media = MediaFileUpload(filepath, resumable=True)
|
| 283 |
-
file = svc.files().create(
|
| 284 |
-
body=meta, media_body=media,
|
| 285 |
-
fields="id,name,webViewLink,webContentLink,mimeType,size",
|
| 286 |
-
).execute()
|
| 287 |
-
file_id = file["id"]
|
| 288 |
-
if make_public:
|
| 289 |
-
file = _make_public(svc, file_id)
|
| 290 |
-
file["direct_download_link"] = _direct_link(file_id)
|
| 291 |
-
return file
|
| 292 |
-
|
| 293 |
-
|
| 294 |
def create_file_on_drive(
|
| 295 |
filename: str,
|
| 296 |
content: str,
|
|
@@ -299,6 +269,10 @@ def create_file_on_drive(
|
|
| 299 |
folder_id: str | None = None,
|
| 300 |
make_public: bool = True,
|
| 301 |
) -> dict:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
if creds is None:
|
| 303 |
creds = require_credentials()
|
| 304 |
svc = _drive(creds)
|
|
@@ -873,26 +847,29 @@ def list_jobs():
|
|
| 873 |
# PIPELINE BACKGROUND WORKER
|
| 874 |
# ============================================================================
|
| 875 |
|
| 876 |
-
def
|
| 877 |
-
|
| 878 |
-
|
| 879 |
step_key: str,
|
| 880 |
job_id: str,
|
| 881 |
folder_id: str,
|
| 882 |
creds: Credentials,
|
| 883 |
) -> dict:
|
| 884 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 885 |
_set_step(job_id, step_key, "running")
|
| 886 |
try:
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
|
| 890 |
-
|
| 891 |
-
|
| 892 |
-
|
| 893 |
-
|
| 894 |
)
|
| 895 |
-
tmp_path.unlink()
|
| 896 |
_set_step(job_id, step_key, "done")
|
| 897 |
return result
|
| 898 |
except Exception as exc:
|
|
@@ -965,20 +942,20 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
|
|
| 965 |
_set_step(job_id, "create_drive_folder", "failed")
|
| 966 |
raise RuntimeError(f"Drive folder creation failed: {exc}")
|
| 967 |
|
| 968 |
-
# ── STEP 4–6: Upload
|
| 969 |
_update_job(job_id, status="uploading_drive")
|
| 970 |
_update_sheet_record(job_id, creds, status="uploading_to_drive")
|
| 971 |
|
| 972 |
-
summary_drive =
|
| 973 |
-
|
| 974 |
job_id, folder_id, creds,
|
| 975 |
)
|
| 976 |
-
qa_drive =
|
| 977 |
-
|
| 978 |
job_id, folder_id, creds,
|
| 979 |
)
|
| 980 |
-
transcript_drive =
|
| 981 |
-
|
| 982 |
job_id, folder_id, creds,
|
| 983 |
)
|
| 984 |
|
|
|
|
| 5 |
import base64
|
| 6 |
import hashlib
|
| 7 |
import secrets
|
|
|
|
| 8 |
import tempfile
|
| 9 |
import threading
|
| 10 |
|
|
|
|
| 32 |
os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
|
| 33 |
|
| 34 |
BASE_DIR = Path(__file__).resolve().parent
|
| 35 |
+
CLIENT_SECRETS = os.getenv("CLIENT_SECRETS", str(BASE_DIR / "client_secret.json"))
|
| 36 |
+
TOKEN_PATH = os.getenv("GOOGLE_OAUTH_TOKEN_PATH", str(BASE_DIR / "Google_oauth_token.json"))
|
| 37 |
+
REDIRECT_URI = os.getenv("REDIRECT_URI", "http://localhost:8000/auth/callback")
|
| 38 |
STATE_FILE = BASE_DIR / "oauth_states.json"
|
| 39 |
+
DEFAULT_SPREADSHEET_ID = os.getenv("DEFAULT_SPREADSHEET_ID", "1XA3vW_guHBT-ktkYvhktmUqcquECBe8exGZAoSQS3Ag")
|
| 40 |
+
DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID", "1hI6dNXysR_2p9gHkDpsI-iwMExmy2hhR")
|
| 41 |
|
| 42 |
SCOPES = [
|
| 43 |
"https://www.googleapis.com/auth/spreadsheets",
|
|
|
|
| 45 |
"https://www.googleapis.com/auth/drive.file",
|
| 46 |
]
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
SHEETS_HEADERS = [
|
| 49 |
"Timestamp", # A
|
| 50 |
"Job ID", # B
|
|
|
|
| 261 |
return folder["id"]
|
| 262 |
|
| 263 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
def create_file_on_drive(
|
| 265 |
filename: str,
|
| 266 |
content: str,
|
|
|
|
| 269 |
folder_id: str | None = None,
|
| 270 |
make_public: bool = True,
|
| 271 |
) -> dict:
|
| 272 |
+
"""
|
| 273 |
+
Upload a string as a Drive file via a temporary file.
|
| 274 |
+
No permanent local files are created.
|
| 275 |
+
"""
|
| 276 |
if creds is None:
|
| 277 |
creds = require_credentials()
|
| 278 |
svc = _drive(creds)
|
|
|
|
| 847 |
# PIPELINE BACKGROUND WORKER
|
| 848 |
# ============================================================================
|
| 849 |
|
| 850 |
+
def _upload_content(
|
| 851 |
+
content: str,
|
| 852 |
+
filename: str,
|
| 853 |
step_key: str,
|
| 854 |
job_id: str,
|
| 855 |
folder_id: str,
|
| 856 |
creds: Credentials,
|
| 857 |
) -> dict:
|
| 858 |
+
"""
|
| 859 |
+
Upload a string directly to Drive as a text file.
|
| 860 |
+
Uses create_file_on_drive which handles its own temp file internally.
|
| 861 |
+
No permanent local files are created.
|
| 862 |
+
"""
|
| 863 |
_set_step(job_id, step_key, "running")
|
| 864 |
try:
|
| 865 |
+
result = create_file_on_drive(
|
| 866 |
+
filename=filename,
|
| 867 |
+
content=content,
|
| 868 |
+
mimetype="text/plain",
|
| 869 |
+
creds=creds,
|
| 870 |
+
folder_id=folder_id,
|
| 871 |
+
make_public=True,
|
| 872 |
)
|
|
|
|
| 873 |
_set_step(job_id, step_key, "done")
|
| 874 |
return result
|
| 875 |
except Exception as exc:
|
|
|
|
| 942 |
_set_step(job_id, "create_drive_folder", "failed")
|
| 943 |
raise RuntimeError(f"Drive folder creation failed: {exc}")
|
| 944 |
|
| 945 |
+
# ── STEP 4–6: Upload content strings directly to Drive ───────────
|
| 946 |
_update_job(job_id, status="uploading_drive")
|
| 947 |
_update_sheet_record(job_id, creds, status="uploading_to_drive")
|
| 948 |
|
| 949 |
+
summary_drive = _upload_content(
|
| 950 |
+
summary, f"{video_title}__summary.txt", "upload_summary",
|
| 951 |
job_id, folder_id, creds,
|
| 952 |
)
|
| 953 |
+
qa_drive = _upload_content(
|
| 954 |
+
qa, f"{video_title}__qa.txt", "upload_qa",
|
| 955 |
job_id, folder_id, creds,
|
| 956 |
)
|
| 957 |
+
transcript_drive = _upload_content(
|
| 958 |
+
transcript, f"{video_title}__transcript.txt", "upload_transcript",
|
| 959 |
job_id, folder_id, creds,
|
| 960 |
)
|
| 961 |
|
gemini_transcript.py
CHANGED
|
@@ -8,11 +8,10 @@ import json
|
|
| 8 |
import logging
|
| 9 |
import time
|
| 10 |
|
| 11 |
-
from pathlib import Path
|
| 12 |
from typing import Optional, List
|
| 13 |
from urllib.parse import urlparse, parse_qs
|
| 14 |
|
| 15 |
-
from google import genai
|
| 16 |
from google.genai import types
|
| 17 |
|
| 18 |
from youtube_transcript_api import (
|
|
@@ -27,12 +26,7 @@ from youtube_transcript_api import (
|
|
| 27 |
# CONFIG
|
| 28 |
# ============================================================================
|
| 29 |
|
| 30 |
-
|
| 31 |
-
TRANSCRIPT_FILE = BASE_DIR / "output" / "transcript.txt"
|
| 32 |
-
SUMMARY_FILE = BASE_DIR / "output" / "summary.txt"
|
| 33 |
-
QA_FILE = BASE_DIR / "output" / "qa.txt"
|
| 34 |
-
|
| 35 |
-
GEMINI_API_KEY = "AIzaSyCNz5wQAyJ65kNRkwr0-1A-_Z6-lQzdcyc"
|
| 36 |
|
| 37 |
GEMINI_MODELS = [
|
| 38 |
"gemini-2.5-flash",
|
|
@@ -156,17 +150,18 @@ def fetch_video_title(video_id: str) -> str:
|
|
| 156 |
# ============================================================================
|
| 157 |
|
| 158 |
class YouTubeTranscriptFetcher:
|
| 159 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 160 |
|
| 161 |
def __init__(
|
| 162 |
self,
|
| 163 |
youtube_url: str,
|
| 164 |
-
output_file: Path = TRANSCRIPT_FILE,
|
| 165 |
languages: Optional[List[str]] = None,
|
| 166 |
polling_config: dict = None,
|
| 167 |
):
|
| 168 |
self.youtube_url = youtube_url
|
| 169 |
-
self.output_file = Path(output_file)
|
| 170 |
self.languages = languages or ["en", "en-US", "en-GB"]
|
| 171 |
self.polling_config = polling_config or POLLING_CONFIG
|
| 172 |
self.video_id = self._extract_video_id(youtube_url)
|
|
@@ -191,14 +186,13 @@ class YouTubeTranscriptFetcher:
|
|
| 191 |
transcript = self.api.fetch(self.video_id, languages=self.languages)
|
| 192 |
return " ".join(item.text for item in transcript)
|
| 193 |
|
| 194 |
-
def _save(self, text: str) -> None:
|
| 195 |
-
self.output_file.parent.mkdir(parents=True, exist_ok=True)
|
| 196 |
-
self.output_file.write_text(text, encoding="utf-8")
|
| 197 |
-
|
| 198 |
def run(self) -> str:
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
|
|
|
|
|
|
|
|
|
| 202 |
|
| 203 |
attempts = list(self.polling_config.items())
|
| 204 |
|
|
@@ -221,7 +215,6 @@ class YouTubeTranscriptFetcher:
|
|
| 221 |
|
| 222 |
try:
|
| 223 |
text = self._fetch_once()
|
| 224 |
-
self._save(text)
|
| 225 |
logger.info(
|
| 226 |
"[%d/%d] β
Transcript fetched β %d characters",
|
| 227 |
idx, len(attempts), len(text),
|
|
@@ -267,34 +260,32 @@ class YouTubeTranscriptFetcher:
|
|
| 267 |
# ============================================================================
|
| 268 |
|
| 269 |
class GeminiSummarizer:
|
| 270 |
-
"""
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
MAX_RETRIES = 5
|
| 273 |
BASE_WAIT = 10 # seconds
|
| 274 |
MAX_WAIT = 120 # seconds cap
|
| 275 |
|
| 276 |
-
# Errors → retry same model with backoff
|
| 277 |
RETRYABLE = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
|
| 278 |
# Errors → skip to next model immediately
|
| 279 |
SKIP_TO_NEXT = ["429", "RESOURCE_EXHAUSTED", "quota", "404", "NOT_FOUND"]
|
| 280 |
|
| 281 |
def __init__(
|
| 282 |
self,
|
| 283 |
-
api_key: str
|
| 284 |
-
models: list
|
| 285 |
-
summary_file: Path = SUMMARY_FILE,
|
| 286 |
-
qa_file: Path = QA_FILE,
|
| 287 |
):
|
| 288 |
-
self.client
|
| 289 |
-
self.models
|
| 290 |
-
self.summary_file = Path(summary_file)
|
| 291 |
-
self.qa_file = Path(qa_file)
|
| 292 |
|
| 293 |
def _call_api(self, transcript: str) -> tuple[str, str]:
|
| 294 |
"""
|
| 295 |
-
Try each model in order.
|
| 296 |
-
|
| 297 |
-
Returns (response_text, model_used).
|
| 298 |
"""
|
| 299 |
overall_last_error = None
|
| 300 |
|
|
@@ -366,18 +357,13 @@ class GeminiSummarizer:
|
|
| 366 |
return full_text.strip(), ""
|
| 367 |
|
| 368 |
def run(self, transcript: str) -> tuple[str, str, str]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
full, model_used = self._call_api(transcript)
|
| 370 |
summary, qa = self._split(full)
|
| 371 |
-
|
| 372 |
-
self.summary_file.parent.mkdir(parents=True, exist_ok=True)
|
| 373 |
-
self.qa_file.parent.mkdir(parents=True, exist_ok=True)
|
| 374 |
-
|
| 375 |
-
self.summary_file.write_text(summary, encoding="utf-8")
|
| 376 |
-
self.qa_file.write_text(qa, encoding="utf-8")
|
| 377 |
-
|
| 378 |
-
logger.info("Summary saved β %s", self.summary_file)
|
| 379 |
-
logger.info("Q&A saved β %s", self.qa_file)
|
| 380 |
-
|
| 381 |
return summary, qa, model_used
|
| 382 |
|
| 383 |
|
|
@@ -386,6 +372,10 @@ class GeminiSummarizer:
|
|
| 386 |
# ============================================================================
|
| 387 |
|
| 388 |
class TranscriptSummaryPipeline:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 389 |
|
| 390 |
def __init__(
|
| 391 |
self,
|
|
@@ -396,7 +386,6 @@ class TranscriptSummaryPipeline:
|
|
| 396 |
self.youtube_url = youtube_url
|
| 397 |
self.fetcher = YouTubeTranscriptFetcher(
|
| 398 |
youtube_url=youtube_url,
|
| 399 |
-
output_file=TRANSCRIPT_FILE,
|
| 400 |
languages=languages,
|
| 401 |
polling_config=polling_config,
|
| 402 |
)
|
|
|
|
| 8 |
import logging
|
| 9 |
import time
|
| 10 |
|
|
|
|
| 11 |
from typing import Optional, List
|
| 12 |
from urllib.parse import urlparse, parse_qs
|
| 13 |
|
| 14 |
+
from google import genai # pip install google-genai
|
| 15 |
from google.genai import types
|
| 16 |
|
| 17 |
from youtube_transcript_api import (
|
|
|
|
| 26 |
# CONFIG
|
| 27 |
# ============================================================================
|
| 28 |
|
| 29 |
+
GEMINI_API_KEY = "AIzaSyCNz5wQAyJ65kNRkwr0-1A-_Z6-lQzdcyc"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
GEMINI_MODELS = [
|
| 32 |
"gemini-2.5-flash",
|
|
|
|
| 150 |
# ============================================================================
|
| 151 |
|
| 152 |
class YouTubeTranscriptFetcher:
|
| 153 |
+
"""
|
| 154 |
+
Fetches a YouTube transcript and returns it as a plain string.
|
| 155 |
+
No files are written to disk.
|
| 156 |
+
"""
|
| 157 |
|
| 158 |
def __init__(
|
| 159 |
self,
|
| 160 |
youtube_url: str,
|
|
|
|
| 161 |
languages: Optional[List[str]] = None,
|
| 162 |
polling_config: dict = None,
|
| 163 |
):
|
| 164 |
self.youtube_url = youtube_url
|
|
|
|
| 165 |
self.languages = languages or ["en", "en-US", "en-GB"]
|
| 166 |
self.polling_config = polling_config or POLLING_CONFIG
|
| 167 |
self.video_id = self._extract_video_id(youtube_url)
|
|
|
|
| 186 |
transcript = self.api.fetch(self.video_id, languages=self.languages)
|
| 187 |
return " ".join(item.text for item in transcript)
|
| 188 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
def run(self) -> str:
|
| 190 |
+
"""
|
| 191 |
+
Fetch transcript with polling retry.
|
| 192 |
+
Returns transcript as a string — nothing is written to disk.
|
| 193 |
+
"""
|
| 194 |
+
logger.info("Video ID : %s", self.video_id)
|
| 195 |
+
logger.info("Polling attempts : %d", len(self.polling_config))
|
| 196 |
|
| 197 |
attempts = list(self.polling_config.items())
|
| 198 |
|
|
|
|
| 215 |
|
| 216 |
try:
|
| 217 |
text = self._fetch_once()
|
|
|
|
| 218 |
logger.info(
|
| 219 |
"[%d/%d] β
Transcript fetched β %d characters",
|
| 220 |
idx, len(attempts), len(text),
|
|
|
|
| 260 |
# ============================================================================
|
| 261 |
|
| 262 |
class GeminiSummarizer:
|
| 263 |
+
"""
|
| 264 |
+
Sends transcript to Gemini and returns (summary, qa, model_used).
|
| 265 |
+
No files are written to disk.
|
| 266 |
+
"""
|
| 267 |
|
| 268 |
MAX_RETRIES = 5
|
| 269 |
BASE_WAIT = 10 # seconds
|
| 270 |
MAX_WAIT = 120 # seconds cap
|
| 271 |
|
| 272 |
+
# Errors → retry same model with exponential backoff
|
| 273 |
RETRYABLE = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
|
| 274 |
# Errors → skip to next model immediately
|
| 275 |
SKIP_TO_NEXT = ["429", "RESOURCE_EXHAUSTED", "quota", "404", "NOT_FOUND"]
|
| 276 |
|
| 277 |
def __init__(
|
| 278 |
self,
|
| 279 |
+
api_key: str = GEMINI_API_KEY,
|
| 280 |
+
models: list = None,
|
|
|
|
|
|
|
| 281 |
):
|
| 282 |
+
self.client = genai.Client(api_key=api_key)
|
| 283 |
+
self.models = models or GEMINI_MODELS
|
|
|
|
|
|
|
| 284 |
|
| 285 |
def _call_api(self, transcript: str) -> tuple[str, str]:
|
| 286 |
"""
|
| 287 |
+
Try each model in order with per-model retry + backoff.
|
| 288 |
+
Returns (full_response_text, model_used).
|
|
|
|
| 289 |
"""
|
| 290 |
overall_last_error = None
|
| 291 |
|
|
|
|
| 357 |
return full_text.strip(), ""
|
| 358 |
|
| 359 |
def run(self, transcript: str) -> tuple[str, str, str]:
|
| 360 |
+
"""
|
| 361 |
+
Summarize transcript.
|
| 362 |
+
Returns (summary, qa, model_used) — nothing is written to disk.
|
| 363 |
+
"""
|
| 364 |
full, model_used = self._call_api(transcript)
|
| 365 |
summary, qa = self._split(full)
|
| 366 |
+
logger.info("β
Summarization complete β model: %s", model_used)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
return summary, qa, model_used
|
| 368 |
|
| 369 |
|
|
|
|
| 372 |
# ============================================================================
|
| 373 |
|
| 374 |
class TranscriptSummaryPipeline:
|
| 375 |
+
"""
|
| 376 |
+
Orchestrates fetch → summarize.
|
| 377 |
+
All data flows in memory — no disk I/O.
|
| 378 |
+
"""
|
| 379 |
|
| 380 |
def __init__(
|
| 381 |
self,
|
|
|
|
| 386 |
self.youtube_url = youtube_url
|
| 387 |
self.fetcher = YouTubeTranscriptFetcher(
|
| 388 |
youtube_url=youtube_url,
|
|
|
|
| 389 |
languages=languages,
|
| 390 |
polling_config=polling_config,
|
| 391 |
)
|