Spaces:
Sleeping
Sleeping
rsnarsna committed on
Commit ·
f59712d
1
Parent(s): 24cacb3
refactor: Clean up imports and improve file path handling in app.py and gemini_transcript.py; update requirements.txt for new dependencies
Browse files
- app.py +20 -21
- gemini_transcript.py +16 -12
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
import json
|
| 3 |
import base64
|
|
@@ -11,9 +13,8 @@ from pathlib import Path
|
|
| 11 |
from email.mime.text import MIMEText
|
| 12 |
from datetime import datetime, timezone
|
| 13 |
|
| 14 |
-
from fastapi.responses import FileResponse
|
| 15 |
from fastapi import FastAPI, Request, HTTPException
|
| 16 |
-
from fastapi.responses import RedirectResponse, HTMLResponse
|
| 17 |
from pydantic import BaseModel
|
| 18 |
|
| 19 |
from google_auth_oauthlib.flow import Flow
|
|
@@ -32,12 +33,12 @@ from gemini_transcript import TranscriptSummaryPipeline
|
|
| 32 |
os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
|
| 33 |
|
| 34 |
BASE_DIR = Path(__file__).resolve().parent
|
| 35 |
-
CLIENT_SECRETS = os.getenv("CLIENT_SECRETS",
|
| 36 |
-
TOKEN_PATH = os.getenv("GOOGLE_OAUTH_TOKEN_PATH",
|
| 37 |
-
REDIRECT_URI = os.getenv("REDIRECT_URI",
|
| 38 |
STATE_FILE = BASE_DIR / "oauth_states.json"
|
| 39 |
-
DEFAULT_SPREADSHEET_ID = os.getenv("DEFAULT_SPREADSHEET_ID",
|
| 40 |
-
DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID",
|
| 41 |
|
| 42 |
SCOPES = [
|
| 43 |
"https://www.googleapis.com/auth/spreadsheets",
|
|
@@ -45,10 +46,10 @@ SCOPES = [
|
|
| 45 |
"https://www.googleapis.com/auth/drive.file",
|
| 46 |
]
|
| 47 |
|
| 48 |
-
|
| 49 |
-
TRANSCRIPT_FILE =
|
| 50 |
-
SUMMARY_FILE =
|
| 51 |
-
QA_FILE =
|
| 52 |
|
| 53 |
SHEETS_HEADERS = [
|
| 54 |
"Timestamp", # A
|
|
@@ -274,8 +275,8 @@ def upload_file_to_drive(
|
|
| 274 |
) -> dict:
|
| 275 |
if creds is None:
|
| 276 |
creds = require_credentials()
|
| 277 |
-
svc
|
| 278 |
-
meta
|
| 279 |
if folder_id:
|
| 280 |
meta["parents"] = [folder_id]
|
| 281 |
media = MediaFileUpload(filepath, resumable=True)
|
|
@@ -556,7 +557,6 @@ def _update_sheet_record(
|
|
| 556 |
print(f"[WARN] Row for job {job_id} not found in sheet.")
|
| 557 |
return
|
| 558 |
|
| 559 |
-
# Read existing to preserve immutable columns
|
| 560 |
existing = read_sheet(
|
| 561 |
DEFAULT_SPREADSHEET_ID,
|
| 562 |
f"Sheet1!A{row_num}:N{row_num}",
|
|
@@ -613,7 +613,7 @@ def on_startup():
|
|
| 613 |
|
| 614 |
@app.get("/")
|
| 615 |
def root():
|
| 616 |
-
|
| 617 |
|
| 618 |
|
| 619 |
@app.get("/health")
|
|
@@ -641,9 +641,9 @@ def health():
|
|
| 641 |
|
| 642 |
@app.get("/auth/start")
|
| 643 |
def auth_start():
|
| 644 |
-
flow
|
| 645 |
-
verifier
|
| 646 |
-
challenge
|
| 647 |
base64.urlsafe_b64encode(hashlib.sha256(verifier.encode()).digest())
|
| 648 |
.rstrip(b"=").decode()
|
| 649 |
)
|
|
@@ -904,7 +904,6 @@ def _run_pipeline(job_id: str, youtube_url: str, email_to: str):
|
|
| 904 |
creds = load_credentials()
|
| 905 |
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
|
| 906 |
|
| 907 |
-
# Create initial sheet row immediately
|
| 908 |
_create_sheet_record(
|
| 909 |
job_id=job_id,
|
| 910 |
timestamp=timestamp,
|
|
@@ -1063,11 +1062,11 @@ Google Integration API
|
|
| 1063 |
"model_used": model_used,
|
| 1064 |
"drive": {
|
| 1065 |
"folder_id": folder_id,
|
| 1066 |
-
"summary":
|
| 1067 |
"web_view_link": summary_drive.get("webViewLink"),
|
| 1068 |
"direct_download_link": summary_link,
|
| 1069 |
},
|
| 1070 |
-
"qa":
|
| 1071 |
"web_view_link": qa_drive.get("webViewLink"),
|
| 1072 |
"direct_download_link": qa_link,
|
| 1073 |
},
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
|
| 3 |
import os
|
| 4 |
import json
|
| 5 |
import base64
|
|
|
|
| 13 |
from email.mime.text import MIMEText
|
| 14 |
from datetime import datetime, timezone
|
| 15 |
|
|
|
|
| 16 |
from fastapi import FastAPI, Request, HTTPException
|
| 17 |
+
from fastapi.responses import RedirectResponse, HTMLResponse, FileResponse
|
| 18 |
from pydantic import BaseModel
|
| 19 |
|
| 20 |
from google_auth_oauthlib.flow import Flow
|
|
|
|
| 33 |
os.environ.setdefault("OAUTHLIB_INSECURE_TRANSPORT", "1")
|
| 34 |
|
| 35 |
BASE_DIR = Path(__file__).resolve().parent
|
| 36 |
+
CLIENT_SECRETS = os.getenv("CLIENT_SECRETS", str(BASE_DIR / "client_secret.json"))
|
| 37 |
+
TOKEN_PATH = os.getenv("GOOGLE_OAUTH_TOKEN_PATH", str(BASE_DIR / "Google_oauth_token.json"))
|
| 38 |
+
REDIRECT_URI = os.getenv("REDIRECT_URI", "http://localhost:8000/auth/callback")
|
| 39 |
STATE_FILE = BASE_DIR / "oauth_states.json"
|
| 40 |
+
DEFAULT_SPREADSHEET_ID = os.getenv("DEFAULT_SPREADSHEET_ID", "1XA3vW_guHBT-ktkYvhktmUqcquECBe8exGZAoSQS3Ag")
|
| 41 |
+
DEFAULT_DRIVE_FOLDER_ID = os.getenv("DEFAULT_DRIVE_FOLDER_ID", "1hI6dNXysR_2p9gHkDpsI-iwMExmy2hhR")
|
| 42 |
|
| 43 |
SCOPES = [
|
| 44 |
"https://www.googleapis.com/auth/spreadsheets",
|
|
|
|
| 46 |
"https://www.googleapis.com/auth/drive.file",
|
| 47 |
]
|
| 48 |
|
| 49 |
+
OUTPUT_DIR = Path(".") / "output"
|
| 50 |
+
TRANSCRIPT_FILE = OUTPUT_DIR / "transcript.txt"
|
| 51 |
+
SUMMARY_FILE = OUTPUT_DIR / "summary.txt"
|
| 52 |
+
QA_FILE = OUTPUT_DIR / "qa.txt"
|
| 53 |
|
| 54 |
SHEETS_HEADERS = [
|
| 55 |
"Timestamp", # A
|
|
|
|
| 275 |
) -> dict:
|
| 276 |
if creds is None:
|
| 277 |
creds = require_credentials()
|
| 278 |
+
svc = _drive(creds)
|
| 279 |
+
meta = {"name": os.path.basename(filepath)}
|
| 280 |
if folder_id:
|
| 281 |
meta["parents"] = [folder_id]
|
| 282 |
media = MediaFileUpload(filepath, resumable=True)
|
|
|
|
| 557 |
print(f"[WARN] Row for job {job_id} not found in sheet.")
|
| 558 |
return
|
| 559 |
|
|
|
|
| 560 |
existing = read_sheet(
|
| 561 |
DEFAULT_SPREADSHEET_ID,
|
| 562 |
f"Sheet1!A{row_num}:N{row_num}",
|
|
|
|
| 613 |
|
| 614 |
@app.get("/")
|
| 615 |
def root():
|
| 616 |
+
return FileResponse("index.html")
|
| 617 |
|
| 618 |
|
| 619 |
@app.get("/health")
|
|
|
|
| 641 |
|
| 642 |
@app.get("/auth/start")
|
| 643 |
def auth_start():
|
| 644 |
+
flow = create_flow()
|
| 645 |
+
verifier = secrets.token_urlsafe(64)
|
| 646 |
+
challenge = (
|
| 647 |
base64.urlsafe_b64encode(hashlib.sha256(verifier.encode()).digest())
|
| 648 |
.rstrip(b"=").decode()
|
| 649 |
)
|
|
|
|
| 904 |
creds = load_credentials()
|
| 905 |
timestamp = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
|
| 906 |
|
|
|
|
| 907 |
_create_sheet_record(
|
| 908 |
job_id=job_id,
|
| 909 |
timestamp=timestamp,
|
|
|
|
| 1062 |
"model_used": model_used,
|
| 1063 |
"drive": {
|
| 1064 |
"folder_id": folder_id,
|
| 1065 |
+
"summary": {
|
| 1066 |
"web_view_link": summary_drive.get("webViewLink"),
|
| 1067 |
"direct_download_link": summary_link,
|
| 1068 |
},
|
| 1069 |
+
"qa": {
|
| 1070 |
"web_view_link": qa_drive.get("webViewLink"),
|
| 1071 |
"direct_download_link": qa_link,
|
| 1072 |
},
|
gemini_transcript.py
CHANGED
|
@@ -11,7 +11,9 @@ import time
|
|
| 11 |
from pathlib import Path
|
| 12 |
from typing import Optional, List
|
| 13 |
from urllib.parse import urlparse, parse_qs
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
|
| 16 |
from youtube_transcript_api import (
|
| 17 |
YouTubeTranscriptApi,
|
|
@@ -267,10 +269,9 @@ class YouTubeTranscriptFetcher:
|
|
| 267 |
class GeminiSummarizer:
|
| 268 |
"""Sends transcript to Gemini with model fallback + per-model retry."""
|
| 269 |
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
MAX_WAIT = 120 # seconds cap
|
| 274 |
|
| 275 |
# Errors → retry same model with backoff
|
| 276 |
RETRYABLE = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
|
|
@@ -308,7 +309,9 @@ class GeminiSummarizer:
|
|
| 308 |
response = self.client.models.generate_content(
|
| 309 |
model=model,
|
| 310 |
contents=transcript,
|
| 311 |
-
config=
|
|
|
|
|
|
|
| 312 |
)
|
| 313 |
logger.info(
|
| 314 |
"✅ Response received from: %s (attempt %d)",
|
|
@@ -325,21 +328,19 @@ class GeminiSummarizer:
|
|
| 325 |
" [%d/%d] %s — quota/not-found, skipping to next model.",
|
| 326 |
attempt, self.MAX_RETRIES, model,
|
| 327 |
)
|
| 328 |
-
break
|
| 329 |
|
| 330 |
elif any(k in err for k in self.RETRYABLE):
|
| 331 |
if attempt < self.MAX_RETRIES:
|
| 332 |
logger.warning(
|
| 333 |
-
" [%d/%d] %s — transient error. "
|
| 334 |
-
"Retrying in %ds...",
|
| 335 |
attempt, self.MAX_RETRIES, model, wait,
|
| 336 |
)
|
| 337 |
time.sleep(wait)
|
| 338 |
wait = min(wait * 2, self.MAX_WAIT)
|
| 339 |
else:
|
| 340 |
logger.warning(
|
| 341 |
-
" [%d/%d] %s — max retries reached, "
|
| 342 |
-
"trying next model.",
|
| 343 |
attempt, self.MAX_RETRIES, model,
|
| 344 |
)
|
| 345 |
|
|
@@ -368,6 +369,9 @@ class GeminiSummarizer:
|
|
| 368 |
full, model_used = self._call_api(transcript)
|
| 369 |
summary, qa = self._split(full)
|
| 370 |
|
|
|
|
|
|
|
|
|
|
| 371 |
self.summary_file.write_text(summary, encoding="utf-8")
|
| 372 |
self.qa_file.write_text(qa, encoding="utf-8")
|
| 373 |
|
|
@@ -425,7 +429,7 @@ class TranscriptSummaryPipeline:
|
|
| 425 |
|
| 426 |
def main():
|
| 427 |
if len(sys.argv) < 2:
|
| 428 |
-
print("Usage: python
|
| 429 |
sys.exit(1)
|
| 430 |
|
| 431 |
pipeline = TranscriptSummaryPipeline(
|
|
|
|
| 11 |
from pathlib import Path
|
| 12 |
from typing import Optional, List
|
| 13 |
from urllib.parse import urlparse, parse_qs
|
| 14 |
+
|
| 15 |
+
from google import genai # pip install google-genai
|
| 16 |
+
from google.genai import types
|
| 17 |
|
| 18 |
from youtube_transcript_api import (
|
| 19 |
YouTubeTranscriptApi,
|
|
|
|
| 269 |
class GeminiSummarizer:
|
| 270 |
"""Sends transcript to Gemini with model fallback + per-model retry."""
|
| 271 |
|
| 272 |
+
MAX_RETRIES = 5
|
| 273 |
+
BASE_WAIT = 10 # seconds
|
| 274 |
+
MAX_WAIT = 120 # seconds cap
|
|
|
|
| 275 |
|
| 276 |
# Errors → retry same model with backoff
|
| 277 |
RETRYABLE = ["503", "502", "500", "UNAVAILABLE", "SERVICE_UNAVAILABLE"]
|
|
|
|
| 309 |
response = self.client.models.generate_content(
|
| 310 |
model=model,
|
| 311 |
contents=transcript,
|
| 312 |
+
config=types.GenerateContentConfig(
|
| 313 |
+
system_instruction=SYSTEM_PROMPT,
|
| 314 |
+
),
|
| 315 |
)
|
| 316 |
logger.info(
|
| 317 |
"✅ Response received from: %s (attempt %d)",
|
|
|
|
| 328 |
" [%d/%d] %s — quota/not-found, skipping to next model.",
|
| 329 |
attempt, self.MAX_RETRIES, model,
|
| 330 |
)
|
| 331 |
+
break
|
| 332 |
|
| 333 |
elif any(k in err for k in self.RETRYABLE):
|
| 334 |
if attempt < self.MAX_RETRIES:
|
| 335 |
logger.warning(
|
| 336 |
+
" [%d/%d] %s — transient error. Retrying in %ds...",
|
|
|
|
| 337 |
attempt, self.MAX_RETRIES, model, wait,
|
| 338 |
)
|
| 339 |
time.sleep(wait)
|
| 340 |
wait = min(wait * 2, self.MAX_WAIT)
|
| 341 |
else:
|
| 342 |
logger.warning(
|
| 343 |
+
" [%d/%d] %s — max retries reached, trying next model.",
|
|
|
|
| 344 |
attempt, self.MAX_RETRIES, model,
|
| 345 |
)
|
| 346 |
|
|
|
|
| 369 |
full, model_used = self._call_api(transcript)
|
| 370 |
summary, qa = self._split(full)
|
| 371 |
|
| 372 |
+
self.summary_file.parent.mkdir(parents=True, exist_ok=True)
|
| 373 |
+
self.qa_file.parent.mkdir(parents=True, exist_ok=True)
|
| 374 |
+
|
| 375 |
self.summary_file.write_text(summary, encoding="utf-8")
|
| 376 |
self.qa_file.write_text(qa, encoding="utf-8")
|
| 377 |
|
|
|
|
| 429 |
|
| 430 |
def main():
|
| 431 |
if len(sys.argv) < 2:
|
| 432 |
+
print("Usage: python gemini_transcript.py <youtube_url>", file=sys.stderr)
|
| 433 |
sys.exit(1)
|
| 434 |
|
| 435 |
pipeline = TranscriptSummaryPipeline(
|
requirements.txt
CHANGED
|
@@ -4,4 +4,6 @@ google-api-python-client
|
|
| 4 |
google-auth-httplib2
|
| 5 |
google-auth-oauthlib
|
| 6 |
requests
|
| 7 |
-
youtube_transcript_api
|
|
|
|
|
|
|
|
|
| 4 |
google-auth-httplib2
|
| 5 |
google-auth-oauthlib
|
| 6 |
requests
|
| 7 |
+
youtube_transcript_api
|
| 8 |
+
google-generativeai
|
| 9 |
+
google-genai
|