# Source: Hugging Face Space "yt-agent-streamlit", file src/functions.py
# Page header residue: uploader avatar "hwang2006"; latest commit
# "Update src/functions.py" (96966f9, verified).
# src/functions.py
import os
import re
from collections.abc import Iterable, Mapping
from urllib.parse import parse_qs, quote, urlparse, urlunparse

from agents import function_tool
from youtube_transcript_api import (
    NoTranscriptFound,
    TranscriptsDisabled,
    VideoUnavailable,
    YouTubeTranscriptApi,
)
from youtube_transcript_api.proxies import GenericProxyConfig
# ---------------------------
# YouTube URL / ID utilities
# ---------------------------
_YT_ID_RE = re.compile(r"^[a-zA-Z0-9_-]{11}$")
def _extract_video_id(url_or_id: str) -> str | None:
"""
Accepts a raw 11-char video ID or any common YouTube URL:
- https://www.youtube.com/watch?v=VIDEOID&...
- https://youtu.be/VIDEOID?t=123
- https://www.youtube.com/shorts/VIDEOID
- https://www.youtube.com/embed/VIDEOID
Ignores extra params (list, t, etc.).
"""
s = (url_or_id or "").strip()
# Bare ID
if _YT_ID_RE.match(s):
return s
p = urlparse(s)
if not p.netloc:
return None
# /watch?v=VIDEOID
if p.path == "/watch":
v = parse_qs(p.query).get("v", [None])[0]
return v if v and _YT_ID_RE.match(v) else None
# youtu.be/VIDEOID
if p.netloc.endswith("youtu.be"):
vid = p.path.lstrip("/")
return vid if _YT_ID_RE.match(vid) else None
# /shorts/VIDEOID or /embed/VIDEOID
parts = p.path.strip("/").split("/")
if len(parts) >= 2 and parts[0] in ("shorts", "embed"):
vid = parts[1]
return vid if _YT_ID_RE.match(vid) else None
return None
# ---------------------------
# Proxy configuration
# ---------------------------
def _build_proxy_config() -> GenericProxyConfig | None:
    """Build a youtube_transcript_api proxy config from environment variables.

    Supports these envs (Repository secrets on HF Spaces):
      - PROXY_AUTH_URL = http://USER:PASS@HOST:PORT  (preferred; used as-is,
        so credentials in it must already be URL-safe)
      - OR:
          PROXY_URL      = http://HOST:PORT (or https://HOST:PORT)
          PROXY_USERNAME = user   (optional, raw / un-encoded)
          PROXY_PASSWORD = pass   (optional, raw / un-encoded)

    A missing scheme defaults to http. Credentials from PROXY_USERNAME /
    PROXY_PASSWORD are only applied when both are set, and are
    percent-encoded before being placed in the URL.

    Returns:
        A GenericProxyConfig with http:// and https:// variants of the same
        proxy address, or None when no usable proxy is configured.
    """

    def _both_schemes(url: str) -> GenericProxyConfig:
        # Swap only the leading scheme; a blanket str.replace would also
        # rewrite any "http://" that happens to appear later in the URL.
        rest = url.split("://", 1)[1]
        return GenericProxyConfig(
            http_url="http://" + rest,
            https_url="https://" + rest,
        )

    auth_url = os.getenv("PROXY_AUTH_URL", "").strip()
    if auth_url:
        # If scheme missing, assume http
        if not auth_url.startswith(("http://", "https://")):
            auth_url = "http://" + auth_url
        return _both_schemes(auth_url)

    base = os.getenv("PROXY_URL", "").strip()
    if not base:
        return None
    # Ensure scheme; default to http
    if not base.startswith(("http://", "https://")):
        base = "http://" + base

    user = os.getenv("PROXY_USERNAME", "").strip()
    pwd = os.getenv("PROXY_PASSWORD", "").strip()
    if not (user and pwd):
        # No-auth proxy
        return _both_schemes(base)

    p = urlparse(base)
    host = p.hostname
    if not host:
        # e.g. PROXY_URL="http://": nothing to connect to. Formatting the
        # missing host would otherwise produce the literal string "None".
        return None
    if ":" in host:
        # .hostname strips the brackets from IPv6 literals; restore them so
        # the port separator stays unambiguous.
        host = f"[{host}]"

    # Percent-encode credentials: characters such as "@", ":" or "/" in a
    # password would otherwise be read as URL delimiters.
    netloc = f"{quote(user, safe='')}:{quote(pwd, safe='')}@{host}"
    if p.port:
        netloc += f":{p.port}"
    return _both_schemes(urlunparse((p.scheme, netloc, p.path or "", "", "", "")))
def _export_proxy_env() -> None:
    """Export HTTP_PROXY / HTTPS_PROXY from the proxy settings in the env.

    Universal fallback: underlying HTTP clients (requests/httpx) honour these
    variables, so the proxy is used even if the youtube-transcript-api
    signature changes.

    Resolution order:
      1. PROXY_AUTH_URL (used as-is).
      2. PROXY_URL, with PROXY_USERNAME / PROXY_PASSWORD embedded when both
         are set (percent-encoded), so the exported value carries the same
         credentials as the explicit proxy configuration.

    A missing scheme defaults to http. Does nothing when no proxy is
    configured. Both variables receive the same URL.
    """
    proxy = os.getenv("PROXY_AUTH_URL", "").strip()
    user = pwd = ""
    if not proxy:
        # Strip before falling back, so a blank PROXY_AUTH_URL does not hide
        # a valid PROXY_URL.
        proxy = os.getenv("PROXY_URL", "").strip()
        user = os.getenv("PROXY_USERNAME", "").strip()
        pwd = os.getenv("PROXY_PASSWORD", "").strip()
    if not proxy:
        return

    if not proxy.startswith(("http://", "https://")):
        proxy = "http://" + proxy

    if user and pwd:
        parts = urlparse(proxy)
        host = parts.hostname
        if host:
            if ":" in host:
                # .hostname drops IPv6 brackets; put them back.
                host = f"[{host}]"
            # Encode so "@", ":" or "/" in credentials cannot be mistaken
            # for URL delimiters.
            netloc = f"{quote(user, safe='')}:{quote(pwd, safe='')}@{host}"
            if parts.port:
                netloc += f":{parts.port}"
            proxy = urlunparse((parts.scheme, netloc, parts.path or "", "", "", ""))

    os.environ["HTTP_PROXY"] = proxy
    os.environ["HTTPS_PROXY"] = proxy
# ---------------------------
# Formatting
# ---------------------------
def _format_transcript(entries: Iterable[object]) -> str:
    """Render transcript entries as one "[MM:SS] Text" line per entry.

    Each entry may be either
      - a mapping like {"text": "...", "start": 12.34, "duration": 3.21}, or
      - an object exposing ``text`` / ``start`` attributes (the snippet
        objects produced by newer youtube-transcript-api releases).

    Entries whose text is empty after whitespace cleanup are skipped. A
    missing or non-numeric start time is rendered as [00:00]. Minutes are not
    wrapped at 60, so a start of 3725 s is shown as [62:05].

    Returns:
        The formatted lines joined with newlines ("" for no usable entries).
    """
    lines = []
    for entry in entries:
        if isinstance(entry, Mapping):
            raw_start = entry.get("start", 0)
            raw_text = entry.get("text")
        else:
            # Attribute-style snippet; it has no .get().
            raw_start = getattr(entry, "start", 0)
            raw_text = getattr(entry, "text", None)

        try:
            start = float(raw_start)
            minutes = int(start // 60)
            seconds = int(start % 60)
        except (TypeError, ValueError, OverflowError):
            # None / non-numeric / NaN / inf: fall back to the zero timestamp
            # rather than losing the line.
            minutes = seconds = 0

        text = str(raw_text or "").replace("\n", " ").strip()
        if text:
            lines.append(f"[{minutes:02d}:{seconds:02d}] {text}")
    return "\n".join(lines)
# ---------------------------
# Tools
# ---------------------------
def _transcript_rows(fetched):
    """Normalise a fetch() result to a list of {"text", "start", ...} dicts."""
    # Newer youtube-transcript-api releases return an object exposing
    # to_raw_data(); anything else is passed through unchanged.
    to_raw = getattr(fetched, "to_raw_data", None)
    return to_raw() if callable(to_raw) else fetched


def _search_any_transcript(api, video_id: str, preferred_langs: list[str]) -> str:
    """List a video's transcripts and return the first one that can be fetched.

    Order tried: preferred language, then any auto-generated track, then any
    track translated to English. Returns the formatted transcript or a
    user-facing error message; never raises.
    """
    try:
        listing = api.list(video_id)

        # Each attempt below is deliberately best-effort: a failure on one
        # candidate just moves on to the next.

        # 1) Exact language match
        for lang in preferred_langs:
            try:
                found = listing.find_transcript([lang])
                return _format_transcript(_transcript_rows(found.fetch()))
            except Exception:
                continue

        # 2) Auto-generated (first that fetches)
        for tr in listing:
            if getattr(tr, "is_generated", False):
                try:
                    return _format_transcript(_transcript_rows(tr.fetch()))
                except Exception:
                    continue

        # 3) Translate to English as last resort
        for tr in listing:
            try:
                translated = tr.translate("en")
                return _format_transcript(_transcript_rows(translated.fetch()))
            except Exception:
                continue

        return "❌ No transcript is available for this video (no captions found, even auto-generated)."
    except TranscriptsDisabled:
        return "❌ Transcripts are disabled for this video."
    except VideoUnavailable:
        return "❌ This video is unavailable in the current region or has restrictions."
    except Exception as e:
        # Likely network/restrictions if not a true NoTranscriptFound
        return f"⚠️ Error while searching transcripts: {e}"


# NOTE: the docstring below is kept as originally written on purpose; the
# function_tool decorator may surface it to the model as the tool description.
@function_tool
def fetch_video_transcript(url: str) -> str:
    """
    Extract transcript with timestamps from a YouTube video URL and format it for LLM consumption.
    Args:
        url (str): YouTube video URL (any common form is accepted)
    Returns:
        str: Formatted transcript with timestamps, one per line: "[MM:SS] Text"
        or a specific, user-friendly error message.
    """
    video_id = _extract_video_id(url)
    if not video_id:
        return "❌ I couldn’t parse a valid YouTube video ID from your input. Please paste a direct video link."

    # Make sure the environment knows about the proxy universally
    _export_proxy_env()

    # Use the instance API. GenericProxyConfig (imported at file top) only
    # exists in library versions that provide fetch()/list() on an instance,
    # and the old static helpers neither accept a proxy config nor exist in
    # current releases.
    api = YouTubeTranscriptApi(proxy_config=_build_proxy_config())
    preferred_langs = ["en", "en-US", "en-GB", "ko", "ja"]

    try:
        # Fast path: direct fetch with preferred langs
        fetched = api.fetch(video_id, languages=preferred_langs)
        return _format_transcript(_transcript_rows(fetched))
    except NoTranscriptFound:
        # Try listing to find auto-generated or translatable transcripts
        return _search_any_transcript(api, video_id, preferred_langs)
    except TranscriptsDisabled:
        return "❌ Transcripts are disabled for this video."
    except VideoUnavailable:
        return "❌ This video is unavailable in the current region or has restrictions."
    except Exception as e:
        # Most common here: connection error / blocked / proxy needed
        hint = ""
        if not (os.getenv("PROXY_AUTH_URL") or os.getenv("PROXY_URL")):
            hint = " (Tip: if this Space is on Hugging Face, set a proxy via PROXY_AUTH_URL or PROXY_URL/USERNAME/PASSWORD in Repository secrets.)"
        return f"⚠️ Error fetching transcript: {e}{hint}"
# NOTE: the misspelled name and the docstring are kept as-is; both are part
# of the tool's external interface (the decorator may expose them to the
# model).
#
# Raises ValueError for an empty name or one containing path components, and
# FileNotFoundError (from open) when no such prompt file exists.
@function_tool
def fetch_intstructions(prompt_name: str) -> str:
    """
    Fetch instructions for a given prompt name from the prompts/ directory.
    Available prompts:
    - write_blog_post
    - write_social_post
    - write_video_chapters
    """
    # Tolerate stray whitespace and an explicit ".md" suffix in the argument.
    name = (prompt_name or "").strip().removesuffix(".md")

    # The name is supplied by the model, so keep it to a single file name:
    # separators or dot-directories would let the path escape prompts/.
    if (
        not name
        or name in (".", "..")
        or any(ch in name for ch in ("/", "\\", "\0"))
    ):
        raise ValueError(f"Invalid prompt name: {prompt_name!r}")

    script_dir = os.path.dirname(os.path.abspath(__file__))
    prompt_path = os.path.join(script_dir, "prompts", f"{name}.md")
    with open(prompt_path, "r", encoding="utf-8") as f:
        return f.read()