Spaces:
Running
Running
Codex committed on
Commit ·
494d4ff
1
Parent(s): a05e32e
Harden YouTube transcript loading on Hugging Face
Browse files
README.md
CHANGED
|
@@ -23,3 +23,12 @@ This Space runs a Streamlit app for summarizing:
|
|
| 23 |
Add this secret in the Space settings:
|
| 24 |
|
| 25 |
- `GROQ_API_KEY`
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
Add this secret in the Space settings:
|
| 24 |
|
| 25 |
- `GROQ_API_KEY`
|
| 26 |
+
|
| 27 |
+
## YouTube On Hugging Face Spaces
|
| 28 |
+
|
| 29 |
+
YouTube transcript loading may work locally but fail on Hugging Face Spaces because YouTube frequently blocks or rate-limits datacenter IP ranges. The app now retries transient HTTPS failures and supports proxy configuration through Space secrets:
|
| 30 |
+
|
| 31 |
+
- `YOUTUBE_HTTP_PROXY`
|
| 32 |
+
- `YOUTUBE_HTTPS_PROXY`
|
| 33 |
+
|
| 34 |
+
You can also use the standard `HTTP_PROXY` and `HTTPS_PROXY` environment variables if that matches your setup.
|
app.py
CHANGED
|
@@ -16,12 +16,16 @@ from langchain_core.prompts import PromptTemplate
|
|
| 16 |
from langchain_groq import ChatGroq
|
| 17 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 18 |
from pypdf import PdfReader
|
|
|
|
| 19 |
from requests import RequestException
|
|
|
|
|
|
|
| 20 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 21 |
|
| 22 |
|
| 23 |
load_dotenv()
|
| 24 |
|
|
|
|
| 25 |
SAMPLE_YOUTUBE_URL = "https://youtu.be/ocBh08fjIfU"
|
| 26 |
LANGUAGE_OPTIONS = ["Original", "English", "Arabic", "French", "Bahasa Malay"]
|
| 27 |
LANGUAGE_CODE_MAP = {
|
|
@@ -36,6 +40,12 @@ LANGUAGE_LABEL_MAP = {
|
|
| 36 |
"French": "French",
|
| 37 |
"Bahasa Malay": "Bahasa Melayu",
|
| 38 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
st.set_page_config(page_title="Summarize Text From PDF, YouTube, Website", page_icon="๐")
|
| 41 |
st.title("๐ Summarize Text From PDF, YouTube, Website")
|
|
@@ -75,6 +85,7 @@ transcript_language = "Original"
|
|
| 75 |
|
| 76 |
with st.sidebar:
|
| 77 |
st.header("Options")
|
|
|
|
| 78 |
input_source_mode = st.radio(
|
| 79 |
"Content source",
|
| 80 |
options=["URL", "Upload documents", "Both"],
|
|
@@ -111,6 +122,14 @@ with st.sidebar:
|
|
| 111 |
"`stuff` is fastest for short content, `map_reduce` is safer for long content, "
|
| 112 |
"and `refine` is useful when building a summary progressively across chunks."
|
| 113 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 114 |
st.caption(f"Sample YouTube URL: `{SAMPLE_YOUTUBE_URL}`")
|
| 115 |
if st.button("Use sample YouTube URL"):
|
| 116 |
st.session_state.url_input = SAMPLE_YOUTUBE_URL
|
|
@@ -272,9 +291,70 @@ def _translate_documents_with_llm(docs: list[Document], target_language: str) ->
|
|
| 272 |
return translated_docs
|
| 273 |
|
| 274 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
def _resolve_transcript(video_id: str, selected_language: str):
|
| 276 |
-
api =
|
| 277 |
-
|
|
|
|
|
|
|
|
|
|
| 278 |
available_transcripts = list(transcript_list)
|
| 279 |
|
| 280 |
if selected_language == "Original":
|
|
@@ -322,7 +402,10 @@ def _load_youtube_documents(url: str, selected_language: str) -> list[Document]:
|
|
| 322 |
transcript, transcript_language_label = _resolve_transcript(video_id, "Original")
|
| 323 |
should_translate_with_llm = True
|
| 324 |
|
| 325 |
-
|
|
|
|
|
|
|
|
|
|
| 326 |
transcript_text = " ".join(snippet.text.strip() for snippet in fetched_transcript if snippet.text.strip())
|
| 327 |
if not transcript_text:
|
| 328 |
raise ValueError("No transcript text could be extracted from this video.")
|
|
|
|
| 16 |
from langchain_groq import ChatGroq
|
| 17 |
from langchain_text_splitters import RecursiveCharacterTextSplitter
|
| 18 |
from pypdf import PdfReader
|
| 19 |
+
from requests.adapters import HTTPAdapter
|
| 20 |
from requests import RequestException
|
| 21 |
+
from requests.exceptions import SSLError
|
| 22 |
+
from urllib3.util.retry import Retry
|
| 23 |
from youtube_transcript_api import YouTubeTranscriptApi
|
| 24 |
|
| 25 |
|
| 26 |
load_dotenv()
|
| 27 |
|
| 28 |
+
APP_VERSION = "2026-04-23-hf-youtube-fix-2"
|
| 29 |
SAMPLE_YOUTUBE_URL = "https://youtu.be/ocBh08fjIfU"
|
| 30 |
LANGUAGE_OPTIONS = ["Original", "English", "Arabic", "French", "Bahasa Malay"]
|
| 31 |
LANGUAGE_CODE_MAP = {
|
|
|
|
| 40 |
"French": "French",
|
| 41 |
"Bahasa Malay": "Bahasa Melayu",
|
| 42 |
}
|
| 43 |
+
YOUTUBE_PROXY_ENV_VARS = (
|
| 44 |
+
"YOUTUBE_HTTP_PROXY",
|
| 45 |
+
"YOUTUBE_HTTPS_PROXY",
|
| 46 |
+
"HTTP_PROXY",
|
| 47 |
+
"HTTPS_PROXY",
|
| 48 |
+
)
|
| 49 |
|
| 50 |
st.set_page_config(page_title="Summarize Text From PDF, YouTube, Website", page_icon="๐")
|
| 51 |
st.title("๐ Summarize Text From PDF, YouTube, Website")
|
|
|
|
| 85 |
|
| 86 |
with st.sidebar:
|
| 87 |
st.header("Options")
|
| 88 |
+
st.caption(f"App version: `{APP_VERSION}`")
|
| 89 |
input_source_mode = st.radio(
|
| 90 |
"Content source",
|
| 91 |
options=["URL", "Upload documents", "Both"],
|
|
|
|
| 122 |
"`stuff` is fastest for short content, `map_reduce` is safer for long content, "
|
| 123 |
"and `refine` is useful when building a summary progressively across chunks."
|
| 124 |
)
|
| 125 |
+
if os.getenv("SPACE_ID"):
|
| 126 |
+
if _has_youtube_proxy_config():
|
| 127 |
+
st.info("Hugging Face Space detected. YouTube proxy configuration is present.")
|
| 128 |
+
else:
|
| 129 |
+
st.warning(
|
| 130 |
+
"Hugging Face Space detected. YouTube transcript loading may fail without "
|
| 131 |
+
"a proxy because YouTube often blocks datacenter IPs."
|
| 132 |
+
)
|
| 133 |
st.caption(f"Sample YouTube URL: `{SAMPLE_YOUTUBE_URL}`")
|
| 134 |
if st.button("Use sample YouTube URL"):
|
| 135 |
st.session_state.url_input = SAMPLE_YOUTUBE_URL
|
|
|
|
| 291 |
return translated_docs
|
| 292 |
|
| 293 |
|
| 294 |
+
def _has_youtube_proxy_config() -> bool:
|
| 295 |
+
return any(os.getenv(var_name) for var_name in YOUTUBE_PROXY_ENV_VARS)
|
| 296 |
+
|
| 297 |
+
|
| 298 |
+
def _build_youtube_http_client() -> requests.Session:
|
| 299 |
+
session = requests.Session()
|
| 300 |
+
session.headers.update(
|
| 301 |
+
{
|
| 302 |
+
"User-Agent": (
|
| 303 |
+
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
|
| 304 |
+
"(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
|
| 305 |
+
),
|
| 306 |
+
"Accept-Language": "en-US,en;q=0.9",
|
| 307 |
+
"Accept": "*/*",
|
| 308 |
+
}
|
| 309 |
+
)
|
| 310 |
+
|
| 311 |
+
retry_config = Retry(
|
| 312 |
+
total=3,
|
| 313 |
+
connect=3,
|
| 314 |
+
read=3,
|
| 315 |
+
backoff_factor=1,
|
| 316 |
+
status_forcelist=[429, 500, 502, 503, 504],
|
| 317 |
+
allowed_methods=["GET"],
|
| 318 |
+
raise_on_status=False,
|
| 319 |
+
)
|
| 320 |
+
adapter = HTTPAdapter(max_retries=retry_config)
|
| 321 |
+
session.mount("https://", adapter)
|
| 322 |
+
session.mount("http://", adapter)
|
| 323 |
+
|
| 324 |
+
if os.getenv("YOUTUBE_CA_BUNDLE"):
|
| 325 |
+
session.verify = os.getenv("YOUTUBE_CA_BUNDLE")
|
| 326 |
+
|
| 327 |
+
return session
|
| 328 |
+
|
| 329 |
+
|
| 330 |
+
def _build_youtube_transcript_api() -> YouTubeTranscriptApi:
|
| 331 |
+
return YouTubeTranscriptApi(http_client=_build_youtube_http_client())
|
| 332 |
+
|
| 333 |
+
|
| 334 |
+
def _format_youtube_transcript_error(error: Exception) -> str:
|
| 335 |
+
if isinstance(error, (SSLError, RequestException)):
|
| 336 |
+
proxy_hint = (
|
| 337 |
+
" Configure `YOUTUBE_HTTP_PROXY` / `YOUTUBE_HTTPS_PROXY` "
|
| 338 |
+
"or standard `HTTP_PROXY` / `HTTPS_PROXY` in the Space secrets."
|
| 339 |
+
if not _has_youtube_proxy_config()
|
| 340 |
+
else " Check that the configured outbound proxy is reachable from the Space."
|
| 341 |
+
)
|
| 342 |
+
return (
|
| 343 |
+
"[HF-YT-SSL-001] The deployment could not establish a stable HTTPS connection to YouTube. "
|
| 344 |
+
"This is common on cloud-hosted runtimes such as Hugging Face Spaces because "
|
| 345 |
+
"YouTube often blocks or interrupts traffic from datacenter IPs."
|
| 346 |
+
f"{proxy_hint}"
|
| 347 |
+
)
|
| 348 |
+
|
| 349 |
+
return str(error)
|
| 350 |
+
|
| 351 |
+
|
| 352 |
def _resolve_transcript(video_id: str, selected_language: str):
|
| 353 |
+
api = _build_youtube_transcript_api()
|
| 354 |
+
try:
|
| 355 |
+
transcript_list = api.list(video_id)
|
| 356 |
+
except Exception as exc:
|
| 357 |
+
raise RuntimeError(_format_youtube_transcript_error(exc)) from exc
|
| 358 |
available_transcripts = list(transcript_list)
|
| 359 |
|
| 360 |
if selected_language == "Original":
|
|
|
|
| 402 |
transcript, transcript_language_label = _resolve_transcript(video_id, "Original")
|
| 403 |
should_translate_with_llm = True
|
| 404 |
|
| 405 |
+
try:
|
| 406 |
+
fetched_transcript = transcript.fetch()
|
| 407 |
+
except Exception as exc:
|
| 408 |
+
raise RuntimeError(_format_youtube_transcript_error(exc)) from exc
|
| 409 |
transcript_text = " ".join(snippet.text.strip() for snippet in fetched_transcript if snippet.text.strip())
|
| 410 |
if not transcript_text:
|
| 411 |
raise ValueError("No transcript text could be extracted from this video.")
|