Codex committed on
Commit
494d4ff
·
1 Parent(s): a05e32e

Harden YouTube transcript loading on Hugging Face

Browse files
Files changed (2) hide show
  1. README.md +9 -0
  2. app.py +86 -3
README.md CHANGED
@@ -23,3 +23,12 @@ This Space runs a Streamlit app for summarizing:
23
  Add this secret in the Space settings:
24
 
25
  - `GROQ_API_KEY`
 
 
 
 
 
 
 
 
 
 
23
  Add this secret in the Space settings:
24
 
25
  - `GROQ_API_KEY`
26
+
27
+ ## YouTube On Hugging Face Spaces
28
+
29
+ YouTube transcript loading may work locally but fail on Hugging Face Spaces because YouTube frequently blocks or rate-limits datacenter IP ranges. The app now retries transient HTTPS failures and supports proxy configuration through Space secrets:
30
+
31
+ - `YOUTUBE_HTTP_PROXY`
32
+ - `YOUTUBE_HTTPS_PROXY`
33
+
34
+ You can also use the standard `HTTP_PROXY` and `HTTPS_PROXY` environment variables if that matches your setup.
app.py CHANGED
@@ -16,12 +16,16 @@ from langchain_core.prompts import PromptTemplate
16
  from langchain_groq import ChatGroq
17
  from langchain_text_splitters import RecursiveCharacterTextSplitter
18
  from pypdf import PdfReader
 
19
  from requests import RequestException
 
 
20
  from youtube_transcript_api import YouTubeTranscriptApi
21
 
22
 
23
  load_dotenv()
24
 
 
25
  SAMPLE_YOUTUBE_URL = "https://youtu.be/ocBh08fjIfU"
26
  LANGUAGE_OPTIONS = ["Original", "English", "Arabic", "French", "Bahasa Malay"]
27
  LANGUAGE_CODE_MAP = {
@@ -36,6 +40,12 @@ LANGUAGE_LABEL_MAP = {
36
  "French": "French",
37
  "Bahasa Malay": "Bahasa Melayu",
38
  }
 
 
 
 
 
 
39
 
40
  st.set_page_config(page_title="Summarize Text From PDF, YouTube, Website", page_icon="📝")
41
  st.title("📝 Summarize Text From PDF, YouTube, Website")
@@ -75,6 +85,7 @@ transcript_language = "Original"
75
 
76
  with st.sidebar:
77
  st.header("Options")
 
78
  input_source_mode = st.radio(
79
  "Content source",
80
  options=["URL", "Upload documents", "Both"],
@@ -111,6 +122,14 @@ with st.sidebar:
111
  "`stuff` is fastest for short content, `map_reduce` is safer for long content, "
112
  "and `refine` is useful when building a summary progressively across chunks."
113
  )
 
 
 
 
 
 
 
 
114
  st.caption(f"Sample YouTube URL: `{SAMPLE_YOUTUBE_URL}`")
115
  if st.button("Use sample YouTube URL"):
116
  st.session_state.url_input = SAMPLE_YOUTUBE_URL
@@ -272,9 +291,70 @@ def _translate_documents_with_llm(docs: list[Document], target_language: str) ->
272
  return translated_docs
273
 
274
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  def _resolve_transcript(video_id: str, selected_language: str):
276
- api = YouTubeTranscriptApi()
277
- transcript_list = api.list(video_id)
 
 
 
278
  available_transcripts = list(transcript_list)
279
 
280
  if selected_language == "Original":
@@ -322,7 +402,10 @@ def _load_youtube_documents(url: str, selected_language: str) -> list[Document]:
322
  transcript, transcript_language_label = _resolve_transcript(video_id, "Original")
323
  should_translate_with_llm = True
324
 
325
- fetched_transcript = transcript.fetch()
 
 
 
326
  transcript_text = " ".join(snippet.text.strip() for snippet in fetched_transcript if snippet.text.strip())
327
  if not transcript_text:
328
  raise ValueError("No transcript text could be extracted from this video.")
 
16
  from langchain_groq import ChatGroq
17
  from langchain_text_splitters import RecursiveCharacterTextSplitter
18
  from pypdf import PdfReader
19
+ from requests.adapters import HTTPAdapter
20
  from requests import RequestException
21
+ from requests.exceptions import SSLError
22
+ from urllib3.util.retry import Retry
23
  from youtube_transcript_api import YouTubeTranscriptApi
24
 
25
 
26
  load_dotenv()
27
 
28
+ APP_VERSION = "2026-04-23-hf-youtube-fix-2"
29
  SAMPLE_YOUTUBE_URL = "https://youtu.be/ocBh08fjIfU"
30
  LANGUAGE_OPTIONS = ["Original", "English", "Arabic", "French", "Bahasa Malay"]
31
  LANGUAGE_CODE_MAP = {
 
40
  "French": "French",
41
  "Bahasa Malay": "Bahasa Melayu",
42
  }
43
+ YOUTUBE_PROXY_ENV_VARS = (
44
+ "YOUTUBE_HTTP_PROXY",
45
+ "YOUTUBE_HTTPS_PROXY",
46
+ "HTTP_PROXY",
47
+ "HTTPS_PROXY",
48
+ )
49
 
50
  st.set_page_config(page_title="Summarize Text From PDF, YouTube, Website", page_icon="📝")
51
  st.title("📝 Summarize Text From PDF, YouTube, Website")
 
85
 
86
  with st.sidebar:
87
  st.header("Options")
88
+ st.caption(f"App version: `{APP_VERSION}`")
89
  input_source_mode = st.radio(
90
  "Content source",
91
  options=["URL", "Upload documents", "Both"],
 
122
  "`stuff` is fastest for short content, `map_reduce` is safer for long content, "
123
  "and `refine` is useful when building a summary progressively across chunks."
124
  )
125
+ if os.getenv("SPACE_ID"):
126
+ if _has_youtube_proxy_config():
127
+ st.info("Hugging Face Space detected. YouTube proxy configuration is present.")
128
+ else:
129
+ st.warning(
130
+ "Hugging Face Space detected. YouTube transcript loading may fail without "
131
+ "a proxy because YouTube often blocks datacenter IPs."
132
+ )
133
  st.caption(f"Sample YouTube URL: `{SAMPLE_YOUTUBE_URL}`")
134
  if st.button("Use sample YouTube URL"):
135
  st.session_state.url_input = SAMPLE_YOUTUBE_URL
 
291
  return translated_docs
292
 
293
 
294
+ def _has_youtube_proxy_config() -> bool:
295
+ return any(os.getenv(var_name) for var_name in YOUTUBE_PROXY_ENV_VARS)
296
+
297
+
298
+ def _build_youtube_http_client() -> requests.Session:
299
+ session = requests.Session()
300
+ session.headers.update(
301
+ {
302
+ "User-Agent": (
303
+ "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
304
+ "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
305
+ ),
306
+ "Accept-Language": "en-US,en;q=0.9",
307
+ "Accept": "*/*",
308
+ }
309
+ )
310
+
311
+ retry_config = Retry(
312
+ total=3,
313
+ connect=3,
314
+ read=3,
315
+ backoff_factor=1,
316
+ status_forcelist=[429, 500, 502, 503, 504],
317
+ allowed_methods=["GET"],
318
+ raise_on_status=False,
319
+ )
320
+ adapter = HTTPAdapter(max_retries=retry_config)
321
+ session.mount("https://", adapter)
322
+ session.mount("http://", adapter)
323
+
324
+ if os.getenv("YOUTUBE_CA_BUNDLE"):
325
+ session.verify = os.getenv("YOUTUBE_CA_BUNDLE")
326
+
327
+ return session
328
+
329
+
330
+ def _build_youtube_transcript_api() -> YouTubeTranscriptApi:
331
+ return YouTubeTranscriptApi(http_client=_build_youtube_http_client())
332
+
333
+
334
+ def _format_youtube_transcript_error(error: Exception) -> str:
335
+ if isinstance(error, (SSLError, RequestException)):
336
+ proxy_hint = (
337
+ " Configure `YOUTUBE_HTTP_PROXY` / `YOUTUBE_HTTPS_PROXY` "
338
+ "or standard `HTTP_PROXY` / `HTTPS_PROXY` in the Space secrets."
339
+ if not _has_youtube_proxy_config()
340
+ else " Check that the configured outbound proxy is reachable from the Space."
341
+ )
342
+ return (
343
+ "[HF-YT-SSL-001] The deployment could not establish a stable HTTPS connection to YouTube. "
344
+ "This is common on cloud-hosted runtimes such as Hugging Face Spaces because "
345
+ "YouTube often blocks or interrupts traffic from datacenter IPs."
346
+ f"{proxy_hint}"
347
+ )
348
+
349
+ return str(error)
350
+
351
+
352
  def _resolve_transcript(video_id: str, selected_language: str):
353
+ api = _build_youtube_transcript_api()
354
+ try:
355
+ transcript_list = api.list(video_id)
356
+ except Exception as exc:
357
+ raise RuntimeError(_format_youtube_transcript_error(exc)) from exc
358
  available_transcripts = list(transcript_list)
359
 
360
  if selected_language == "Original":
 
402
  transcript, transcript_language_label = _resolve_transcript(video_id, "Original")
403
  should_translate_with_llm = True
404
 
405
+ try:
406
+ fetched_transcript = transcript.fetch()
407
+ except Exception as exc:
408
+ raise RuntimeError(_format_youtube_transcript_error(exc)) from exc
409
  transcript_text = " ".join(snippet.text.strip() for snippet in fetched_transcript if snippet.text.strip())
410
  if not transcript_text:
411
  raise ValueError("No transcript text could be extracted from this video.")