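"""Streamlit app that summarizes PDFs, YouTube videos, and websites with Groq.

YouTube transcripts are fetched with layered fallbacks (direct transcript API,
an external transcript service, yt-dlp + Groq audio transcription, and manual
paste/upload) so the app keeps working on hosted runtimes where YouTube blocks
datacenter IPs.
"""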
import os
import tempfile
from io import BytesIO
from urllib.parse import quote_plus, urlparse
from xml.etree import ElementTree as ET
from zipfile import ZipFile

import requests
import streamlit as st
import validators
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain_classic.chains.summarize import load_summarize_chain
from langchain_community.document_loaders import UnstructuredURLLoader, YoutubeLoader
from langchain_core.documents import Document
from langchain_core.prompts import PromptTemplate
from langchain_groq import ChatGroq
from langchain_text_splitters import RecursiveCharacterTextSplitter
from pypdf import PdfReader
from requests import RequestException
from requests.adapters import HTTPAdapter
from requests.exceptions import SSLError
from urllib3.util.retry import Retry
from youtube_transcript_api import YouTubeTranscriptApi
load_dotenv()

APP_VERSION = "2026-04-23-hf-youtube-fallbacks-1"
SAMPLE_YOUTUBE_URL = "https://youtu.be/ocBh08fjIfU"
LANGUAGE_OPTIONS = ["Original", "English", "Arabic", "French", "Bahasa Malay"]
LANGUAGE_CODE_MAP = {
    "English": "en",
    "Arabic": "ar",
    "French": "fr",
    "Bahasa Malay": "ms",
}
LANGUAGE_LABEL_MAP = {
    "English": "English",
    "Arabic": "Arabic",
    "French": "French",
    "Bahasa Malay": "Bahasa Melayu",
}
YOUTUBE_PROXY_ENV_VARS = (
    "YOUTUBE_HTTP_PROXY",
    "YOUTUBE_HTTPS_PROXY",
    "HTTP_PROXY",
    "HTTPS_PROXY",
)
YOUTUBE_AUDIO_EXTENSIONS = (".m4a", ".mp3", ".mp4", ".mpeg", ".mpga", ".ogg", ".wav", ".webm")
def _is_youtube_url(url: str) -> bool:
    # Match the host exactly or as a subdomain; a bare substring check would
    # also accept unrelated hosts such as "notyoutube.com".
    host = urlparse(url).netloc.lower()
    return host in {"youtube.com", "youtu.be"} or host.endswith((".youtube.com", ".youtu.be"))
st.set_page_config(page_title="Summarize Text From PDF, YouTube, Website", page_icon="📝")
st.title("📝 Summarize Text From PDF, YouTube, Website")
# st.subheader("Summarize URL")
st.markdown(
    """
    <style>
    .source-section-label {
        font-size: 1rem;
        font-weight: 600;
        margin-top: 0.35rem;
        margin-bottom: 0.3rem;
    }
    </style>
    """,
    unsafe_allow_html=True,
)
groq_api_key = os.getenv("GROQ_API_KEY", "")

if "url_input" not in st.session_state:
    st.session_state.url_input = ""
if "summary_word_limit" not in st.session_state:
    st.session_state.summary_word_limit = 400
if "youtube_transcript_text" not in st.session_state:
    st.session_state.youtube_transcript_text = ""
if "youtube_transcript_name" not in st.session_state:
    st.session_state.youtube_transcript_name = "youtube_transcript.txt"
if "youtube_transcript_source_url" not in st.session_state:
    st.session_state.youtube_transcript_source_url = ""
if "youtube_transcript_language_label" not in st.session_state:
    st.session_state.youtube_transcript_language_label = "Original"
if "youtube_transcript_source_mode" not in st.session_state:
    st.session_state.youtube_transcript_source_mode = ""

summary_language = "Original"
transcript_language = "Original"
with st.sidebar:
    st.header("Options")
    st.caption(f"App version: `{APP_VERSION}`")
    input_source_mode = st.radio(
        "Content source",
        options=["URL", "Upload documents", "Both"],
        index=0,
        help="Choose which source the app should use for summarization.",
    )
    summary_word_limit = st.slider(
        "Summary word limit",
        min_value=100,
        max_value=1500,
        step=50,
        key="summary_word_limit",
        help="Increase or decrease the target length of the summary.",
    )
    # summary_language = st.selectbox(
    #     "Summary language",
    #     options=LANGUAGE_OPTIONS,
    #     index=0,
    #     help="Choose the language for the generated summary. `Original` keeps the source language when possible.",
    # )
    # transcript_language = st.selectbox(
    #     "Transcript language",
    #     options=LANGUAGE_OPTIONS,
    #     index=0,
    #     help="Choose the language used for YouTube transcript fetching/export. `Original` keeps the available source transcript language.",
    # )
    selected_chain_type = st.radio(
        "Summarization method",
        options=["auto", "stuff", "map_reduce", "refine"],
        index=0,
        help="`auto` picks the best method based on content size and will upgrade if a simpler method is not a good fit.",
    )
    st.caption(
        "`stuff` is fastest for short content, `map_reduce` is safer for long content, "
        "and `refine` is useful when building a summary progressively across chunks."
    )
    if os.getenv("SPACE_ID"):
        if any(os.getenv(var_name) for var_name in YOUTUBE_PROXY_ENV_VARS):
            st.info("Hugging Face Space detected. YouTube proxy configuration is present.")
        else:
            st.warning(
                "Hugging Face Space detected. YouTube transcript loading may fail without "
                "a proxy because YouTube often blocks datacenter IPs."
            )
    st.caption(f"Sample YouTube URL: `{SAMPLE_YOUTUBE_URL}`")
    if st.button("Use sample YouTube URL"):
        st.session_state.url_input = SAMPLE_YOUTUBE_URL
generic_url = ""
uploaded_files = []
youtube_source_mode = "Auto"
manual_transcript_text = ""
manual_transcript_file = None

if input_source_mode in {"URL", "Both"}:
    st.markdown('<div class="source-section-label">Summarize URL</div>', unsafe_allow_html=True)
    generic_url = st.text_input(
        "URL",
        key="url_input",
        label_visibility="collapsed",
        placeholder=f"Paste a YouTube or website URL, or try {SAMPLE_YOUTUBE_URL}",
        help="Enter the full YouTube or website URL you want to summarize.",
    )

if input_source_mode in {"Upload documents", "Both"}:
    st.markdown('<div class="source-section-label">Upload documents</div>', unsafe_allow_html=True)
    uploaded_files = st.file_uploader(
        "Upload documents",
        type=["pdf", "txt", "md", "csv", "docx"],
        accept_multiple_files=True,
        label_visibility="collapsed",
        help="Upload one or more documents. Supported formats: PDF, TXT, MD, CSV, DOCX.",
    )
    if uploaded_files:
        st.caption(
            "Uploaded files: " + ", ".join(uploaded_file.name for uploaded_file in uploaded_files)
        )

if input_source_mode in {"URL", "Both"} and generic_url.strip() and _is_youtube_url(generic_url):
    st.markdown('<div class="source-section-label">YouTube Fallback Options</div>', unsafe_allow_html=True)
    youtube_source_mode = st.radio(
        "YouTube transcript source",
        options=[
            "Auto",
            "Direct transcript",
            "External transcript API",
            "Audio transcription (yt-dlp + Groq)",
            "Manual transcript",
        ],
        index=0,
        help=(
            "`Auto` tries direct transcript first, then external API, then yt-dlp + Groq audio transcription. "
            "`Manual transcript` lets you paste or upload transcript text."
        ),
    )
    if youtube_source_mode == "Manual transcript":
        manual_transcript_text = st.text_area(
            "Paste transcript",
            height=220,
            placeholder="Paste the YouTube transcript here if direct fetching is blocked.",
        )
        manual_transcript_file = st.file_uploader(
            "Upload transcript file",
            type=["txt", "md", "csv", "srt", "vtt"],
            help="Upload a transcript file to summarize when direct YouTube access is blocked.",
        )
    else:
        configured_modes = []
        if any(os.getenv(var_name) for var_name in YOUTUBE_PROXY_ENV_VARS):
            configured_modes.append("direct transcript via proxy")
        if os.getenv("YOUTUBE_TRANSCRIPT_API_URL"):
            configured_modes.append("external transcript API")
        configured_modes.append("audio transcription via yt-dlp + Groq")
        st.caption("Available fallbacks: " + ", ".join(configured_modes) + ".")
llm = ChatGroq(model="llama-3.1-8b-instant", groq_api_key=groq_api_key)

REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36",
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://www.google.com/",
}
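
# --- Prompt helpers ---
# Build language instructions and per-chain PromptTemplates from the sidebar
# settings so every summarization method targets the same word limit and
# output language.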
def _summary_language_instruction(selected_language: str) -> str:
    if selected_language == "Original":
        return (
            "Write the summary in the original language of the source content. "
            "If the source is mixed-language, use the dominant language."
        )
    return f"Write the summary in {LANGUAGE_LABEL_MAP[selected_language]}."


def _translation_language_instruction(selected_language: str) -> str:
    if selected_language == "Original":
        return "Keep the text in its original language."
    return f"Translate the text into {LANGUAGE_LABEL_MAP[selected_language]}."
def _get_summary_prompts(word_limit: int, selected_language: str) -> dict[str, PromptTemplate]:
    language_instruction = _summary_language_instruction(selected_language)
    stuff_prompt = PromptTemplate(
        template=(
            f"Provide a clear summary of the following content in about {word_limit} words.\n"
            "Focus on the main ideas, important details, and conclusions.\n"
            f"{language_instruction}\n"
            "Content:\n{text}"
        ),
        input_variables=["text"],
    )
    map_prompt = PromptTemplate(
        template=(
            "Write a concise summary of the following section.\n"
            f"{language_instruction}\n"
            "Content:\n{text}"
        ),
        input_variables=["text"],
    )
    combine_prompt = PromptTemplate(
        template=(
            f"Combine the following partial summaries into a final summary in about {word_limit} words.\n"
            "Keep the result coherent, non-repetitive, and focused on the most important points.\n"
            f"{language_instruction}\n"
            "Partial summaries:\n{text}"
        ),
        input_variables=["text"],
    )
    refine_question_prompt = PromptTemplate(
        template=(
            f"Provide an initial summary of the following content in about {word_limit} words.\n"
            f"{language_instruction}\n"
            "Content:\n{text}"
        ),
        input_variables=["text"],
    )
    refine_prompt = PromptTemplate(
        template=(
            "We already have an existing summary:\n{existing_answer}\n\n"
            "Refine it using the additional content below.\n"
            f"Keep the final summary close to {word_limit} words, avoid repetition, and preserve the most important details.\n"
            f"{language_instruction}\n"
            "Additional content:\n{text}"
        ),
        input_variables=["existing_answer", "text"],
    )
    return {
        "stuff": stuff_prompt,
        "map": map_prompt,
        "combine": combine_prompt,
        "refine_question": refine_question_prompt,
        "refine": refine_prompt,
    }
def _extract_summary_text(result) -> str:
    if isinstance(result, dict):
        return result.get("output_text") or result.get("text") or str(result)
    return str(result)
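
# Translation fallback: when a requested language has no native transcript,
# split the documents into ~2500-character chunks and run each through a
# "stuff" chain whose prompt asks the LLM to translate rather than summarize.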
def _translate_documents_with_llm(docs: list[Document], target_language: str) -> list[Document]:
    if target_language == "Original":
        return docs
    translation_prompt = PromptTemplate(
        template=(
            f"{_translation_language_instruction(target_language)}\n"
            "Preserve the meaning faithfully. Do not summarize. Return only the translated text.\n"
            "Text:\n{text}"
        ),
        input_variables=["text"],
    )
    translation_chain = load_summarize_chain(
        llm,
        chain_type="stuff",
        prompt=translation_prompt,
    )
    splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=200)
    translated_docs: list[Document] = []
    for doc in docs:
        chunks = splitter.split_documents([doc])
        translated_chunks = []
        for chunk in chunks:
            translated_text = _extract_summary_text(
                translation_chain.invoke({"input_documents": [chunk]})
            )
            translated_chunks.append(translated_text.strip())
        translated_docs.append(
            Document(
                page_content="\n\n".join(part for part in translated_chunks if part),
                metadata={
                    **doc.metadata,
                    "translated_to": target_language,
                },
            )
        )
    return translated_docs
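
# HTTP plumbing for youtube-transcript-api: a requests.Session with a browser
# User-Agent, bounded retries on transient status codes, and an optional custom
# CA bundle (`YOUTUBE_CA_BUNDLE`) for proxied deployments.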
def _build_youtube_http_client() -> requests.Session:
    session = requests.Session()
    session.headers.update(
        {
            "User-Agent": (
                "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
                "(KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"
            ),
            "Accept-Language": "en-US,en;q=0.9",
            "Accept": "*/*",
        }
    )
    retry_config = Retry(
        total=3,
        connect=3,
        read=3,
        backoff_factor=1,
        status_forcelist=[429, 500, 502, 503, 504],
        allowed_methods=["GET"],
        raise_on_status=False,
    )
    adapter = HTTPAdapter(max_retries=retry_config)
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    if os.getenv("YOUTUBE_CA_BUNDLE"):
        session.verify = os.getenv("YOUTUBE_CA_BUNDLE")
    return session


def _build_youtube_transcript_api() -> YouTubeTranscriptApi:
    return YouTubeTranscriptApi(http_client=_build_youtube_http_client())
def _looks_like_youtube_ssl_failure(error: Exception) -> bool:
    error_text = str(error)
    ssl_markers = (
        "HTTPSConnectionPool",
        "SSLError",
        "UNEXPECTED_EOF_WHILE_READING",
        "EOF occurred in violation of protocol",
        "Max retries exceeded with url",
    )
    return isinstance(error, (SSLError, RequestException)) or any(
        marker in error_text for marker in ssl_markers
    )


def _format_youtube_transcript_error(error: Exception) -> str:
    if _looks_like_youtube_ssl_failure(error):
        proxy_hint = (
            " Configure `YOUTUBE_HTTP_PROXY` / `YOUTUBE_HTTPS_PROXY` "
            "or standard `HTTP_PROXY` / `HTTPS_PROXY` in the Space secrets."
            if not any(os.getenv(var_name) for var_name in YOUTUBE_PROXY_ENV_VARS)
            else " Check that the configured outbound proxy is reachable from the Space."
        )
        return (
            "[HF-YT-SSL-001] The deployment could not establish a stable HTTPS connection to YouTube. "
            "This is common on cloud-hosted runtimes such as Hugging Face Spaces because "
            "YouTube often blocks or interrupts traffic from datacenter IPs."
            f"{proxy_hint}"
        )
    return str(error)
def _resolve_transcript(video_id: str, selected_language: str):
    api = _build_youtube_transcript_api()
    try:
        transcript_list = api.list(video_id)
    except Exception as exc:
        raise RuntimeError(_format_youtube_transcript_error(exc)) from exc
    available_transcripts = list(transcript_list)
    if not available_transcripts:
        raise ValueError("No transcript is available for this video.")
    if selected_language == "Original":
        return available_transcripts[0], "Original"
    target_language_code = LANGUAGE_CODE_MAP[selected_language]
    try:
        return transcript_list.find_transcript([target_language_code]), selected_language
    except Exception:
        for base_transcript in available_transcripts:
            if not base_transcript.is_translatable:
                continue
            try:
                return base_transcript.translate(target_language_code), selected_language
            except Exception:
                continue
    available_languages = ", ".join(
        sorted(
            {
                f"{transcript.language} ({transcript.language_code})"
                for transcript in available_transcripts
            }
        )
    )
    raise ValueError(
        f"Could not provide transcript in {selected_language}. "
        f"Available transcript languages: {available_languages}"
    )
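
# Direct transcript loader: resolve a transcript (native or YouTube-translated),
# fetch it, and wrap the text in a single Document. If the requested language
# is unavailable, fall back to the original transcript and translate it with
# the LLM.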
def _load_youtube_documents(url: str, selected_language: str) -> list[Document]:
    video_id = YoutubeLoader.extract_video_id(url)
    should_translate_with_llm = False
    try:
        transcript, transcript_language_label = _resolve_transcript(video_id, selected_language)
    except ValueError:
        if selected_language == "Original":
            raise
        transcript, transcript_language_label = _resolve_transcript(video_id, "Original")
        should_translate_with_llm = True
    try:
        fetched_transcript = transcript.fetch()
    except Exception as exc:
        raise RuntimeError(_format_youtube_transcript_error(exc)) from exc
    transcript_text = " ".join(
        snippet.text.strip() for snippet in fetched_transcript if snippet.text.strip()
    )
    if not transcript_text:
        raise ValueError("No transcript text could be extracted from this video.")
    docs = [
        Document(
            page_content=transcript_text,
            metadata={
                "source": url,
                "video_id": video_id,
                "language": fetched_transcript.language,
                "language_code": fetched_transcript.language_code,
                "is_generated": fetched_transcript.is_generated,
                "transcript_language_label": transcript_language_label,
            },
        )
    ]
    if should_translate_with_llm:
        docs = _translate_documents_with_llm(docs, selected_language)
        for doc in docs:
            doc.metadata["transcript_language_label"] = f"{selected_language} (LLM translated)"
    return docs
def _make_transcript_filename(url: str) -> str:
    video_id = YoutubeLoader.extract_video_id(url)
    return f"youtube_transcript_{video_id}.txt"


def _reset_youtube_transcript_state() -> None:
    st.session_state.youtube_transcript_text = ""
    st.session_state.youtube_transcript_name = "youtube_transcript.txt"
    st.session_state.youtube_transcript_source_url = ""
    st.session_state.youtube_transcript_language_label = "Original"
    st.session_state.youtube_transcript_source_mode = ""


def _store_youtube_transcript(url: str, docs: list[Document]) -> None:
    st.session_state.youtube_transcript_text = "\n\n".join(
        doc.page_content for doc in docs if doc.page_content.strip()
    )
    st.session_state.youtube_transcript_name = _make_transcript_filename(url)
    st.session_state.youtube_transcript_source_url = url
    st.session_state.youtube_transcript_language_label = docs[0].metadata.get(
        "transcript_language_label",
        docs[0].metadata.get("language", "Original"),
    )
    st.session_state.youtube_transcript_source_mode = docs[0].metadata.get(
        "transcript_source_mode",
        "Direct transcript",
    )


def _normalize_transcript_text(raw_text: str) -> str:
    lines = [line.strip() for line in raw_text.splitlines()]
    return "\n".join(line for line in lines if line)
def _read_uploaded_text_file(uploaded_file) -> str:
    return uploaded_file.getvalue().decode("utf-8", errors="ignore").strip()


def _build_transcript_documents(
    url: str,
    transcript_text: str,
    language_label: str,
    source_mode: str,
) -> list[Document]:
    normalized_text = _normalize_transcript_text(transcript_text)
    if not normalized_text:
        raise ValueError("Transcript text is empty.")
    return [
        Document(
            page_content=normalized_text,
            metadata={
                "source": url,
                "video_id": YoutubeLoader.extract_video_id(url),
                "transcript_language_label": language_label,
                "transcript_source_mode": source_mode,
            },
        )
    ]


def _load_manual_transcript_documents(
    url: str,
    selected_language: str,
    transcript_text: str,
    transcript_file,
) -> list[Document]:
    combined_parts = []
    if transcript_text.strip():
        combined_parts.append(transcript_text.strip())
    if transcript_file is not None:
        combined_parts.append(_read_uploaded_text_file(transcript_file))
    combined_text = "\n\n".join(part for part in combined_parts if part.strip())
    if not combined_text.strip():
        raise ValueError("Please paste a transcript or upload a transcript file.")
    docs = _build_transcript_documents(
        url,
        combined_text,
        "Original",
        "Manual transcript",
    )
    if selected_language != "Original":
        docs = _translate_documents_with_llm(docs, selected_language)
        for doc in docs:
            doc.metadata["transcript_language_label"] = f"{selected_language} (LLM translated)"
    return docs
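
# External API responses vary by provider, so this walks the payload
# recursively: plain strings, lists of segments, and dicts keyed by common
# field names ("text", "transcript", "segments", ...) are all accepted.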
def _extract_transcript_text_from_payload(payload) -> str:
    if isinstance(payload, str):
        return payload.strip()
    if isinstance(payload, list):
        text_parts = []
        for item in payload:
            extracted = _extract_transcript_text_from_payload(item)
            if extracted:
                text_parts.append(extracted)
        return "\n".join(text_parts)
    if isinstance(payload, dict):
        for key in ("text", "transcript", "content", "full_text", "body"):
            value = payload.get(key)
            if isinstance(value, str) and value.strip():
                return value.strip()
        for key in ("data", "result", "results", "transcription", "response"):
            if key in payload:
                extracted = _extract_transcript_text_from_payload(payload[key])
                if extracted:
                    return extracted
        for key in ("segments", "items", "captions", "chunks", "utterances"):
            value = payload.get(key)
            if isinstance(value, list):
                extracted = _extract_transcript_text_from_payload(value)
                if extracted:
                    return extracted
    return ""
def _load_youtube_documents_via_external_api(url: str, selected_language: str) -> list[Document]:
    api_url = os.getenv("YOUTUBE_TRANSCRIPT_API_URL", "").strip()
    if not api_url:
        raise ValueError(
            "External transcript API is not configured. Set `YOUTUBE_TRANSCRIPT_API_URL` in Space secrets."
        )
    video_id = YoutubeLoader.extract_video_id(url)
    language_code = LANGUAGE_CODE_MAP.get(selected_language, "")
    formatted_url = api_url.format(
        video_id=video_id,
        url=quote_plus(url),
        language_code=language_code,
    )
    method = os.getenv("YOUTUBE_TRANSCRIPT_API_METHOD", "GET").strip().upper()
    timeout_seconds = int(os.getenv("YOUTUBE_TRANSCRIPT_API_TIMEOUT", "45"))
    api_key = os.getenv("YOUTUBE_TRANSCRIPT_API_KEY", "").strip()
    api_key_header = os.getenv("YOUTUBE_TRANSCRIPT_API_KEY_HEADER", "Authorization").strip()
    headers = {"Accept": "application/json"}
    if api_key:
        if api_key_header.lower() == "authorization":
            headers[api_key_header] = f"Bearer {api_key}"
        else:
            headers[api_key_header] = api_key
    payload = {
        "video_id": video_id,
        "url": url,
        "language": language_code or None,
    }
    if method == "POST":
        response = requests.post(formatted_url, json=payload, headers=headers, timeout=timeout_seconds)
    else:
        response = requests.get(formatted_url, params=payload, headers=headers, timeout=timeout_seconds)
    response.raise_for_status()
    try:
        parsed_payload = response.json()
    except ValueError:
        parsed_payload = response.text
    transcript_text = _extract_transcript_text_from_payload(parsed_payload)
    if not transcript_text:
        raise ValueError("External transcript API response did not contain usable transcript text.")
    return _build_transcript_documents(
        url,
        transcript_text,
        selected_language,
        "External transcript API",
    )
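
# Audio fallback: download the best available audio stream with yt-dlp into a
# temporary directory, keep the largest matching file, and persist a copy that
# outlives the TemporaryDirectory so Groq can read it afterwards.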
def _download_youtube_audio(url: str, video_id: str) -> str:
    try:
        import yt_dlp
    except ImportError as exc:
        raise RuntimeError("`yt-dlp` is not installed in this Space build.") from exc
    with tempfile.TemporaryDirectory() as temp_dir:
        output_template = os.path.join(temp_dir, f"{video_id}.%(ext)s")
        ydl_opts = {
            "format": "bestaudio[ext=m4a]/bestaudio[ext=webm]/bestaudio/best",
            "outtmpl": output_template,
            "quiet": True,
            "no_warnings": True,
            "noprogress": True,
            "skip_download": False,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            ydl.extract_info(url, download=True)
        audio_files = [
            os.path.join(temp_dir, file_name)
            for file_name in os.listdir(temp_dir)
            if os.path.splitext(file_name)[1].lower() in YOUTUBE_AUDIO_EXTENSIONS
        ]
        if not audio_files:
            raise RuntimeError("yt-dlp did not produce a supported audio file for transcription.")
        source_path = max(audio_files, key=os.path.getsize)
        persisted_path = os.path.join(tempfile.gettempdir(), os.path.basename(source_path))
        with open(source_path, "rb") as source_file, open(persisted_path, "wb") as target_file:
            target_file.write(source_file.read())
        return persisted_path
def _transcribe_audio_with_groq(audio_path: str, selected_language: str) -> str:
    if not groq_api_key.strip():
        raise ValueError("`GROQ_API_KEY` is required for audio transcription fallback.")
    model_name = os.getenv("GROQ_AUDIO_TRANSCRIPTION_MODEL", "whisper-large-v3-turbo")
    payload = {
        "model": model_name,
        "response_format": "json",
        "temperature": "0",
    }
    if selected_language != "Original":
        payload["language"] = LANGUAGE_CODE_MAP[selected_language]
    with open(audio_path, "rb") as audio_file:
        response = requests.post(
            "https://api.groq.com/openai/v1/audio/transcriptions",
            headers={"Authorization": f"Bearer {groq_api_key}"},
            data=payload,
            files={"file": (os.path.basename(audio_path), audio_file)},
            timeout=300,
        )
    response.raise_for_status()
    transcript_text = response.json().get("text", "").strip()
    if not transcript_text:
        raise ValueError("Groq audio transcription returned empty text.")
    return transcript_text
def _load_youtube_documents_via_audio_transcription(url: str, selected_language: str) -> list[Document]:
    video_id = YoutubeLoader.extract_video_id(url)
    audio_path = _download_youtube_audio(url, video_id)
    try:
        transcript_text = _transcribe_audio_with_groq(audio_path, selected_language)
    finally:
        if os.path.exists(audio_path):
            os.remove(audio_path)
    return _build_transcript_documents(
        url,
        transcript_text,
        selected_language,
        "Audio transcription (yt-dlp + Groq)",
    )
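
# Fallback orchestrator: run the strategies allowed by `source_mode` in order
# (direct transcript -> external API -> audio transcription) and return the
# first success; in Auto mode any pasted/uploaded transcript is the last resort.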
def _load_youtube_documents_with_fallbacks(
    url: str,
    selected_language: str,
    source_mode: str,
    transcript_text: str,
    transcript_file,
) -> list[Document]:
    if source_mode == "Manual transcript":
        return _load_manual_transcript_documents(url, selected_language, transcript_text, transcript_file)
    strategies = []
    if source_mode in {"Auto", "Direct transcript"}:
        strategies.append(("Direct transcript", lambda: _load_youtube_documents(url, selected_language)))
    if source_mode in {"Auto", "External transcript API"}:
        strategies.append(
            ("External transcript API", lambda: _load_youtube_documents_via_external_api(url, selected_language))
        )
    if source_mode in {"Auto", "Audio transcription (yt-dlp + Groq)"}:
        strategies.append(
            (
                "Audio transcription (yt-dlp + Groq)",
                lambda: _load_youtube_documents_via_audio_transcription(url, selected_language),
            )
        )
    failures = []
    for strategy_name, loader in strategies:
        try:
            return loader()
        except Exception as exc:
            failures.append(f"{strategy_name}: {exc}")
    if source_mode == "Auto" and (transcript_text.strip() or transcript_file is not None):
        return _load_manual_transcript_documents(url, selected_language, transcript_text, transcript_file)
    if not failures:
        raise ValueError("No YouTube transcript strategy is available for the selected mode.")
    raise RuntimeError("All YouTube transcript strategies failed.\n" + "\n".join(failures))
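
# Website extraction: try UnstructuredURLLoader first; if it errors or yields
# too little text, fall back to a raw requests fetch plus BeautifulSoup,
# preferring <main>/<article> regions and the meta description.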
def _has_meaningful_content(docs: list[Document], min_chars: int = 300) -> bool:
    combined_text = " ".join(doc.page_content.strip() for doc in docs if doc.page_content.strip())
    return len(combined_text) >= min_chars


def _extract_text_from_html(html: str) -> str:
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()
    meta_description = ""
    meta_tag = soup.find("meta", attrs={"name": "description"})
    if meta_tag and meta_tag.get("content"):
        meta_description = meta_tag["content"].strip()
    main_candidates = soup.select("main, article, [role='main'], .content, .article-body")
    text_parts = []
    for candidate in main_candidates:
        candidate_text = " ".join(candidate.stripped_strings)
        if len(candidate_text) > 200:
            text_parts.append(candidate_text)
    if not text_parts:
        body_text = " ".join(soup.stripped_strings)
        if body_text:
            text_parts.append(body_text)
    if meta_description:
        text_parts.insert(0, meta_description)
    return "\n\n".join(dict.fromkeys(part for part in text_parts if part))
def _load_web_documents(url: str) -> list[Document]:
    try:
        loader = UnstructuredURLLoader(
            urls=[url],
            ssl_verify=False,
            headers=REQUEST_HEADERS,
        )
        docs = loader.load()
        if _has_meaningful_content(docs):
            return docs
    except Exception as loader_error:
        last_error = loader_error
    else:
        last_error = ValueError("Primary URL loader returned too little readable content.")
    session = requests.Session()
    # dict.fromkeys dedupes while preserving order, so a URL without a
    # trailing slash is not fetched twice.
    for candidate_url in dict.fromkeys([url, url.rstrip("/")]):
        if not candidate_url:
            continue
        try:
            response = session.get(
                candidate_url,
                headers=REQUEST_HEADERS,
                timeout=20,
                verify=False,
                allow_redirects=True,
            )
            response.encoding = response.encoding or response.apparent_encoding or "utf-8"
            if not response.text.strip():
                continue
            text = _extract_text_from_html(response.text)
            if not text or len(text) < 300:
                continue
            soup = BeautifulSoup(response.text, "html.parser")
            title = soup.title.string.strip() if soup.title and soup.title.string else candidate_url
            st.info("Primary URL loader failed or returned too little content. Used HTML fallback extraction instead.")
            return [
                Document(
                    page_content=text,
                    metadata={
                        "source": candidate_url,
                        "title": title,
                        "http_status": response.status_code,
                    },
                )
            ]
        except RequestException as request_error:
            last_error = request_error
    raise ValueError(
        f"Could not load readable text from the URL. Last loader error: {last_error}"
    )
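
# Uploaded-file parsing: PDFs via pypdf (one Document per page), plain-text
# formats decoded as UTF-8, and DOCX read directly from word/document.xml so
# no extra dependency is needed.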
def _load_uploaded_documents(files) -> list[Document]:
    docs: list[Document] = []
    for uploaded_file in files:
        file_name = uploaded_file.name
        extension = os.path.splitext(file_name)[1].lower()
        file_bytes = uploaded_file.getvalue()
        if extension == ".pdf":
            reader = PdfReader(BytesIO(file_bytes))
            pages = []
            for page_number, page in enumerate(reader.pages, start=1):
                page_text = (page.extract_text() or "").strip()
                if page_text:
                    pages.append(
                        Document(
                            page_content=page_text,
                            metadata={
                                "source": file_name,
                                "page": page_number,
                                "type": "uploaded_file",
                            },
                        )
                    )
            docs.extend(pages)
            continue
        if extension in {".txt", ".md", ".csv"}:
            text = file_bytes.decode("utf-8", errors="ignore").strip()
            if text:
                docs.append(
                    Document(
                        page_content=text,
                        metadata={"source": file_name, "type": "uploaded_file"},
                    )
                )
            continue
        if extension == ".docx":
            with ZipFile(BytesIO(file_bytes)) as docx_zip:
                document_xml = docx_zip.read("word/document.xml")
            root = ET.fromstring(document_xml)
            namespace = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
            paragraphs = []
            for paragraph in root.findall(".//w:p", namespace):
                texts = [
                    node.text
                    for node in paragraph.findall(".//w:t", namespace)
                    if node.text
                ]
                paragraph_text = "".join(texts).strip()
                if paragraph_text:
                    paragraphs.append(paragraph_text)
            text = "\n\n".join(paragraphs).strip()
            if text:
                docs.append(
                    Document(
                        page_content=text,
                        metadata={"source": file_name, "type": "uploaded_file"},
                    )
                )
            continue
        raise ValueError(f"Unsupported file type: {file_name}")
    return docs
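
# Chain construction and sizing: each LangChain summarize chain type gets the
# prompts built above, documents are re-split into 2000-character chunks, and
# the chunk count is capped per method to keep latency and token usage bounded.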
def _build_chain(selected_chain_type: str):
    prompts = _get_summary_prompts(summary_word_limit, summary_language)
    if selected_chain_type == "stuff":
        return load_summarize_chain(llm, chain_type="stuff", prompt=prompts["stuff"])
    if selected_chain_type == "map_reduce":
        return load_summarize_chain(
            llm,
            chain_type="map_reduce",
            map_prompt=prompts["map"],
            combine_prompt=prompts["combine"],
        )
    return load_summarize_chain(
        llm,
        chain_type="refine",
        question_prompt=prompts["refine_question"],
        refine_prompt=prompts["refine"],
    )


def _prepare_summary_documents(docs: list[Document], selected_chain_type: str) -> list[Document]:
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)
    if selected_chain_type == "stuff":
        return split_docs[:3]
    if selected_chain_type == "refine":
        return split_docs[:10]
    return split_docs[:8]
def _choose_effective_chain_type(requested_chain_type: str, docs: list[Document]) -> tuple[str, str | None]:
    splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
    split_docs = splitter.split_documents(docs)
    chunk_count = len(split_docs)
    total_chars = sum(len(doc.page_content) for doc in split_docs)
    if chunk_count <= 3 and total_chars <= 6000:
        recommended = "stuff"
    elif chunk_count <= 10:
        recommended = "refine"
    else:
        recommended = "map_reduce"
    if requested_chain_type == "auto":
        return recommended, f"Auto-selected `{recommended}` based on content size."
    if requested_chain_type == "stuff" and recommended != "stuff":
        return recommended, (
            f"Switched from `stuff` to `{recommended}` because the content is too large "
            "for a reliable single-pass summary."
        )
    if requested_chain_type == "refine" and chunk_count > 12:
        return "map_reduce", (
            "Switched from `refine` to `map_reduce` because the content is large enough "
            "that map-reduce is more reliable."
        )
    return requested_chain_type, None
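
# --- UI actions ---
# YouTube URLs get an inline player plus fetch/export controls for the
# transcript; the "Summarize content" button validates inputs, loads documents
# from the selected sources, picks a chain type, and renders the summary.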
if input_source_mode in {"URL", "Both"} and _is_youtube_url(generic_url):
    st.video(generic_url)
    transcript_col, export_col = st.columns(2)
    with transcript_col:
        if st.button("Fetch transcript"):
            if not generic_url.strip():
                st.error("Please enter a YouTube URL.")
            elif not validators.url(generic_url):
                st.error("Please enter a valid YouTube URL.")
            else:
                try:
                    with st.spinner("Loading transcript..."):
                        docs = _load_youtube_documents_with_fallbacks(
                            generic_url,
                            transcript_language,
                            youtube_source_mode,
                            manual_transcript_text,
                            manual_transcript_file,
                        )
                    if not docs:
                        st.error("No transcript could be extracted from the provided YouTube video.")
                    else:
                        _store_youtube_transcript(generic_url, docs)
                        st.success(
                            "Transcript ready for export in "
                            f"{st.session_state.youtube_transcript_language_label} "
                            f"via {st.session_state.youtube_transcript_source_mode}."
                        )
                except Exception as transcript_err:
                    st.error(f"Failed to load YouTube transcript: {transcript_err}")
    with export_col:
        if (
            st.session_state.youtube_transcript_text
            and st.session_state.youtube_transcript_source_url == generic_url
        ):
            st.caption(
                "Prepared transcript: "
                f"`{st.session_state.youtube_transcript_language_label}` via "
                f"`{st.session_state.youtube_transcript_source_mode}`"
            )
            st.download_button(
                "Export transcript",
                data=st.session_state.youtube_transcript_text,
                file_name=st.session_state.youtube_transcript_name,
                mime="text/plain",
            )
if st.button("Summarize content"):
    if not groq_api_key.strip():
        st.error("`GROQ_API_KEY` is not set. Add it to the environment or Space secrets before summarizing.")
    elif input_source_mode == "URL" and not generic_url.strip():
        st.error("Content source is `URL`, so please provide a URL.")
    elif input_source_mode == "Upload documents" and not uploaded_files:
        st.error("Content source is `Upload documents`, so please upload at least one file.")
    elif input_source_mode == "Both" and (not generic_url.strip() or not uploaded_files):
        st.error("Content source is `Both`, so please provide a URL and upload at least one file.")
    elif generic_url.strip() and not validators.url(generic_url):
        st.error("Please enter a valid URL when using the URL field.")
    else:
        try:
            with st.spinner("Summarizing content..."):
                docs: list[Document] = []
                if input_source_mode in {"URL", "Both"} and generic_url.strip():
                    if _is_youtube_url(generic_url):
                        try:
                            url_docs = _load_youtube_documents_with_fallbacks(
                                generic_url,
                                transcript_language,
                                youtube_source_mode,
                                manual_transcript_text,
                                manual_transcript_file,
                            )
                            _store_youtube_transcript(generic_url, url_docs)
                        except Exception as load_err:
                            st.error(f"Failed to load YouTube transcript: {load_err}")
                            st.stop()
                    else:
                        _reset_youtube_transcript_state()
                        try:
                            url_docs = _load_web_documents(generic_url)
                        except Exception as load_err:
                            st.error(f"Failed to fetch URL content: {load_err}")
                            st.stop()
                    docs.extend(url_docs)
                else:
                    _reset_youtube_transcript_state()
                if input_source_mode in {"Upload documents", "Both"} and uploaded_files:
                    try:
                        uploaded_docs = _load_uploaded_documents(uploaded_files)
                    except Exception as load_err:
                        st.error(f"Failed to read uploaded document(s): {load_err}")
                        st.stop()
                    docs.extend(uploaded_docs)
                if input_source_mode == "Both" and generic_url.strip() and uploaded_files:
                    st.info("Summarizing combined content from the URL and uploaded documents.")
                if not docs:
                    st.error("No content could be extracted from the selected source.")
                    st.stop()
                effective_chain_type, chain_message = _choose_effective_chain_type(
                    selected_chain_type,
                    docs,
                )
                if chain_message:
                    st.info(chain_message)
                docs_for_summary = _prepare_summary_documents(docs, effective_chain_type)
                chain = _build_chain(effective_chain_type)
                output_summary = _extract_summary_text(
                    chain.invoke({"input_documents": docs_for_summary})
                )
            st.success(output_summary)
        except Exception as e:
            st.error(f"Summarization failed: {e}")