Spaces:

msaid1976
/

Text_Summarization

Sleeping

App Files Files Community

Codex committed on Apr 23

Commit

6a9bc08

1 Parent(s): 78bc895

Deploy text summarization app

Browse files

Files changed (3) hide show

README.md +16 -10
requirements.txt +15 -34
src/streamlit_app.py +692 -38

README.md CHANGED Viewed

@@ -1,19 +1,25 @@
 ---
 title: Text Summarization
-emoji: 🚀
-colorFrom: red
-colorTo: red
 sdk: docker
 app_port: 8501
-tags:
-- streamlit
 pinned: false
-short_description: Summarize Text From PDF, YouTube, Website
 ---
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

 ---
 title: Text Summarization
+emoji: 📝
+colorFrom: blue
+colorTo: indigo
 sdk: docker
 app_port: 8501
 pinned: false
+license: mit
+short_description: Summarize YouTube videos, webpages, and uploaded documents with LangChain and Groq.
 ---
+# Text Summarization
+This Space runs a Streamlit app for summarizing:
+- YouTube videos
+- website URLs
+- uploaded PDF, TXT, MD, CSV, and DOCX files
+## Required Secret
+Add this secret in the Space settings:
+- `GROQ_API_KEY`

requirements.txt CHANGED Viewed

@@ -1,35 +1,16 @@
-altair
-pandas
-streamlit
-langchain
-python-dotenv
-ipykernel
-langchain-community
-pypdf
-bs4
-arxiv
-pymupdf
-wikipedia
-langchain-text-splitters
-langchain-openai
-chromadb
-sentence_transformers
-langchain_huggingface
-faiss-cpu
-langchain_chroma
-duckdb
-pandas
-openai
-langchain-groq
-duckduckgo_search==5.3.1b1
-pymupdf
-arxiv
-wikipedia
-mysql-connector-python
-SQLAlchemy
 validators==0.28.1
-youtube_transcript_api
-unstructured
-pytube
-numexpr
-huggingface_hub

+streamlit>=1.44.0
+python-dotenv>=1.0.1
 validators==0.28.1
+requests>=2.32.0
+bs4>=0.0.2
+pypdf>=6.0.0
+langchain>=1.2.15
+langchain-community>=0.4.1
+langchain-classic>=1.0.4
+langchain-groq>=1.1.2
+langchain-text-splitters>=1.1.2
+youtube-transcript-api>=1.2.4
+unstructured>=0.22.22
+pytube>=15.0.0

src/streamlit_app.py CHANGED Viewed

@@ -1,40 +1,694 @@
-import altair as alt
-import numpy as np
-import pandas as pd
 import streamlit as st
-"""
-# Welcome to Streamlit!
-Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
-In the meantime, below is an example of what you can do with just a few lines of code:
-"""
-num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
-num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
-indices = np.linspace(0, 1, num_points)
-theta = 2 * np.pi * num_turns * indices
-radius = indices
-x = radius * np.cos(theta)
-y = radius * np.sin(theta)
-df = pd.DataFrame({
-    "x": x,
-    "y": y,
-    "idx": indices,
-    "rand": np.random.randn(num_points),
-})
-st.altair_chart(alt.Chart(df, height=700, width=700)
-    .mark_point(filled=True)
-    .encode(
-        x=alt.X("x", axis=None),
-        y=alt.Y("y", axis=None),
-        color=alt.Color("idx", legend=None, scale=alt.Scale()),
-        size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
-    ))

+import os
+from io import BytesIO
+from urllib.parse import urlparse
+from xml.etree import ElementTree as ET
+from zipfile import ZipFile
+import requests
 import streamlit as st
+import validators
+from bs4 import BeautifulSoup
+from dotenv import load_dotenv
+from langchain_classic.chains.summarize import load_summarize_chain
+from langchain_community.document_loaders import UnstructuredURLLoader, YoutubeLoader
+from langchain_core.documents import Document
+from langchain_core.prompts import PromptTemplate
+from langchain_groq import ChatGroq
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+from pypdf import PdfReader
+from requests import RequestException
+from youtube_transcript_api import YouTubeTranscriptApi
# Load environment variables from a local .env file when one exists.
load_dotenv()

# Demo video offered in the sidebar and in the URL placeholder.
SAMPLE_YOUTUBE_URL = "https://youtu.be/ocBh08fjIfU"

# UI language name -> language code used when looking up YouTube transcripts.
LANGUAGE_CODE_MAP = {
    "English": "en",
    "Arabic": "ar",
    "French": "fr",
    "Bahasa Malay": "ms",
}

# UI language name -> language name written into the LLM prompts.
LANGUAGE_LABEL_MAP = {
    "English": "English",
    "Arabic": "Arabic",
    "French": "French",
    "Bahasa Malay": "Bahasa Melayu",
}

# "Original" means "leave the language alone"; the other choices mirror the
# keys of LANGUAGE_CODE_MAP in the same order.
LANGUAGE_OPTIONS = ["Original", *LANGUAGE_CODE_MAP]
# Page chrome: browser tab title/icon, on-page heading, and the CSS class used
# by the small section labels rendered above each input widget.
_PAGE_TITLE = "Summarize Text From PDF, YouTube, Website"
_SECTION_LABEL_CSS = """
    <style>
    .source-section-label {
        font-size: 1rem;
        font-weight: 600;
        margin-top: 0.35rem;
        margin-bottom: 0.3rem;
    }
    </style>
    """

st.set_page_config(page_title=_PAGE_TITLE, page_icon="📝")
st.title(f"📝 {_PAGE_TITLE}")
st.subheader("Summarize URL")
st.markdown(_SECTION_LABEL_CSS, unsafe_allow_html=True)
# The Groq key comes from the environment only; an empty string means "not set".
groq_api_key = os.getenv("GROQ_API_KEY", "")

# Seed session state once; values already present from an earlier rerun are kept.
_SESSION_DEFAULTS = {
    "url_input": "",
    "summary_word_limit": 400,
    "youtube_transcript_text": "",
    "youtube_transcript_name": "youtube_transcript.txt",
    "youtube_transcript_source_url": "",
    "youtube_transcript_language_label": "Original",
}
for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default

# The language selectors in the sidebar are disabled, so both stay "Original".
summary_language = "Original"
transcript_language = "Original"
# Sidebar: source mode, target length, chain type, and a one-click sample URL.
with st.sidebar:
    st.header("Options")

    _source_modes = ["URL", "Upload documents", "Both"]
    input_source_mode = st.radio(
        "Content source",
        options=_source_modes,
        index=0,
        help="Choose which source the app should use for summarization.",
    )

    # Bound to session state through `key`, so the default comes from there.
    summary_word_limit = st.slider(
        "Summary word limit",
        min_value=100,
        max_value=1500,
        step=50,
        key="summary_word_limit",
        help="Increase or decrease the target length of the summary.",
    )

    # NOTE: the "Summary language" and "Transcript language" select boxes
    # (choices: LANGUAGE_OPTIONS) are currently disabled; `summary_language`
    # and `transcript_language` keep their module-level value "Original".

    _chain_choices = ["auto", "stuff", "map_reduce", "refine"]
    selected_chain_type = st.radio(
        "Summarization method",
        options=_chain_choices,
        index=0,
        help="`auto` picks the best method based on content size and will upgrade if a simpler method is not a good fit.",
    )
    st.caption(
        "`stuff` is fastest for short content, "
        "`map_reduce` is safer for long content, "
        "and `refine` is useful when building a summary progressively across chunks."
    )

    st.caption(f"Sample YouTube URL: `{SAMPLE_YOUTUBE_URL}`")
    _use_sample = st.button("Use sample YouTube URL")
    if _use_sample:
        # Written before the URL text input (same key) is created further down.
        st.session_state.url_input = SAMPLE_YOUTUBE_URL
def _render_section_label(label: str) -> None:
    """Render a small bold label styled by the `source-section-label` CSS class."""
    st.markdown(f'<div class="source-section-label">{label}</div>', unsafe_allow_html=True)


# Defaults used when the corresponding widget is hidden by the source mode.
generic_url = ""
uploaded_files = []

if input_source_mode in ("URL", "Both"):
    _render_section_label("Summarize URL")
    generic_url = st.text_input(
        "URL",
        key="url_input",
        label_visibility="collapsed",
        placeholder=f"Paste a YouTube or website URL, or try {SAMPLE_YOUTUBE_URL}",
        help="Enter the full YouTube or website URL you want to summarize.",
    )

if input_source_mode in ("Upload documents", "Both"):
    _render_section_label("Upload documents")
    uploaded_files = st.file_uploader(
        "Upload documents",
        type=["pdf", "txt", "md", "csv", "docx"],
        accept_multiple_files=True,
        label_visibility="collapsed",
        help="Upload one or more documents. Supported formats: PDF, TXT, MD, CSV, DOCX.",
    )
    if uploaded_files:
        _uploaded_names = ", ".join(item.name for item in uploaded_files)
        st.caption(f"Uploaded files: {_uploaded_names}")

# Single chat model shared by the summarization and translation chains.
llm = ChatGroq(model="llama-3.1-8b-instant", groq_api_key=groq_api_key)

# Browser-like headers sent with every web fetch.
REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/135.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://www.google.com/",
}
+def _is_youtube_url(url: str) -> bool:
+    host = urlparse(url).netloc.lower()
+    return "youtube.com" in host or "youtu.be" in host
+def _summary_language_instruction(selected_language: str) -> str:
+    if selected_language == "Original":
+        return "Write the summary in the original language of the source content. If the source is mixed-language, use the dominant language."
+    return f"Write the summary in {LANGUAGE_LABEL_MAP[selected_language]}."
+def _translation_language_instruction(selected_language: str) -> str:
+    if selected_language == "Original":
+        return "Keep the text in its original language."
+    return f"Translate the text into {LANGUAGE_LABEL_MAP[selected_language]}."
def _get_summary_prompts(word_limit: int, selected_language: str) -> dict[str, PromptTemplate]:
    """Build the prompt templates for every summarization chain type.

    Args:
        word_limit: Target summary length written into the prompts (the
            per-section "map" prompt carries no length target).
        selected_language: UI language choice, turned into a prompt sentence
            by `_summary_language_instruction`.

    Returns:
        Prompt templates keyed by "stuff", "map", "combine",
        "refine_question" and "refine".
    """
    language_line = _summary_language_instruction(selected_language)

    def text_prompt(*lines: str) -> PromptTemplate:
        # Every prompt except "refine" has `{text}` as its only variable.
        return PromptTemplate(template="\n".join(lines), input_variables=["text"])

    prompts = {
        "stuff": text_prompt(
            f"Provide a clear summary of the following content in about {word_limit} words.",
            "Focus on the main ideas, important details, and conclusions.",
            language_line,
            "Content:",
            "{text}",
        ),
        "map": text_prompt(
            "Write a concise summary of the following section.",
            language_line,
            "Content:",
            "{text}",
        ),
        "combine": text_prompt(
            f"Combine the following partial summaries into a final summary in about {word_limit} words.",
            "Keep the result coherent, non-repetitive, and focused on the most important points.",
            language_line,
            "Partial summaries:",
            "{text}",
        ),
        "refine_question": text_prompt(
            f"Provide an initial summary of the following content in about {word_limit} words.",
            language_line,
            "Content:",
            "{text}",
        ),
    }

    # The refine step additionally receives the summary built so far.
    refine_lines = [
        "We already have an existing summary:",
        "{existing_answer}",
        "",
        "Refine it using the additional content below.",
        f"Keep the final summary close to {word_limit} words, avoid repetition, and preserve the most important details.",
        language_line,
        "Additional content:",
        "{text}",
    ]
    prompts["refine"] = PromptTemplate(
        template="\n".join(refine_lines),
        input_variables=["existing_answer", "text"],
    )
    return prompts
+def _extract_summary_text(result) -> str:
+    if isinstance(result, dict):
+        return result.get("output_text") or result.get("text") or str(result)
+    return str(result)
def _translate_documents_with_llm(docs: list[Document], target_language: str) -> list[Document]:
    """Translate each document chunk by chunk with the shared LLM.

    Args:
        docs: Documents to translate.
        target_language: "Original" (returns *docs* untouched) or a key of
            `LANGUAGE_LABEL_MAP`.

    Returns:
        One new document per input document, holding the translated chunks
        joined by blank lines and the original metadata plus "translated_to".
    """
    if target_language == "Original":
        return docs

    instruction = _translation_language_instruction(target_language)
    prompt = PromptTemplate(
        template="\n".join(
            [
                instruction,
                "Preserve the meaning faithfully. Do not summarize. Return only the translated text.",
                "Text:",
                "{text}",
            ]
        ),
        input_variables=["text"],
    )
    # A "stuff" chain is used as a plain single-prompt runner here.
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
    splitter = RecursiveCharacterTextSplitter(chunk_size=2500, chunk_overlap=200)

    translated: list[Document] = []
    for source_doc in docs:
        pieces = []
        for chunk in splitter.split_documents([source_doc]):
            response = chain.invoke({"input_documents": [chunk]})
            pieces.append(_extract_summary_text(response).strip())

        metadata = dict(source_doc.metadata)
        metadata["translated_to"] = target_language
        translated.append(
            Document(
                page_content="\n\n".join(filter(None, pieces)),
                metadata=metadata,
            )
        )
    return translated
def _resolve_transcript(video_id: str, selected_language: str):
    """Pick the transcript object to fetch for a video.

    For "Original" the first listed transcript is used. Otherwise a transcript
    in the requested language is looked up, falling back to translating any
    translatable transcript.

    Args:
        video_id: YouTube video id.
        selected_language: "Original" or a key of `LANGUAGE_CODE_MAP`.

    Returns:
        A `(transcript, language_label)` tuple.

    Raises:
        ValueError: If the video lists no transcripts, or none can be provided
            in the requested language.
    """
    listing = YouTubeTranscriptApi().list(video_id)
    candidates = list(listing)
    if not candidates:
        raise ValueError("No transcript is available for this video.")
    if selected_language == "Original":
        return candidates[0], "Original"

    language_code = LANGUAGE_CODE_MAP[selected_language]
    try:
        return listing.find_transcript([language_code]), selected_language
    except Exception:
        # No direct match: try the translation offered for each transcript.
        for candidate in candidates:
            if candidate.is_translatable:
                try:
                    return candidate.translate(language_code), selected_language
                except Exception:
                    continue
        described = sorted(
            {f"{item.language} ({item.language_code})" for item in candidates}
        )
        available_languages = ", ".join(described)
        raise ValueError(
            f"Could not provide transcript in {selected_language}. "
            f"Available transcript languages: {available_languages}"
        )
def _load_youtube_documents(url: str, selected_language: str) -> list[Document]:
    """Fetch a video's transcript and wrap it in a single document.

    When the requested language cannot be resolved, the original transcript is
    fetched instead and translated with the LLM.

    Args:
        url: YouTube video URL.
        selected_language: "Original" or a key of `LANGUAGE_CODE_MAP`.

    Returns:
        A one-element list holding the transcript text and its metadata.

    Raises:
        ValueError: If no transcript can be resolved or it contains no text.
    """
    video_id = YoutubeLoader.extract_video_id(url)

    needs_llm_translation = False
    try:
        transcript, label = _resolve_transcript(video_id, selected_language)
    except ValueError:
        if selected_language == "Original":
            raise
        transcript, label = _resolve_transcript(video_id, "Original")
        needs_llm_translation = True

    fetched = transcript.fetch()
    stripped_snippets = (snippet.text.strip() for snippet in fetched)
    transcript_text = " ".join(piece for piece in stripped_snippets if piece)
    if not transcript_text:
        raise ValueError("No transcript text could be extracted from this video.")

    metadata = {
        "source": url,
        "video_id": video_id,
        "language": fetched.language,
        "language_code": fetched.language_code,
        "is_generated": fetched.is_generated,
        "transcript_language_label": label,
    }
    docs = [Document(page_content=transcript_text, metadata=metadata)]
    if not needs_llm_translation:
        return docs

    docs = _translate_documents_with_llm(docs, selected_language)
    llm_label = f"{selected_language} (LLM translated)"
    for translated_doc in docs:
        translated_doc.metadata["transcript_language_label"] = llm_label
    return docs
def _make_transcript_filename(url: str) -> str:
    """Return the download file name for a video's transcript, based on its id."""
    return "youtube_transcript_{}.txt".format(YoutubeLoader.extract_video_id(url))
def _store_youtube_transcript(url: str, docs: list[Document]) -> None:
    """Cache a fetched transcript in session state for the export button.

    Args:
        url: The video URL the transcript belongs to.
        docs: Transcript documents; the first one supplies the language label.
    """
    state = st.session_state
    non_blank = [doc.page_content for doc in docs if doc.page_content.strip()]
    state.youtube_transcript_text = "\n\n".join(non_blank)
    state.youtube_transcript_name = _make_transcript_filename(url)
    state.youtube_transcript_source_url = url

    # Prefer the explicit label, then the transcript language, then "Original".
    first_metadata = docs[0].metadata
    fallback_label = first_metadata.get("language", "Original")
    state.youtube_transcript_language_label = first_metadata.get(
        "transcript_language_label", fallback_label
    )
+def _has_meaningful_content(docs: list[Document], min_chars: int = 300) -> bool:
+    combined_text = " ".join(doc.page_content.strip() for doc in docs if doc.page_content.strip())
+    return len(combined_text) >= min_chars
def _extract_text_from_html(html: str) -> str:
    """Extract readable text from an HTML page.

    Script, style, noscript and svg elements are removed first. Text is then
    taken from the main-content containers (`main`, `article`,
    `[role='main']`, `.content`, `.article-body`) that hold more than 200
    characters, falling back to the whole page's text. The meta description,
    when present, is placed first.

    Containers nested inside another matched container are skipped: their
    text is already part of the outer container's text, so including both
    would repeat the same content.

    Args:
        html: Raw HTML markup.

    Returns:
        The extracted parts joined by blank lines ("" when nothing is found).
    """
    soup = BeautifulSoup(html, "html.parser")
    for tag in soup(["script", "style", "noscript", "svg"]):
        tag.decompose()

    meta_description = ""
    meta_tag = soup.find("meta", attrs={"name": "description"})
    if meta_tag and meta_tag.get("content"):
        meta_description = meta_tag["content"].strip()

    main_candidates = soup.select("main, article, [role='main'], .content, .article-body")
    # Identity, not equality: two distinct tags with the same markup are both kept.
    candidate_ids = {id(candidate) for candidate in main_candidates}

    text_parts = []
    for candidate in main_candidates:
        if any(id(parent) in candidate_ids for parent in candidate.parents):
            continue  # covered by an enclosing candidate
        candidate_text = " ".join(candidate.stripped_strings)
        if len(candidate_text) > 200:
            text_parts.append(candidate_text)

    if not text_parts:
        body_text = " ".join(soup.stripped_strings)
        if body_text:
            text_parts.append(body_text)

    if meta_description:
        text_parts.insert(0, meta_description)

    # dict.fromkeys removes exact duplicates while keeping first-seen order.
    return "\n\n".join(dict.fromkeys(part for part in text_parts if part))
def _load_web_documents(url: str) -> list[Document]:
    """Load readable text from a web page.

    `UnstructuredURLLoader` is tried first. If it fails or yields too little
    text, the page is fetched with `requests` and parsed by
    `_extract_text_from_html`. The fallback tries the URL as given and,
    when that differs, without its trailing slash.

    Args:
        url: Page URL to load.

    Returns:
        Documents holding the page text.

    Raises:
        ValueError: If neither strategy produced at least 300 characters of
            text; the message carries the last error seen.
    """
    # NOTE(review): TLS certificate verification is disabled for both the
    # primary loader and the fallback (`ssl_verify=False` / `verify=False`).
    # This tolerates misconfigured sites but allows tampered content to be
    # summarized. It is kept as-is here; confirm it is intentional.
    try:
        loader = UnstructuredURLLoader(
            urls=[url],
            ssl_verify=False,
            headers=REQUEST_HEADERS,
        )
        docs = loader.load()
        if _has_meaningful_content(docs):
            return docs
    except Exception as loader_error:
        last_error = loader_error
    else:
        last_error = ValueError("Primary URL loader returned too little readable content.")

    # De-duplicate so a URL without a trailing slash is not fetched twice.
    candidate_urls = list(dict.fromkeys(c for c in (url, url.rstrip("/")) if c))

    # The context manager releases the session's pooled connections.
    with requests.Session() as session:
        for candidate_url in candidate_urls:
            try:
                response = session.get(
                    candidate_url,
                    headers=REQUEST_HEADERS,
                    timeout=20,
                    verify=False,
                    allow_redirects=True,
                )
                # Reject 4xx/5xx so an error page is never summarized as content.
                response.raise_for_status()
            except RequestException as request_error:
                last_error = request_error
                continue

            response.encoding = response.encoding or response.apparent_encoding or "utf-8"
            html = response.text
            if not html.strip():
                continue
            text = _extract_text_from_html(html)
            if not text or len(text) < 300:
                continue

            soup = BeautifulSoup(html, "html.parser")
            title = soup.title.string.strip() if soup.title and soup.title.string else candidate_url
            st.info("Primary URL loader failed or returned too little content. Used HTML fallback extraction instead.")
            return [
                Document(
                    page_content=text,
                    metadata={
                        "source": candidate_url,
                        "title": title,
                        "http_status": response.status_code,
                    },
                )
            ]

    raise ValueError(
        f"Could not load readable text from the URL. Last loader error: {last_error}"
    )
+def _load_uploaded_documents(files) -> list[Document]:
+    docs: list[Document] = []
+    for uploaded_file in files:
+        file_name = uploaded_file.name
+        extension = os.path.splitext(file_name)[1].lower()
+        file_bytes = uploaded_file.getvalue()
+        if extension == ".pdf":
+            reader = PdfReader(BytesIO(file_bytes))
+            pages = []
+            for page_number, page in enumerate(reader.pages, start=1):
+                page_text = (page.extract_text() or "").strip()
+                if page_text:
+                    pages.append(
+                        Document(
+                            page_content=page_text,
+                            metadata={
+                                "source": file_name,
+                                "page": page_number,
+                                "type": "uploaded_file",
+                            },
+                        )
+                    )
+            docs.extend(pages)
+            continue
+        if extension in {".txt", ".md", ".csv"}:
+            text = file_bytes.decode("utf-8", errors="ignore").strip()
+            if text:
+                docs.append(
+                    Document(
+                        page_content=text,
+                        metadata={"source": file_name, "type": "uploaded_file"},
+                    )
+                )
+            continue
+        if extension == ".docx":
+            with ZipFile(BytesIO(file_bytes)) as docx_zip:
+                document_xml = docx_zip.read("word/document.xml")
+            root = ET.fromstring(document_xml)
+            namespace = {"w": "http://schemas.openxmlformats.org/wordprocessingml/2006/main"}
+            paragraphs = []
+            for paragraph in root.findall(".//w:p", namespace):
+                texts = [
+                    node.text
+                    for node in paragraph.findall(".//w:t", namespace)
+                    if node.text
+                ]
+                paragraph_text = "".join(texts).strip()
+                if paragraph_text:
+                    paragraphs.append(paragraph_text)
+            text = "\n\n".join(paragraphs).strip()
+            if text:
+                docs.append(
+                    Document(
+                        page_content=text,
+                        metadata={"source": file_name, "type": "uploaded_file"},
+                    )
+                )
+            continue
+        raise ValueError(f"Unsupported file type: {file_name}")
+    return docs
def _build_chain(selected_chain_type: str):
    """Create the summarization chain for the given chain type.

    Prompts are built from the module-level `summary_word_limit` and
    `summary_language`. Any type other than "stuff" or "map_reduce" produces
    a "refine" chain.
    """
    prompts = _get_summary_prompts(summary_word_limit, summary_language)
    prompt_kwargs = {
        "stuff": {"prompt": prompts["stuff"]},
        "map_reduce": {
            "map_prompt": prompts["map"],
            "combine_prompt": prompts["combine"],
        },
    }
    if selected_chain_type in prompt_kwargs:
        return load_summarize_chain(
            llm,
            chain_type=selected_chain_type,
            **prompt_kwargs[selected_chain_type],
        )
    return load_summarize_chain(
        llm,
        chain_type="refine",
        question_prompt=prompts["refine_question"],
        refine_prompt=prompts["refine"],
    )
def _prepare_summary_documents(docs: list[Document], selected_chain_type: str) -> list[Document]:
    """Split the documents into chunks and keep only the leading ones.

    The number of chunks kept depends on the chain type: 3 for "stuff",
    10 for "refine" and 8 for anything else. Later chunks are dropped.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=200
    ).split_documents(docs)
    chunk_caps = {"stuff": 3, "refine": 10}
    return chunks[: chunk_caps.get(selected_chain_type, 8)]
def _choose_effective_chain_type(requested_chain_type: str, docs: list[Document]) -> tuple[str, str | None]:
    """Decide which chain type to run, given the request and the content size.

    Args:
        requested_chain_type: "auto", "stuff", "map_reduce" or "refine".
        docs: The documents to be summarized.

    Returns:
        `(chain_type, message)`; the message explains an automatic choice or
        an override and is None when the request is honored unchanged.
    """
    chunks = RecursiveCharacterTextSplitter(
        chunk_size=2000, chunk_overlap=200
    ).split_documents(docs)
    chunk_count = len(chunks)
    total_chars = sum(len(chunk.page_content) for chunk in chunks)

    # Size-based recommendation: small -> stuff, medium -> refine, large -> map_reduce.
    if chunk_count > 10:
        recommended = "map_reduce"
    elif chunk_count <= 3 and total_chars <= 6000:
        recommended = "stuff"
    else:
        recommended = "refine"

    if requested_chain_type == "auto":
        return recommended, f"Auto-selected `{recommended}` based on content size."

    stuff_is_too_small = requested_chain_type == "stuff" and recommended != "stuff"
    if stuff_is_too_small:
        message = (
            f"Switched from `stuff` to `{recommended}` because the content is "
            "too large for a reliable single-pass summary."
        )
        return recommended, message

    refine_is_too_slow = requested_chain_type == "refine" and chunk_count > 12
    if refine_is_too_slow:
        message = (
            "Switched from `refine` to `map_reduce` because the content is "
            "large enough that map-reduce is more reliable."
        )
        return "map_reduce", message

    return requested_chain_type, None
# YouTube-only panel: video preview plus transcript fetch and export buttons.
if input_source_mode in {"URL", "Both"} and _is_youtube_url(generic_url):
    st.video(generic_url)
    transcript_col, export_col = st.columns(2)

    with transcript_col:
        if st.button("Fetch transcript"):
            # An empty URL never passes the `_is_youtube_url` check above, so
            # only the overall URL syntax is left to validate here.
            if not validators.url(generic_url):
                st.error("Please enter a valid YouTube URL.")
            else:
                try:
                    with st.spinner("Loading transcript..."):
                        docs = _load_youtube_documents(generic_url, transcript_language)
                        if not docs:
                            st.error("No transcript could be extracted from the provided YouTube video.")
                        else:
                            _store_youtube_transcript(generic_url, docs)
                            st.success(
                                f"Transcript ready for export in {st.session_state.youtube_transcript_language_label}."
                            )
                except Exception as transcript_err:
                    st.error(f"Failed to load YouTube transcript: {transcript_err}")

    with export_col:
        # Offer the download only for a transcript fetched for the URL
        # currently in the input box.
        transcript_is_current = (
            st.session_state.youtube_transcript_text
            and st.session_state.youtube_transcript_source_url == generic_url
        )
        if transcript_is_current:
            st.caption(f"Prepared transcript: `{st.session_state.youtube_transcript_language_label}`")
            st.download_button(
                "Export transcript",
                data=st.session_state.youtube_transcript_text,
                file_name=st.session_state.youtube_transcript_name,
                mime="text/plain",
            )
def _reset_youtube_transcript_state() -> None:
    """Clear the cached transcript so the export button does not offer stale data."""
    st.session_state.youtube_transcript_text = ""
    st.session_state.youtube_transcript_name = "youtube_transcript.txt"
    st.session_state.youtube_transcript_source_url = ""


# Main action: validate the inputs, load every selected source, then summarize.
if st.button("Summarize content"):
    if not groq_api_key.strip():
        # The key is read from the environment only, so say exactly what is missing.
        st.error(
            "GROQ_API_KEY is not set. Add it to the environment "
            "(or to the Space secrets) and restart the app."
        )
    elif input_source_mode == "URL" and not generic_url.strip():
        st.error("Content source is `URL`, so please provide a URL.")
    elif input_source_mode == "Upload documents" and not uploaded_files:
        st.error("Content source is `Upload documents`, so please upload at least one file.")
    elif input_source_mode == "Both" and (not generic_url.strip() or not uploaded_files):
        st.error("Content source is `Both`, so please provide a URL and upload at least one file.")
    elif generic_url.strip() and not validators.url(generic_url):
        st.error("Please enter a valid URL when using the URL field.")
    else:
        try:
            with st.spinner("waiting ...."):
                docs: list[Document] = []
                use_url = input_source_mode in {"URL", "Both"} and bool(generic_url.strip())
                use_uploads = input_source_mode in {"Upload documents", "Both"} and bool(uploaded_files)

                if use_url:
                    if _is_youtube_url(generic_url):
                        try:
                            url_docs = _load_youtube_documents(generic_url, transcript_language)
                            _store_youtube_transcript(generic_url, url_docs)
                        except Exception as load_err:
                            st.error(f"Failed to load YouTube transcript: {load_err}")
                            st.stop()
                    else:
                        _reset_youtube_transcript_state()
                        try:
                            url_docs = _load_web_documents(generic_url)
                        except Exception as load_err:
                            st.error(f"Failed to fetch URL content: {load_err}")
                            st.stop()
                    docs.extend(url_docs)
                else:
                    _reset_youtube_transcript_state()

                if use_uploads:
                    try:
                        uploaded_docs = _load_uploaded_documents(uploaded_files)
                    except Exception as load_err:
                        st.error(f"Failed to read uploaded document(s): {load_err}")
                        st.stop()
                    docs.extend(uploaded_docs)

                if input_source_mode == "Both" and use_url and use_uploads:
                    st.info("Summarizing combined content from the URL and uploaded documents.")

                if not docs:
                    st.error("No content could be extracted from the selected source.")
                    st.stop()

                # The requested chain type may be overridden based on content size.
                effective_chain_type, chain_message = _choose_effective_chain_type(
                    selected_chain_type,
                    docs,
                )
                if chain_message:
                    st.info(chain_message)

                docs_for_summary = _prepare_summary_documents(docs, effective_chain_type)
                chain = _build_chain(effective_chain_type)
                output_summary = _extract_summary_text(
                    chain.invoke({"input_documents": docs_for_summary})
                )
                st.success(output_summary)
        except Exception as e:
            st.error(f"Summarization failed: {e}")