Spaces:
Sleeping
Sleeping
| import os, re, validators, streamlit as st | |
| from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable | |
| from langchain.prompts import PromptTemplate | |
| from langchain.chains.summarize import load_summarize_chain | |
| from langchain_groq import ChatGroq | |
| from langchain.schema import Document | |
| from langchain_community.document_loaders import UnstructuredURLLoader | |
| from langchain.document_loaders import PyPDFLoader | |
# ───────────────────────── STREAMLIT CONFIG ─────────────────────────
# Page chrome: browser-tab title/icon and the on-page heading.
st.set_page_config(page_title="LangChain Summarizer", page_icon="🦜")
st.title("🦜 LangChain: Summarize YT / Webpage / PDF")

# ─────────────────────────── API KEY INPUT ───────────────────────────
with st.sidebar:
    st.header("API keys")
    # Pasted keys frequently carry stray whitespace; strip it so the key is
    # not rejected for a reason the user cannot see in a password field.
    groq_api_key = st.text_input("Groq API Key", type="password").strip()
    if groq_api_key:
        # Exported for libraries that read the key from the environment.
        # NOTE(review): os.environ is process-wide, so this value is visible
        # to everything running in the same Python process.
        os.environ["GROQ_API_KEY"] = groq_api_key

# ──────────────────── PLACEHOLDERS / FILE & URL INPUT ────────────────
# Two alternative inputs; the main action decides which one is used.
generic_url = st.text_input("Paste a YouTube / web URL here:")
uploaded_file = st.file_uploader("…or upload a PDF", type=["pdf"])
| # ββββββββββββββββββββββββββ UTILITY FUNCTIONS ββββββββββββββββββββββββ | |
| def get_video_id(url: str) -> str | None: | |
| m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url) | |
| return m.group(1) if m else None | |
# Prompt handed to the summarization chain; ``{text}`` is its only
# placeholder and the model is asked for a summary of roughly 300 words.
SUMMARY_PROMPT = PromptTemplate(
    input_variables=["text"],
    template="Provide a concise summary (~300 words):\n\nContent:\n{text}",
)
def build_llm(model: str = "llama3-70b-8192") -> ChatGroq:
    """Return a ChatGroq client cached in ``st.session_state``.

    The client is rebuilt whenever the API key or model differs from the one
    the cached client was created with, so a key corrected in the sidebar
    takes effect without restarting the session.

    Args:
        model: Groq model name to use. Defaults to the model this app has
            always used.

    Returns:
        The ``ChatGroq`` instance stored under ``st.session_state.llm``.

    Raises:
        KeyError: If the ``GROQ_API_KEY`` environment variable is not set.
    """
    api_key = os.environ["GROQ_API_KEY"]
    signature = (model, api_key)

    # Rebuild on first use or when the key/model changed since the last build;
    # caching unconditionally would pin the first key entered for the session.
    stale = (
        "llm" not in st.session_state
        or st.session_state.get("llm_signature") != signature
    )
    if stale:
        st.session_state.llm = ChatGroq(model=model, groq_api_key=api_key)
        st.session_state.llm_signature = signature
    return st.session_state.llm
def summarize(docs):
    """Summarize *docs* in a single LLM call and return the summary text.

    Args:
        docs: Non-empty list of LangChain ``Document`` objects. All of them
            are passed to one "stuff" chain together with ``SUMMARY_PROMPT``.

    Returns:
        The ``"output_text"`` value of the chain result.

    Raises:
        ValueError: If *docs* is empty, since there is nothing to summarize
            and the model would be called with no content.
    """
    if not docs:
        raise ValueError("No content to summarize.")

    llm = build_llm()
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=SUMMARY_PROMPT)
    # ``invoke`` replaces calling the chain object directly, which is
    # deprecated in the LangChain releases that provide the split
    # ``langchain_groq`` / ``langchain_community`` packages imported here.
    result = chain.invoke({"input_documents": docs})
    return result["output_text"]
# ───────────────────────────── MAIN ACTION ───────────────────────────
def _load_pdf_docs(pdf_file):
    """Write an uploaded PDF to a private temp file and parse it with PyPDFLoader.

    Args:
        pdf_file: The object returned by ``st.file_uploader``.

    Returns:
        The documents returned by ``PyPDFLoader(...).load()``.
    """
    import tempfile  # local import: keeps this block self-contained

    # A generated name avoids trusting the client-supplied filename (which
    # could contain path separators or collide with another upload) and does
    # not assume a ``/tmp`` directory exists.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as tmp:
        tmp.write(pdf_file.getvalue())
        tmp_path = tmp.name
    try:
        return PyPDFLoader(tmp_path).load()
    finally:
        # Always clean up, including when parsing fails.
        os.remove(tmp_path)


if st.button("Summarize"):
    # Surrounding whitespace from copy/paste would otherwise fail validation.
    url = generic_url.strip()

    if not groq_api_key:
        st.error("Please enter your Groq API key in the sidebar.")
    elif not url and not uploaded_file:
        st.error("Provide a URL or upload a PDF, then press Summarize.")
    else:
        try:
            with st.spinner("Fetching and summarizing…"):
                # ---------- PDF (takes precedence over a URL) ----------
                if uploaded_file:
                    st.success(summarize(_load_pdf_docs(uploaded_file)))
                # ---------- YouTube ----------
                elif "youtube" in url or "youtu.be" in url:
                    vid = get_video_id(url)
                    if not vid:
                        st.error("Couldn’t extract a YouTube video ID 🤔")
                    else:
                        transcript = YouTubeTranscriptApi.get_transcript(vid)
                        text = " ".join(t["text"] for t in transcript)
                        st.success(summarize([Document(page_content=text)]))
                # ---------- Plain Webpage ----------
                elif not validators.url(url):
                    st.error("That doesn’t look like a valid URL.")
                else:
                    # NOTE(review): ssl_verify=False turns off TLS certificate
                    # verification for the fetch. Kept as in the original;
                    # confirm this is intentional.
                    docs = UnstructuredURLLoader(
                        urls=[url],
                        ssl_verify=False,
                        headers={
                            "User-Agent":
                                "Mozilla/5.0 (X11; Linux) AppleWebKit/537.36 "
                                "(KHTML, like Gecko) Chrome/121.0 Safari/537.36"
                        },
                    ).load()
                    if docs:
                        st.success(summarize(docs))
                    else:
                        # Nothing was extracted; do not ask the model to
                        # summarize an empty page.
                        st.error("Couldn’t fetch any readable content from that URL.")
        except (TranscriptsDisabled, VideoUnavailable) as yt_err:
            # Expected YouTube failures: show just the message.
            st.error(str(yt_err))
        except Exception as e:
            # Top-level UI boundary: show the full traceback in the app
            # instead of crashing the script run.
            st.exception(e)