Ashkchamp committed on
Commit
df45ad8
·
verified ·
1 Parent(s): aa3d604

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +97 -112
app.py CHANGED
@@ -1,112 +1,97 @@
1
- import streamlit as st
2
- from youtube_transcript_api import YouTubeTranscriptApi
3
- from youtube_transcript_api._errors import TranscriptsDisabled, VideoUnavailable
4
- from langchain.prompts import PromptTemplate
5
- from langchain.chains.summarize import load_summarize_chain
6
- from langchain_groq import ChatGroq
7
- from langchain.schema import Document
8
- from langchain_community.document_loaders import UnstructuredURLLoader
9
- from langchain.document_loaders import PyPDFLoader
10
- import validators
11
- import re
12
- import os
13
-
14
- # Streamlit App Configuration
15
- st.set_page_config(page_title="LangChain: Summarize Text From YT, Website, or PDF", page_icon="🦜")
16
- st.title("🦜 LangChain: Summarize Text From YT, Website, or PDF")
17
- st.subheader("Summarize URL or PDF")
18
-
19
- # Sidebar for API Key
20
- with st.sidebar:
21
- groq_api_key = st.text_input("Groq API Key", value="your api key", type="password")
22
-
23
- # Input URL or File
24
- generic_url = st.text_input("URL", label_visibility="collapsed")
25
- uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])
26
-
27
- # LLM Configuration
28
- llm = ChatGroq(model="llama-3.3-70b-versatile", groq_api_key=groq_api_key)
29
-
30
- # Summarization Prompt Template
31
- prompt_template = """
32
- Provide a summary of the following content in 300 words:
33
- Content: {text}
34
- """
35
- prompt = PromptTemplate(template=prompt_template, input_variables=["text"])
36
-
37
- # Function to Extract Video ID
38
- def get_video_id(url):
39
- pattern = r"(?:v=|\/)([0-9A-Za-z_-]{11}).*"
40
- match = re.search(pattern, url)
41
- return match.group(1) if match else None
42
-
43
- # Button to Summarize Content
44
- if st.button("Summarize the Content from YT, Website, or PDF"):
45
- if not groq_api_key.strip() or (not generic_url.strip() and not uploaded_file):
46
- st.error("Please provide a URL or upload a PDF file.")
47
- elif generic_url and not validators.url(generic_url):
48
- st.error("Please enter a valid URL. It can be a YouTube video URL or a website URL.")
49
- else:
50
- try:
51
- with st.spinner("Fetching content..."):
52
- if uploaded_file:
53
- # PDF Summarization
54
- temp_file_path = f"temp_{uploaded_file.name}"
55
- with open(temp_file_path, "wb") as temp_file:
56
- temp_file.write(uploaded_file.read())
57
-
58
- loader = PyPDFLoader(file_path=temp_file_path)
59
- docs = loader.load()
60
-
61
- # Chain for Summarization
62
- chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
63
- output_summary = chain({"input_documents": docs})
64
-
65
- # Display the summarized text
66
- st.success(output_summary["output_text"])
67
-
68
- # Clean up the temporary file
69
- os.remove(temp_file_path)
70
-
71
- elif "youtube.com" in generic_url or "youtu.be" in generic_url:
72
- # YouTube Transcript Summarization
73
- video_id = get_video_id(generic_url)
74
- if not video_id:
75
- st.error("Invalid YouTube URL. Please check and try again.")
76
- else:
77
- # Fetch transcript
78
- transcript = YouTubeTranscriptApi.get_transcript(video_id)
79
- transcript_text = " ".join([entry["text"] for entry in transcript])
80
-
81
- # Convert transcript to list of documents
82
- docs = [Document(page_content=transcript_text)]
83
-
84
- # Chain for Summarization
85
- chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
86
- output_summary = chain({"input_documents": docs})
87
-
88
- # Display the summarized text
89
- st.success(output_summary["output_text"])
90
-
91
- else:
92
- # Generic URL Summarization
93
- loader = UnstructuredURLLoader(
94
- urls=[generic_url],
95
- ssl_verify=False,
96
- headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"}
97
- )
98
- docs = loader.load()
99
-
100
- # Chain for Summarization
101
- chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
102
- output_summary = chain({"input_documents": docs})
103
-
104
- # Display the summarized text
105
- st.success(output_summary["output_text"])
106
-
107
- except TranscriptsDisabled:
108
- st.error("Transcripts are disabled for this video.")
109
- except VideoUnavailable:
110
- st.error("The video is unavailable. Please check the URL.")
111
- except Exception as e:
112
- st.exception(f"Exception: {e}")
 
1
+ import os, re, validators, streamlit as st
2
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
3
+ from langchain.prompts import PromptTemplate
4
+ from langchain.chains.summarize import load_summarize_chain
5
+ from langchain_groq import ChatGroq
6
+ from langchain.schema import Document
7
+ from langchain_community.document_loaders import UnstructuredURLLoader
8
+ from langchain.document_loaders import PyPDFLoader
9
+
10
+ # ───────────────────────── STREAMLIT CONFIG ──────────────────────────
11
+ st.set_page_config(page_title="LangChain Summarizer", page_icon="🦜")
12
+ st.title("🦜 LangChain: Summarize YT / Webpage / PDF")
13
+
14
+ # ──────────────────────────── API KEY INPUT ──────────────────────────
15
+ with st.sidebar:
16
+ st.header("API keys")
17
+ groq_api_key = st.text_input("Groq API Key", type="password")
18
+ if groq_api_key:
19
+ os.environ["GROQ_API_KEY"] = groq_api_key # for libraries
20
+
21
+ # ───────────────────── PLACEHOLDERS / FILE & URL INPUT ───────────────
22
+ generic_url = st.text_input("Paste a YouTube / web URL here:")
23
+ uploaded_file = st.file_uploader("…or upload a PDF", type=["pdf"])
24
+
25
+ # ────────────────────────── UTILITY FUNCTIONS ────────────────────────
26
+ def get_video_id(url: str) -> str | None:
27
+ m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url)
28
+ return m.group(1) if m else None
29
+
30
+ SUMMARY_PROMPT = PromptTemplate(
31
+ template="Provide a concise summary (~300 words):\n\nContent:\n{text}",
32
+ input_variables=["text"],
33
+ )
34
+
35
+ def build_llm() -> ChatGroq:
36
+ """Instantiate ChatGroq once and cache it in session_state."""
37
+ if "llm" not in st.session_state:
38
+ st.session_state.llm = ChatGroq(
39
+ model="llama3-70b-8192",
40
+ groq_api_key=os.environ["GROQ_API_KEY"],
41
+ )
42
+ return st.session_state.llm
43
+
44
+ def summarize(docs):
45
+ llm = build_llm()
46
+ chain = load_summarize_chain(llm, chain_type="stuff", prompt=SUMMARY_PROMPT)
47
+ return chain({"input_documents": docs})["output_text"]
48
+
49
+ # ───────────────────────────── MAIN ACTION ───────────────────────────
50
+ if st.button("Summarize"):
51
+ if not groq_api_key:
52
+ st.error("Please enter your Groq API key in the sidebar.")
53
+ elif not generic_url and not uploaded_file:
54
+ st.error("Provide a URL or upload a PDF, then press Summarize.")
55
+ else:
56
+ try:
57
+ with st.spinner("Fetching and summarizing…"):
58
+
59
+ # ---------- PDF ----------
60
+ if uploaded_file:
61
+ tmp_path = f"/tmp/{uploaded_file.name}"
62
+ with open(tmp_path, "wb") as f:
63
+ f.write(uploaded_file.read())
64
+ docs = PyPDFLoader(tmp_path).load()
65
+ st.success(summarize(docs))
66
+ os.remove(tmp_path)
67
+
68
+ # ---------- YouTube ----------
69
+ elif "youtube" in generic_url or "youtu.be" in generic_url:
70
+ vid = get_video_id(generic_url)
71
+ if not vid:
72
+ st.error("Couldn’t extract a YouTube video ID 🤔")
73
+ else:
74
+ transcript = YouTubeTranscriptApi.get_transcript(vid)
75
+ text = " ".join(t["text"] for t in transcript)
76
+ st.success(summarize([Document(page_content=text)]))
77
+
78
+ # ---------- Plain Webpage ----------
79
+ else:
80
+ if not validators.url(generic_url):
81
+ st.error("That doesn’t look like a valid URL.")
82
+ else:
83
+ docs = UnstructuredURLLoader(
84
+ urls=[generic_url],
85
+ ssl_verify=False,
86
+ headers={
87
+ "User-Agent":
88
+ "Mozilla/5.0 (X11; Linux) AppleWebKit/537.36 "
89
+ "(KHTML, like Gecko) Chrome/121.0 Safari/537.36"
90
+ },
91
+ ).load()
92
+ st.success(summarize(docs))
93
+
94
+ except (TranscriptsDisabled, VideoUnavailable) as yt_err:
95
+ st.error(str(yt_err))
96
+ except Exception as e:
97
+ st.exception(e)