Ashkchamp committed on
Commit
dbd3c04
·
verified ·
1 Parent(s): ea01fbf

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -52
app.py CHANGED
@@ -1,6 +1,5 @@
1
- # app.py – Streamlit Summarizer (dotenv version)
2
  import os, re, validators, streamlit as st
3
- from dotenv import load_dotenv # ← NEW
4
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
5
  from langchain.prompts import PromptTemplate
6
  from langchain.chains.summarize import load_summarize_chain
@@ -9,92 +8,66 @@ from langchain.schema import Document
9
  from langchain_community.document_loaders import UnstructuredURLLoader
10
  from langchain.document_loaders import PyPDFLoader
11
 
12
- # ─────────────── load variables from .env (runs before anything else)
13
- load_dotenv() # looks for .env in project root
14
- GROQ_KEY = os.getenv("GROQ_API_KEY") # expect GROQ_API_KEY=xxx in .env
15
 
16
- # ────────────────────────── STREAMLIT CONFIG ────────────────────────
17
  st.set_page_config(page_title="LangChain Summarizer", page_icon="🦜")
18
  st.title("🦜 LangChain: Summarize YT / Webpage / PDF")
19
 
20
- # ───────────────────── PLACEHOLDERS / FILE & URL INPUT ──────────────
21
- generic_url = st.text_input("Paste a YouTube / web URL here:")
22
  uploaded_file = st.file_uploader("…or upload a PDF", type=["pdf"])
23
 
24
- # ────────────────────────── UTILITY FUNCTIONS ───────────────────────
25
- def get_video_id(url: str) -> str | None:
 
 
26
  m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url)
27
  return m.group(1) if m else None
28
 
29
- SUMMARY_PROMPT = PromptTemplate(
30
- template="Provide a concise summary (~300 words):\n\nContent:\n{text}",
31
- input_variables=["text"],
32
- )
33
-
34
- def build_llm() -> ChatGroq:
35
- """Instantiate ChatGroq once and cache it in session_state."""
36
  if "llm" not in st.session_state:
37
  if not GROQ_KEY:
38
- raise RuntimeError(
39
- "❌ Groq API key not found. "
40
- "Add GROQ_API_KEY to a `.env` file or the environment."
41
- )
42
- st.session_state.llm = ChatGroq(
43
- model="deepseek-r1-distill-llama-70b",
44
- groq_api_key=GROQ_KEY,
45
- )
46
  return st.session_state.llm
47
 
48
  def summarize(docs):
49
- llm = build_llm()
50
- chain = load_summarize_chain(llm, chain_type="stuff", prompt=SUMMARY_PROMPT)
51
- return chain({"input_documents": docs})["output_text"]
 
 
 
52
 
53
- # ───────────────────────────── MAIN ACTION ──────────────────────────
54
  if st.button("Summarize"):
55
  if not GROQ_KEY:
56
- st.error("Groq key missing. Set **GROQ_API_KEY** in your `.env`.")
57
  elif not generic_url and not uploaded_file:
58
- st.error("Provide a URL or upload a PDF, then press Summarize.")
59
  else:
60
  try:
61
- with st.spinner("Fetching and summarizing…"):
62
-
63
- # ---------- PDF ----------
64
  if uploaded_file:
65
  tmp_path = f"/tmp/{uploaded_file.name}"
66
  with open(tmp_path, "wb") as f:
67
  f.write(uploaded_file.read())
68
  docs = PyPDFLoader(tmp_path).load()
69
- st.success(summarize(docs))
70
  os.remove(tmp_path)
71
-
72
- # ---------- YouTube ----------
73
  elif "youtube" in generic_url or "youtu.be" in generic_url:
74
  vid = get_video_id(generic_url)
75
  if not vid:
76
- st.error("Couldn’t extract a YouTube video ID 🤔")
77
  else:
78
  transcript = YouTubeTranscriptApi.get_transcript(vid)
79
  text = " ".join(t["text"] for t in transcript)
80
- st.success(summarize([Document(page_content=text)]))
81
-
82
- # ---------- Plain Webpage ----------
83
  else:
84
  if not validators.url(generic_url):
85
- st.error("That doesn’t look like a valid URL.")
86
  else:
87
- docs = UnstructuredURLLoader(
88
- urls=[generic_url],
89
- ssl_verify=False,
90
- headers={
91
- "User-Agent":
92
- "Mozilla/5.0 (X11; Linux) AppleWebKit/537.36 "
93
- "(KHTML, like Gecko) Chrome/121.0 Safari/537.36"
94
- },
95
- ).load()
96
  st.success(summarize(docs))
97
-
98
  except (TranscriptsDisabled, VideoUnavailable) as yt_err:
99
  st.error(str(yt_err))
100
  except Exception as e:
 
 
1
  import os, re, validators, streamlit as st
2
+ from dotenv import load_dotenv
3
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, VideoUnavailable
4
  from langchain.prompts import PromptTemplate
5
  from langchain.chains.summarize import load_summarize_chain
 
8
  from langchain_community.document_loaders import UnstructuredURLLoader
9
  from langchain.document_loaders import PyPDFLoader
10
 
11
+ load_dotenv()
12
+ GROQ_KEY = os.getenv("GROQ_API_KEY")
 
13
 
 
14
  st.set_page_config(page_title="LangChain Summarizer", page_icon="🦜")
15
  st.title("🦜 LangChain: Summarize YT / Webpage / PDF")
16
 
17
+ generic_url = st.text_input("Paste a YouTube / web URL here:")
 
18
  uploaded_file = st.file_uploader("…or upload a PDF", type=["pdf"])
19
 
20
+ MAP_PROMPT = PromptTemplate(template="Write a concise summary of the following:\n\n{text}", input_variables=["text"])
21
+ COMBINE_PROMPT = PromptTemplate(template="Provide an overall summary (~300 words):\n\n{text}", input_variables=["text"])
22
+
23
+ def get_video_id(url: str):
24
  m = re.search(r"(?:v=|\/)([0-9A-Za-z_-]{11})", url)
25
  return m.group(1) if m else None
26
 
27
+ def build_llm():
 
 
 
 
 
 
28
  if "llm" not in st.session_state:
29
  if not GROQ_KEY:
30
+ raise RuntimeError("Groq API key missing")
31
+ st.session_state.llm = ChatGroq(model="llama3-70b-8192", groq_api_key=GROQ_KEY, timeout=60_000)
 
 
 
 
 
 
32
  return st.session_state.llm
33
 
34
  def summarize(docs):
35
+ llm = build_llm()
36
+ chain = load_summarize_chain(llm, chain_type="map_reduce", map_prompt=MAP_PROMPT, combine_prompt=COMBINE_PROMPT)
37
+ return chain.invoke({"input_documents": docs})["output_text"]
38
+
39
+ def chunk_text(text, size=4000):
40
+ return [Document(page_content=text[i:i+size]) for i in range(0, len(text), size)]
41
 
 
42
  if st.button("Summarize"):
43
  if not GROQ_KEY:
44
+ st.error("Groq key missing")
45
  elif not generic_url and not uploaded_file:
46
+ st.error("Provide a URL or upload a PDF")
47
  else:
48
  try:
49
+ with st.spinner("Processing"):
 
 
50
  if uploaded_file:
51
  tmp_path = f"/tmp/{uploaded_file.name}"
52
  with open(tmp_path, "wb") as f:
53
  f.write(uploaded_file.read())
54
  docs = PyPDFLoader(tmp_path).load()
 
55
  os.remove(tmp_path)
56
+ st.success(summarize(docs))
 
57
  elif "youtube" in generic_url or "youtu.be" in generic_url:
58
  vid = get_video_id(generic_url)
59
  if not vid:
60
+ st.error("Invalid YouTube URL")
61
  else:
62
  transcript = YouTubeTranscriptApi.get_transcript(vid)
63
  text = " ".join(t["text"] for t in transcript)
64
+ st.success(summarize(chunk_text(text)))
 
 
65
  else:
66
  if not validators.url(generic_url):
67
+ st.error("Invalid URL")
68
  else:
69
+ docs = UnstructuredURLLoader(urls=[generic_url], ssl_verify=False, headers={"User-Agent":"Mozilla/5.0"}).load()
 
 
 
 
 
 
 
 
70
  st.success(summarize(docs))
 
71
  except (TranscriptsDisabled, VideoUnavailable) as yt_err:
72
  st.error(str(yt_err))
73
  except Exception as e: