Ashkchamp committed on
Commit
aa3d604
·
verified ·
1 Parent(s): 4cacd91

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +112 -0
  2. requirements.txt +34 -0
app.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from youtube_transcript_api import YouTubeTranscriptApi
3
+ from youtube_transcript_api._errors import TranscriptsDisabled, VideoUnavailable
4
+ from langchain.prompts import PromptTemplate
5
+ from langchain.chains.summarize import load_summarize_chain
6
+ from langchain_groq import ChatGroq
7
+ from langchain.schema import Document
8
+ from langchain_community.document_loaders import UnstructuredURLLoader
9
+ from langchain.document_loaders import PyPDFLoader
10
+ import validators
11
+ import re
12
+ import os
13
+
14
# Page setup: browser-tab metadata first, then the visible headings.
_APP_TITLE = "LangChain: Summarize Text From YT, Website, or PDF"
st.set_page_config(page_title=_APP_TITLE, page_icon="🦜")
st.title("🦜 " + _APP_TITLE)
st.subheader("Summarize URL or PDF")

# The Groq API key is collected in the sidebar as a masked text field.
groq_api_key = st.sidebar.text_input(
    "Groq API Key",
    value="your api key",
    type="password",
)

# Content sources: a URL box (its label is collapsed) and a PDF uploader.
generic_url = st.text_input("URL", label_visibility="collapsed")
uploaded_file = st.file_uploader("Upload a PDF", type=["pdf"])

# Chat model used for summarization; built on every script run with
# whatever key is currently in the sidebar field.
llm = ChatGroq(model="llama-3.3-70b-versatile", groq_api_key=groq_api_key)

# Prompt for the summarize chain; "{text}" is the chain's single input slot.
prompt_template = """
Provide a summary of the following content in 300 words:
Content: {text}
"""
prompt = PromptTemplate(input_variables=["text"], template=prompt_template)
36
+
37
+ # Function to Extract Video ID
38
def get_video_id(url):
    """Return the 11-character YouTube video ID contained in *url*.

    Recognized URL shapes:

    * ``https://youtu.be/<id>``
    * ``https://<sub>.youtube.com/watch?v=<id>`` (``v`` may appear anywhere
      in the query string)
    * ``https://<sub>.youtube.com/{embed,shorts,live,v,e}/<id>``

    A missing scheme is tolerated (``youtu.be/<id>``).

    Args:
        url: The URL text to inspect.

    Returns:
        The video ID as a string, or ``None`` when *url* is empty, is not a
        YouTube URL, or does not carry a well-formed 11-character ID.
    """
    # Local import keeps this function self-contained with respect to the
    # file's import block.
    from urllib.parse import parse_qs, urlparse

    if not url or not isinstance(url, str):
        return None

    candidate_url = url.strip()
    if "://" not in candidate_url:
        # urlparse only fills in the host when a scheme separator is present.
        candidate_url = "https://" + candidate_url

    try:
        parsed = urlparse(candidate_url)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Raised for malformed netlocs such as an unterminated IPv6 literal.
        return None

    segments = [segment for segment in parsed.path.split("/") if segment]
    youtube_hosts = ("youtube.com", "youtube-nocookie.com")
    youtube_suffixes = (".youtube.com", ".youtube-nocookie.com")

    candidate = None
    if host == "youtu.be":
        candidate = segments[0] if segments else None
    elif host in youtube_hosts or host.endswith(youtube_suffixes):
        if segments and segments[0] in ("embed", "shorts", "live", "v", "e"):
            candidate = segments[1] if len(segments) > 1 else None
        else:
            candidate = parse_qs(parsed.query).get("v", [None])[0]

    # Require the whole token to be an ID: a prefix match would turn the
    # first 11 characters of a channel or playlist ID into a bogus video ID.
    if candidate and re.fullmatch(r"[0-9A-Za-z_-]{11}", candidate):
        return candidate
    return None
42
+
43
+ # Button to Summarize Content
44
def _summarize_documents(docs):
    """Run the "stuff" summarize chain over *docs* and return the summary text."""
    chain = load_summarize_chain(llm, chain_type="stuff", prompt=prompt)
    result = chain.invoke({"input_documents": docs})
    return result["output_text"]


def _load_pdf_documents(pdf_file):
    """Load an uploaded PDF into documents via a temporary file.

    The upload is written to a uniquely named temporary file rather than a
    path built from the user-supplied filename, and that file is removed
    even when loading fails.
    """
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=False) as temp_file:
        # getvalue() returns the full payload regardless of the read position.
        temp_file.write(pdf_file.getvalue())
        temp_file_path = temp_file.name
    try:
        return PyPDFLoader(file_path=temp_file_path).load()
    finally:
        os.remove(temp_file_path)


def _load_youtube_documents(video_id):
    """Fetch the transcript for *video_id* and wrap it in a single document."""
    # NOTE(review): get_transcript is the static entry point of older
    # youtube-transcript-api releases; confirm the installed version has it.
    transcript = YouTubeTranscriptApi.get_transcript(video_id)
    transcript_text = " ".join(entry["text"] for entry in transcript)
    return [Document(page_content=transcript_text)]


def _load_web_documents(url):
    """Load the page at *url* into documents with UnstructuredURLLoader."""
    loader = UnstructuredURLLoader(
        urls=[url],
        # NOTE(security): TLS certificate verification is disabled here, as in
        # the original code. Consider enabling it for untrusted URLs.
        ssl_verify=False,
        headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"},
    )
    return loader.load()


# Button handler: validate the inputs, load the content from whichever
# source was supplied (an uploaded PDF wins over a URL), then summarize it.
if st.button("Summarize the Content from YT, Website, or PDF"):
    source_url = generic_url.strip()

    if not groq_api_key.strip():
        st.error("Please provide your Groq API key.")
    elif not source_url and not uploaded_file:
        st.error("Please provide a URL or upload a PDF file.")
    elif source_url and not validators.url(source_url):
        st.error("Please enter a valid URL. It can be a YouTube video URL or a website URL.")
    else:
        try:
            with st.spinner("Fetching content..."):
                # None means "an error was already reported"; an empty list
                # means "the source was read but yielded nothing".
                docs = None
                if uploaded_file:
                    docs = _load_pdf_documents(uploaded_file)
                elif "youtube.com" in source_url or "youtu.be" in source_url:
                    video_id = get_video_id(source_url)
                    if not video_id:
                        st.error("Invalid YouTube URL. Please check and try again.")
                    else:
                        docs = _load_youtube_documents(video_id)
                else:
                    docs = _load_web_documents(source_url)

                if docs:
                    st.success(_summarize_documents(docs))
                elif docs is not None:
                    st.error("No text content could be extracted from the provided source.")
        except TranscriptsDisabled:
            st.error("Transcripts are disabled for this video.")
        except VideoUnavailable:
            st.error("The video is unavailable. Please check the URL.")
        except Exception as e:
            # Pass the exception object itself so Streamlit can render its
            # type and traceback; a pre-formatted string loses both.
            st.exception(e)
requirements.txt ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ langchain
2
+ python-dotenv
3
+ ipykernel
4
+ langchain-community
5
+ pypdf
6
+ bs4
7
+ arxiv
8
+ pymupdf
9
+ wikipedia
10
+ langchain-text-splitters
11
+ langchain-openai
12
+ chromadb
13
+ sentence_transformers
14
+ langchain_huggingface
15
+ faiss-cpu
16
+ langchain_chroma
17
+ duckdb
18
+ pandas
19
+ openai
20
+ langchain-groq
21
+ duckduckgo_search==5.3.1b1
22
+ pymupdf
23
+ arxiv
24
+ wikipedia
25
+ mysql-connector-python
26
+ SQLAlchemy
27
+ validators==0.28.1
28
+ youtube_transcript_api
29
+ unstructured
30
+ pytube
31
+ numexpr
32
+ huggingface_hub
33
+ youtube-transcript-api
34
+ streamlit