Akshay Kumar BM committed on
Commit
476cfe8
·
unverified ·
1 Parent(s): 60ca045

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -210
app.py CHANGED
@@ -1,231 +1,248 @@
 
 
1
  import validators
2
  import streamlit as st
 
3
  from langchain.prompts import PromptTemplate
4
  from langchain_groq import ChatGroq
5
  from langchain.chains.summarize import load_summarize_chain
6
  from langchain_community.document_loaders import YoutubeLoader, UnstructuredURLLoader, PyPDFLoader, TextLoader
7
  from langchain_text_splitters import RecursiveCharacterTextSplitter
8
  from langchain.schema import Document
9
- import os
10
- import tempfile
11
- #
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
# --- Page setup --------------------------------------------------------------
st.set_page_config(page_title="LangChain: Process Content from Multiple Sources", page_icon="🦜")
st.title("🦜 LangChain: Process Content from Multiple Sources")

# Per-PDF page-range selections must survive Streamlit's script reruns,
# so they are kept in session state, keyed by uploaded file name.
if 'pdf_page_ranges' not in st.session_state:
    st.session_state.pdf_page_ranges = {}

def calculate_chunk_size(text_length, model_context_length):
    """Pick a chunk size (in characters) of roughly one third of the model
    context window, floored at 1000.

    NOTE(review): ``text_length`` is accepted but never used — presumably the
    intent was to also cap the chunk by the input size; confirm before relying
    on it.
    """
    target_chunk_size = model_context_length // 3
    # model_context_length // 3 is always <= model_context_length // 2, so the
    # min() always resolves to target_chunk_size; the 1000 floor dominates for
    # small context windows.
    return max(1000, min(target_chunk_size, model_context_length // 2))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
# --- Sidebar: credentials and model choice -----------------------------------
# NOTE(review): indentation below is reconstructed from a diff view; the exact
# nesting of widgets relative to the sidebar block is inferred — confirm
# against the original file.
with st.sidebar:
    st.header("Configuration")
    groq_api_key = st.text_input("Groq API Key", type="password")
    model = st.selectbox("Select Model", ["llama3-8b-8192", "gemma2-9b-it", "mixtral-8x7b-32768"])

    st.header("PDF Settings")

# --- Main area: choose which kinds of sources to ingest ----------------------
st.subheader('Select Sources to Process')
use_urls = st.checkbox("URLs (YouTube or websites)")
use_files = st.checkbox("File Upload (PDF or text files)")
use_text = st.checkbox("Text Input")

# Maps source kind ('urls' | 'files' | 'text') to the user-provided payload.
sources = {}

if use_urls:
    sources['urls'] = st.text_area("Enter URLs (one per line)", placeholder="https://example.com\nhttps://youtube.com/watch?v=...")

if use_files:
    uploaded_files = st.file_uploader("Upload PDF or text files", type=["pdf", "txt"], accept_multiple_files=True)
    if uploaded_files:
        sources['files'] = uploaded_files

        # Pre-scan each PDF so a per-file page-range slider can be shown in
        # the sidebar before processing starts.
        for uploaded_file in uploaded_files:
            if uploaded_file.type == "application/pdf":
                # PyPDFLoader needs a real file path, so spill the in-memory
                # upload to a temp file (deleted below with os.unlink).
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                    temp_file.write(uploaded_file.getvalue())
                    temp_file_path = temp_file.name

                loader = PyPDFLoader(temp_file_path)
                pdf_pages = loader.load()
                total_pages = len(pdf_pages)

                # Stable widget/session key derived from the file name.
                file_key = f"pdf_range_{uploaded_file.name}"

                if file_key not in st.session_state.pdf_page_ranges:
                    st.session_state.pdf_page_ranges[file_key] = (1, total_pages)

                with st.sidebar:
                    st.write(f"PDF: {uploaded_file.name}")
                    st.write(f"Total pages: {total_pages}")
                    if total_pages > 1:
                        page_range = st.slider(
                            f"Select page range for {uploaded_file.name}",
                            1, total_pages,
                            value=st.session_state.pdf_page_ranges[file_key],
                            key=file_key
                        )
                        st.session_state.pdf_page_ranges[file_key] = page_range
                    else:
                        st.write("This PDF has only one page.")
                        st.session_state.pdf_page_ranges[file_key] = (1, 1)

                os.unlink(temp_file_path)

if use_text:
    sources['text'] = st.text_area("Enter text content", placeholder="Paste your text here...")

# Actions offered in "Predefined" mode; the chosen one is interpolated into
# the prompts below as {action}.
predefined_actions = [
    "Summarize", "Analyze", "Review", "Critique", "Explain",
    "Paraphrase", "Simplify", "Elaborate", "Extract key points",
    "Provide an overview", "Highlight main ideas", "Create an outline",
    "Generate a report", "Identify themes", "List pros and cons",
    "Fact-check", "Create study notes", "Generate questions"
]

action_type = st.radio("Choose action type", ["Predefined", "Custom"])

if action_type == "Predefined":
    action = st.selectbox("Select Action", predefined_actions)
else:
    action = st.text_input("Enter Custom Action", placeholder="e.g., Summarize in bullet points")

# Initial-pass prompt for the "refine" summarization chain.
prompt_template = """
Provide a {action} of the following content:

Content: {text}

{action}:
"""

# Follow-up prompt: folds each additional chunk into the running answer.
refine_template = """
We have provided an existing {action} of the content: {existing_answer}

We have some additional content to incorporate: {text}

Given this new information, please refine and update the existing {action}.

Refined {action}:
"""

prompt = PromptTemplate(input_variables=['text', 'action'], template=prompt_template)
refine_prompt = PromptTemplate(input_variables=['text', 'action', 'existing_answer'], template=refine_template)

if st.button("Process Content"):
    # Validate inputs before doing any work.
    if not groq_api_key.strip():
        st.error("Please provide your Groq API key in the sidebar.")
    elif not sources:
        st.error("Please select at least one source type and provide content.")
    elif action_type == "Custom" and not action.strip():
        st.error("Please enter a custom action.")
    else:
        try:
            llm = ChatGroq(model=model, groq_api_key=groq_api_key)

            all_docs = []  # accumulated LangChain Documents from every source

            with st.spinner(f"Processing... ({action.lower()})"):
                # 1) URLs: YouTube transcripts vs. generic web pages.
                if 'urls' in sources and sources['urls']:
                    url_list = [url.strip() for url in sources['urls'].split('\n') if url.strip()]
                    for url in url_list:
                        if not validators.url(url):
                            st.warning(f"Skipping invalid URL: {url}")
                            continue

                        if "youtube.com" in url or "youtu.be" in url:
                            loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
                            st.info(f"Processing YouTube video: {url}")
                        else:
                            loader = UnstructuredURLLoader(
                                urls=[url],
                                ssl_verify=False,
                                headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"}
                            )
                            st.info(f"Processing website content: {url}")

                        docs = loader.load()
                        all_docs.extend(docs)

                # 2) Uploaded files: PDFs are page-range-filtered and packed
                #    into page-aligned chunks here; other files go through
                #    TextLoader.
                if 'files' in sources and sources['files']:
                    for uploaded_file in sources['files']:
                        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(uploaded_file.name)[1]) as temp_file:
                            temp_file.write(uploaded_file.getvalue())
                            temp_file_path = temp_file.name

                        if uploaded_file.type == "application/pdf":
                            loader = PyPDFLoader(temp_file_path)
                            st.info(f"Processing PDF: {uploaded_file.name}")

                            pdf_pages = loader.load()
                            file_key = f"pdf_range_{uploaded_file.name}"
                            page_range = st.session_state.pdf_page_ranges[file_key]

                            # Slider values are 1-based and inclusive.
                            selected_pages = pdf_pages[page_range[0]-1:page_range[1]]

                            chunk_size = calculate_chunk_size(sum(len(page.page_content) for page in selected_pages), 8192)
                            current_chunk = []
                            current_chunk_size = 0

                            # Greedily pack whole pages into ~chunk_size-char chunks.
                            for page in selected_pages:
                                page_size = len(page.page_content)
                                if current_chunk_size + page_size > chunk_size and current_chunk:
                                    all_docs.append(Document(page_content="\n".join([p.page_content for p in current_chunk]), metadata={"source": uploaded_file.name}))
                                    current_chunk = []
                                    current_chunk_size = 0
                                current_chunk.append(page)
                                current_chunk_size += page_size

                            # Flush the final partial chunk.
                            if current_chunk:
                                all_docs.append(Document(page_content="\n".join([p.page_content for p in current_chunk]), metadata={"source": uploaded_file.name}))
                        else:
                            loader = TextLoader(temp_file_path)
                            st.info(f"Processing text file: {uploaded_file.name}")
                            docs = loader.load()
                            all_docs.extend(docs)

                        os.unlink(temp_file_path)

                # 3) Raw pasted text: round-trip via a temp file for TextLoader.
                if 'text' in sources and sources['text']:
                    with tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".txt") as temp_file:
                        temp_file.write(sources['text'])
                        temp_file_path = temp_file.name

                    loader = TextLoader(temp_file_path)
                    docs = loader.load()
                    all_docs.extend(docs)
                    st.info("Processing text input")

                    os.unlink(temp_file_path)

            if not all_docs:
                st.error("No content was processed. Please check your inputs and try again.")
            # BUG: execution falls through even when all_docs is empty — the
            # splitter and chain below still run on zero documents.

            # PDFs were already hand-chunked above, so only split the rest.
            text_splitter = RecursiveCharacterTextSplitter(chunk_size=calculate_chunk_size(sum(len(doc.page_content) for doc in all_docs), 8192), chunk_overlap=200)
            split_docs = []
            for doc in all_docs:
                if doc.metadata.get("source", "").lower().endswith(".pdf"):
                    split_docs.append(doc)
                else:
                    split_docs.extend(text_splitter.split_documents([doc]))

            chain = load_summarize_chain(
                llm=llm,
                chain_type="refine",
                question_prompt=prompt,
                refine_prompt=refine_prompt
            )

            output = chain.run(input_documents=split_docs, action=action.lower())

            st.success("Processing complete!")
            st.subheader(f"{action} Result")
            st.write(output)

        except Exception as e:
            st.error(f"An error occurred: {str(e)}")

st.divider()
st.caption("Powered by LangChain and Groq")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
  import validators
4
  import streamlit as st
5
+ from typing import List, Dict, Any
6
  from langchain.prompts import PromptTemplate
7
  from langchain_groq import ChatGroq
8
  from langchain.chains.summarize import load_summarize_chain
9
  from langchain_community.document_loaders import YoutubeLoader, UnstructuredURLLoader, PyPDFLoader, TextLoader
10
  from langchain_text_splitters import RecursiveCharacterTextSplitter
11
  from langchain.schema import Document
12
+ from langchain.vectorstores import FAISS
13
+ from langchain.embeddings import HuggingFaceEmbeddings
14
+ from langchain.chains import RetrievalQA
15
+ from dotenv import load_dotenv
16
+
17
class ContentProcessor:
    """Streamlit application that ingests content from URLs, uploaded files,
    or pasted text and runs one of two tasks against a Groq-hosted LLM:

    * "Process Content"  — refine-style summarization/transformation of the
      ingested documents using a user-chosen action.
    * "Interactive Q&A"  — retrieval-augmented question answering over a
      FAISS index built from the ingested documents.
    """

    def __init__(self):
        # Pull secrets (e.g. LANGCHAIN_API_KEY) from a local .env file first,
        # then configure tracing and the Streamlit page.
        load_dotenv()
        self.configure_environment()
        self.configure_streamlit()

    def configure_environment(self):
        """Export LangSmith tracing configuration to the process environment."""
        # Fix: assigning None to os.environ raises TypeError (environ values
        # must be strings), so the app used to crash at startup whenever
        # LANGCHAIN_API_KEY was not set. Only export the key when a value is
        # actually available; tracing simply stays unauthenticated otherwise.
        langchain_api_key = os.getenv("LANGCHAIN_API_KEY")
        if langchain_api_key:
            os.environ['LANGCHAIN_API_KEY'] = langchain_api_key
        os.environ['LANGCHAIN_TRACING_V2'] = "true"
        os.environ['LANGCHAIN_PROJECT'] = "LangChain: Process Content from Multiple Sources"
        os.environ["LANGCHAIN_ENDPOINT"] = "https://api.smith.langchain.com"

    def configure_streamlit(self):
        """Configure page chrome; st.set_page_config must precede other st calls."""
        st.set_page_config(page_title="LangChain: Process Content from Multiple Sources", page_icon="🦜")
        st.title("🦜 LangChain: Process Content from Multiple Sources")

    def calculate_chunk_size(self, text_length: int, model_context_length: int) -> int:
        """Return a chunk size in characters: ~1/3 of the model context, floored at 1000.

        NOTE(review): ``text_length`` is currently unused; it is kept for
        interface compatibility. ``model_context_length // 3`` is always
        smaller than ``// 2``, so the ``min`` keeps the one-third target.
        """
        target_chunk_size = model_context_length // 3
        return max(1000, min(target_chunk_size, model_context_length // 2))

    def get_configuration(self) -> Dict[str, Any]:
        """Render the sidebar widgets and return the user's configuration.

        Returns a dict with keys 'groq_api_key', 'model', and 'task'.
        """
        with st.sidebar:
            st.header("Configuration")
            groq_api_key = st.text_input("Groq API Key", type="password")
            model = st.selectbox("Select Model", ["llama3-8b-8192", "gemma2-9b-it", "mixtral-8x7b-32768"])

            st.header("Task")
            task = st.radio("Choose task", ["Process Content", "Interactive Q&A"], index=0)

        return {"groq_api_key": groq_api_key, "model": model, "task": task}

    def get_sources(self) -> Dict[str, Any]:
        """Render source-selection widgets and collect user-provided inputs.

        Returns a dict mapping source kind ('urls' | 'files' | 'text') to the
        raw payload; kinds the user did not enable are simply absent.
        """
        st.subheader('Select Sources to Process')
        use_urls = st.checkbox("URLs (YouTube or websites)")
        use_files = st.checkbox("File Upload (PDF or text files)")
        use_text = st.checkbox("Text Input")

        sources = {}
        if use_urls:
            sources['urls'] = st.text_area("Enter URLs (one per line)", placeholder="https://example.com\nhttps://youtube.com/watch?v=...")
        if use_files:
            sources['files'] = st.file_uploader("Upload PDF or text files", type=["pdf", "txt"], accept_multiple_files=True)
        if use_text:
            sources['text'] = st.text_area("Enter text content", placeholder="Paste your text here...")
        return sources

    def process_pdf(self, uploaded_file) -> "List[Document]":
        """Load every page of an uploaded PDF as LangChain Documents.

        The in-memory upload is spilled to a temp file because PyPDFLoader
        requires a filesystem path; the temp file is removed afterwards.
        """
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(uploaded_file.getvalue())
            temp_file_path = temp_file.name

        loader = PyPDFLoader(temp_file_path)
        pdf_pages = loader.load()

        st.sidebar.write(f"Processing PDF: {uploaded_file.name}")
        st.sidebar.write(f"Total pages: {len(pdf_pages)}")

        os.unlink(temp_file_path)
        return pdf_pages

    def process_content(self, sources: Dict[str, Any]) -> "List[Document]":
        """Load every selected source into a flat list of Documents.

        URLs are dispatched to YoutubeLoader or UnstructuredURLLoader; uploads
        go through process_pdf or TextLoader; pasted text is round-tripped
        through a temp file so TextLoader can read it.
        """
        all_docs = []

        if 'urls' in sources and sources['urls']:
            url_list = [url.strip() for url in sources['urls'].split('\n') if url.strip()]
            for url in url_list:
                if not validators.url(url):
                    st.warning(f"Skipping invalid URL: {url}")
                    continue

                if "youtube.com" in url or "youtu.be" in url:
                    loader = YoutubeLoader.from_youtube_url(url, add_video_info=True)
                    st.info(f"Processing YouTube video: {url}")
                else:
                    loader = UnstructuredURLLoader(
                        urls=[url],
                        ssl_verify=False,
                        headers={"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 13_5_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36"}
                    )
                    st.info(f"Processing website content: {url}")

                docs = loader.load()
                all_docs.extend(docs)

        if 'files' in sources and sources['files']:
            for uploaded_file in sources['files']:
                if uploaded_file.type == "application/pdf":
                    st.info(f"Processing PDF: {uploaded_file.name}")
                    all_docs.extend(self.process_pdf(uploaded_file))
                else:
                    # Non-PDF uploads are assumed to be text; TextLoader needs a path.
                    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
                        temp_file.write(uploaded_file.getvalue())
                        temp_file_path = temp_file.name

                    loader = TextLoader(temp_file_path)
                    st.info(f"Processing text file: {uploaded_file.name}")
                    docs = loader.load()
                    all_docs.extend(docs)
                    os.unlink(temp_file_path)

        if 'text' in sources and sources['text']:
            with tempfile.NamedTemporaryFile(delete=False, mode="w", suffix=".txt") as temp_file:
                temp_file.write(sources['text'])
                temp_file_path = temp_file.name

            loader = TextLoader(temp_file_path)
            docs = loader.load()
            all_docs.extend(docs)
            st.info("Processing text input")
            os.unlink(temp_file_path)

        return all_docs

    def create_prompts(self) -> "Dict[str, PromptTemplate]":
        """Build the question/refine prompt pair for the refine chain.

        Returns a dict with keys 'prompt' (initial pass over the first chunk)
        and 'refine_prompt' (folds each further chunk into the running answer).
        """
        prompt_template = """
Provide a {action} of the following content:

Content: {text}

{action}:
"""

        refine_template = """
We have provided an existing {action} of the content: {existing_answer}

We have some additional content to incorporate: {text}

Given this new information, please refine and update the existing {action}.

Refined {action}:
"""

        return {
            "prompt": PromptTemplate(input_variables=['text', 'action'], template=prompt_template),
            "refine_prompt": PromptTemplate(input_variables=['text', 'action', 'existing_answer'], template=refine_template)
        }

    def process_documents(self, docs: "List[Document]", action: str, config: Dict[str, Any]) -> str:
        """Split the documents and run the refine summarization chain.

        The chunk size targets ~1/3 of an 8192-token context (see
        calculate_chunk_size); the chain receives the user's action lowercased.
        """
        llm = ChatGroq(model=config['model'], groq_api_key=config['groq_api_key'])

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.calculate_chunk_size(sum(len(doc.page_content) for doc in docs), 8192),
            chunk_overlap=200
        )
        split_docs = text_splitter.split_documents(docs)

        prompts = self.create_prompts()
        chain = load_summarize_chain(
            llm=llm,
            chain_type="refine",
            question_prompt=prompts["prompt"],
            refine_prompt=prompts["refine_prompt"]
        )

        return chain.run(input_documents=split_docs, action=action.lower())

    def create_retriever(self, docs: "List[Document]") -> "FAISS":
        """Embed the documents with a local sentence-transformer and index them in FAISS."""
        embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        return FAISS.from_documents(docs, embeddings)

    def answer_question(self, retriever: "FAISS", question: str, config: Dict[str, Any]) -> str:
        """Answer a question with a stuff-type RetrievalQA chain over the FAISS index."""
        llm = ChatGroq(model=config['model'], groq_api_key=config['groq_api_key'])
        qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever.as_retriever())
        return qa_chain.run(question)

    def run(self):
        """Render the full UI and dispatch to summarization or Q&A."""
        config = self.get_configuration()
        sources = self.get_sources()

        if config['task'] == "Process Content":
            action_type = st.radio("Choose action type", ["Predefined", "Custom"])
            if action_type == "Predefined":
                action = st.selectbox("Select Action", self.predefined_actions)
            else:
                action = st.text_input("Enter Custom Action", placeholder="e.g., Summarize in bullet points")
        else:
            # Q&A mode needs no user-chosen action; action_type is undefined
            # here, but the validation below short-circuits before reading it.
            action = "Answer questions about the content"

        process_button = st.button("Process Content")

        # Docs and retriever live in session state so Q&A survives Streamlit reruns.
        if 'docs' not in st.session_state:
            st.session_state.docs = None
        if 'retriever' not in st.session_state:
            st.session_state.retriever = None

        if process_button:
            if not config['groq_api_key'].strip():
                st.error("Please provide your Groq API Key in the sidebar.")
            elif not sources:
                st.error("Please select at least one source type and provide content.")
            elif config['task'] == "Process Content" and action_type == "Custom" and not action.strip():
                st.error("Please enter a custom action.")
            else:
                with st.spinner("Processing..."):
                    st.session_state.docs = self.process_content(sources)

                if not st.session_state.docs:
                    st.error("No content was processed. Please check your inputs and try again.")
                elif config['task'] == "Process Content":
                    output = self.process_documents(st.session_state.docs, action, config)
                    st.success("Processing complete!")
                    st.subheader(f"{action} Result")
                    st.write(output)
                else:  # Interactive Q&A
                    st.session_state.retriever = self.create_retriever(st.session_state.docs)
                    st.success("Document processed and ready for questions!")

        if config['task'] == "Interactive Q&A" and st.session_state.retriever is not None:
            question = st.text_input("Ask a question about the document:")
            if question:
                with st.spinner("Finding answer..."):
                    answer = self.answer_question(st.session_state.retriever, question, config)
                st.subheader("Answer")
                st.write(answer)

        st.divider()
        st.caption("Powered by LangChain and Groq")
        st.caption("Created by : Akshay Kumar BM")

    @property
    def predefined_actions(self):
        """Canned actions offered in 'Predefined' mode."""
        return [
            "Summarize", "Analyze", "Review", "Critique", "Explain",
            "Paraphrase", "Simplify", "Elaborate", "Extract key points",
            "Provide an overview", "Highlight main ideas", "Create an outline",
            "Generate a report", "Identify themes", "List pros and cons",
            "Fact-check", "Create study notes", "Generate questions"
        ]
245
+
246
# Script entry point. Under "streamlit run app.py" the module executes as
# __main__ on every UI interaction, rebuilding the app and rendering the page.
if __name__ == "__main__":
    processor = ContentProcessor()
    processor.run()