MahatirTusher commited on
Commit
7dab3ce
·
verified ·
1 Parent(s): 07cd63c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -28
app.py CHANGED
@@ -4,12 +4,14 @@ from langchain_community.document_loaders import WebBaseLoader
4
  from langchain.embeddings import HuggingFaceEmbeddings
5
  from langchain_community.vectorstores.faiss import FAISS
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
 
7
  import os
8
  import time
9
  from langchain_groq import ChatGroq
10
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
11
  from langchain.prompts import PromptTemplate
12
  from bs4 import SoupStrainer
 
13
 
14
  # Load environment variables (optional)
15
  load_dotenv()
@@ -100,6 +102,8 @@ st.markdown("""
100
  # Display large logo at the top of the main page
101
  st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
102
 
 
 
103
 
104
  # Initialize session state
105
  if "index_created" not in st.session_state:
@@ -115,12 +119,16 @@ if "summary" not in st.session_state:
115
  if "embeddings" not in st.session_state:
116
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
117
 
118
- # Sidebar for URL input
119
  with st.sidebar:
120
  st.header("Enter Web URL")
121
  url = st.text_input("URL", placeholder="e.g., https://mahatirtusher.com/astronomy-mythology/")
122
  process_url_clicked = st.button("Process URL")
123
 
 
 
 
 
124
  # Main content container
125
  main_container = st.container()
126
 
@@ -133,7 +141,7 @@ llm = ChatGroq(
133
 
134
  # Custom prompt for detailed answers
135
  qa_prompt = PromptTemplate(
136
- template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasing and knowledge to explain anything to the users. If the users ask any question in bengali, you too will answer it in fine and detailed bengali.
137
 
138
  Context: {context}
139
 
@@ -142,7 +150,7 @@ Question: {question}
142
  Answer with sources: """
143
  )
144
 
145
- # Function to summarize URL content
146
  def summarize_content(content, llm):
147
  summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
148
 
@@ -158,7 +166,39 @@ def save_faiss_index(vectorstore, path):
158
  def load_faiss_index(path, embeddings):
159
  return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
160
 
161
- # Process URL
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
162
  if process_url_clicked:
163
  with main_container:
164
  if not url.strip():
@@ -181,31 +221,45 @@ if process_url_clicked:
181
 
182
  # Store content for summarization
183
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
 
 
 
 
184
 
185
- st.text("Text Splitter...Started...✅✅✅")
186
- text_splitter = RecursiveCharacterTextSplitter(
187
- separators=['\n\n', '\n', '.', ','],
188
- chunk_size=1000
189
- )
190
- docs = text_splitter.split_documents(data)
 
 
 
 
 
 
191
 
192
- if not docs:
193
- st.error("No document chunks created. Try a different URL.")
 
 
 
 
194
  st.stop()
195
- st.text(f"Split into {len(docs)} document chunks.")
196
 
197
- st.text("Embedding Vector Started Building...✅✅✅")
 
 
198
  embeddings = st.session_state.embeddings
199
- vectorstore = FAISS.from_documents(docs, embeddings)
200
-
201
- faiss_index_path = "faiss_index"
202
- save_faiss_index(vectorstore, faiss_index_path)
203
- st.session_state.vectorstore = vectorstore # Cache the vectorstore
204
- st.session_state.index_created = True
205
- st.text("FAISS index saved successfully! ✅✅✅")
206
- time.sleep(2)
207
  except Exception as e:
208
- st.error(f"Error processing URL: {str(e)}")
209
 
210
  # Summary button
211
  with main_container:
@@ -216,31 +270,31 @@ with main_container:
216
  # Display summary if generated
217
  if st.session_state.summary:
218
  with main_container:
219
- st.header("Summary of the URL Content")
220
  st.write(st.session_state.summary)
221
 
222
  # Query input with Ask button
223
  with main_container:
224
  st.header("Ask a Question")
225
- query = st.text_input("Question", placeholder="e.g., What is the article about?")
226
  ask_clicked = st.button("Ask")
227
 
228
  if ask_clicked and query:
229
  with main_container:
230
  if not st.session_state.index_created or st.session_state.vectorstore is None:
231
- st.error("No FAISS index found. Please process a URL first.")
232
  else:
233
  with st.spinner("Processing your question..."):
234
  try:
235
  chain = RetrievalQAWithSourcesChain.from_llm(
236
  llm=llm,
237
- retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 2}), # Limit to top 2 documents
238
  question_prompt=qa_prompt
239
  )
240
  result = chain({"question": query}, return_only_outputs=True)
241
 
242
  if not result.get("answer"):
243
- st.warning("No answer generated. Try a different question or URL.")
244
  st.stop()
245
 
246
  st.header("Answer")
 
4
  from langchain.embeddings import HuggingFaceEmbeddings
5
  from langchain_community.vectorstores.faiss import FAISS
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
+ from langchain_core.documents import Document
8
  import os
9
  import time
10
  from langchain_groq import ChatGroq
11
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
12
  from langchain.prompts import PromptTemplate
13
  from bs4 import SoupStrainer
14
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
 
16
  # Load environment variables (optional)
17
  load_dotenv()
 
102
  # Display large logo at the top of the main page
103
  st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
104
 
105
+ # Set Streamlit app title
106
+ st.title("WebChatter 💬")
107
 
108
  # Initialize session state
109
  if "index_created" not in st.session_state:
 
119
  if "embeddings" not in st.session_state:
120
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
121
 
122
+ # Sidebar for URL and YouTube input
123
  with st.sidebar:
124
  st.header("Enter Web URL")
125
  url = st.text_input("URL", placeholder="e.g., https://mahatirtusher.com/astronomy-mythology/")
126
  process_url_clicked = st.button("Process URL")
127
 
128
+ st.header("Enter YouTube URL")
129
+ youtube_url = st.text_input("YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ")
130
+ process_youtube_clicked = st.button("Process YouTube Video")
131
+
132
  # Main content container
133
  main_container = st.container()
134
 
 
141
 
142
  # Custom prompt for detailed answers
143
  qa_prompt = PromptTemplate(
144
+ template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
145
 
146
  Context: {context}
147
 
 
150
  Answer with sources: """
151
  )
152
 
153
+ # Function to summarize content
154
  def summarize_content(content, llm):
155
  summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
156
 
 
166
  def load_faiss_index(path, embeddings):
167
  return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
168
 
169
+ # Function to extract YouTube video ID from URL
170
+ def get_youtube_video_id(url):
171
+ if "youtube.com/watch?v=" in url:
172
+ return url.split("v=")[1].split("&")[0]
173
+ elif "youtu.be/" in url:
174
+ return url.split("youtu.be/")[1].split("?")[0]
175
+ return None
176
+
177
+ # Function to process content (web or YouTube)
178
+ def process_content(docs, embeddings):
179
+ st.text("Text Splitter...Started...✅✅✅")
180
+ text_splitter = RecursiveCharacterTextSplitter(
181
+ separators=['\n\n', '\n', '.', ','],
182
+ chunk_size=1000
183
+ )
184
+ docs = text_splitter.split_documents(docs)
185
+
186
+ if not docs:
187
+ st.error("No document chunks created. Try a different URL or video.")
188
+ st.stop()
189
+ st.text(f"Split into {len(docs)} document chunks.")
190
+
191
+ st.text("Embedding Vector Started Building...✅✅✅")
192
+ vectorstore = FAISS.from_documents(docs, embeddings)
193
+
194
+ faiss_index_path = "faiss_index"
195
+ save_faiss_index(vectorstore, faiss_index_path)
196
+ st.session_state.vectorstore = vectorstore # Cache the vectorstore
197
+ st.session_state.index_created = True
198
+ st.text("FAISS index saved successfully! ✅✅✅")
199
+ time.sleep(2)
200
+
201
+ # Process Web URL
202
  if process_url_clicked:
203
  with main_container:
204
  if not url.strip():
 
221
 
222
  # Store content for summarization
223
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
224
+ embeddings = st.session_state.embeddings
225
+ process_content(data, embeddings)
226
+ except Exception as e:
227
+ st.error(f"Error processing URL: {str(e)}")
228
 
229
+ # Process YouTube Video
230
+ if process_youtube_clicked:
231
+ with main_container:
232
+ if not youtube_url.strip():
233
+ st.error("Please provide a valid YouTube URL.")
234
+ else:
235
+ with st.spinner("Processing YouTube Video..."):
236
+ try:
237
+ video_id = get_youtube_video_id(youtube_url)
238
+ if not video_id:
239
+ st.error("Invalid YouTube URL. Please provide a URL like https://www.youtube.com/watch?v=VIDEO_ID.")
240
+ st.stop()
241
 
242
+ st.text("Fetching Transcript...Started...✅✅✅")
243
+ transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'bn'])
244
+ transcript_text = " ".join([entry['text'] for entry in transcript])
245
+
246
+ if not transcript_text.strip():
247
+ st.error("No transcript available for this video. Try a different video.")
248
  st.stop()
 
249
 
250
+ # Create a Document object from the transcript
251
+ doc = Document(page_content=transcript_text, metadata={"source": youtube_url})
252
+ st.session_state.url_content = transcript_text # Store for summarization
253
  embeddings = st.session_state.embeddings
254
+ process_content([doc], embeddings)
255
+ except TranscriptsDisabled:
256
+ st.error("Transcripts are disabled for this video. Try a different video.")
257
+ st.stop()
258
+ except NoTranscriptFound:
259
+ st.error("No transcript found in the supported languages (English or Bengali). Try a different video.")
260
+ st.stop()
 
261
  except Exception as e:
262
+ st.error(f"Error processing YouTube video: {str(e)}")
263
 
264
  # Summary button
265
  with main_container:
 
270
  # Display summary if generated
271
  if st.session_state.summary:
272
  with main_container:
273
+ st.header("Summary of the Content")
274
  st.write(st.session_state.summary)
275
 
276
  # Query input with Ask button
277
  with main_container:
278
  st.header("Ask a Question")
279
+ query = st.text_input("Question", placeholder="e.g., What is the video or article about?")
280
  ask_clicked = st.button("Ask")
281
 
282
  if ask_clicked and query:
283
  with main_container:
284
  if not st.session_state.index_created or st.session_state.vectorstore is None:
285
+ st.error("No content processed. Please process a URL or YouTube video first.")
286
  else:
287
  with st.spinner("Processing your question..."):
288
  try:
289
  chain = RetrievalQAWithSourcesChain.from_llm(
290
  llm=llm,
291
+ retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 2}),
292
  question_prompt=qa_prompt
293
  )
294
  result = chain({"question": query}, return_only_outputs=True)
295
 
296
  if not result.get("answer"):
297
+ st.warning("No answer generated. Try a different question or content.")
298
  st.stop()
299
 
300
  st.header("Answer")