Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,12 +4,14 @@ from langchain_community.document_loaders import WebBaseLoader
|
|
| 4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 5 |
from langchain_community.vectorstores.faiss import FAISS
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 7 |
import os
|
| 8 |
import time
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 11 |
from langchain.prompts import PromptTemplate
|
| 12 |
from bs4 import SoupStrainer
|
|
|
|
| 13 |
|
| 14 |
# Load environment variables (optional)
|
| 15 |
load_dotenv()
|
|
@@ -100,6 +102,8 @@ st.markdown("""
|
|
| 100 |
# Display large logo at the top of the main page
|
| 101 |
st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
|
| 102 |
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# Initialize session state
|
| 105 |
if "index_created" not in st.session_state:
|
|
@@ -115,12 +119,16 @@ if "summary" not in st.session_state:
|
|
| 115 |
if "embeddings" not in st.session_state:
|
| 116 |
st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 117 |
|
| 118 |
-
# Sidebar for URL input
|
| 119 |
with st.sidebar:
|
| 120 |
st.header("Enter Web URL")
|
| 121 |
url = st.text_input("URL", placeholder="e.g., https://mahatirtusher.com/astronomy-mythology/")
|
| 122 |
process_url_clicked = st.button("Process URL")
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
# Main content container
|
| 125 |
main_container = st.container()
|
| 126 |
|
|
@@ -133,7 +141,7 @@ llm = ChatGroq(
|
|
| 133 |
|
| 134 |
# Custom prompt for detailed answers
|
| 135 |
qa_prompt = PromptTemplate(
|
| 136 |
-
template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own
|
| 137 |
|
| 138 |
Context: {context}
|
| 139 |
|
|
@@ -142,7 +150,7 @@ Question: {question}
|
|
| 142 |
Answer with sources: """
|
| 143 |
)
|
| 144 |
|
| 145 |
-
# Function to summarize
|
| 146 |
def summarize_content(content, llm):
|
| 147 |
summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
|
| 148 |
|
|
@@ -158,7 +166,39 @@ def save_faiss_index(vectorstore, path):
|
|
| 158 |
def load_faiss_index(path, embeddings):
|
| 159 |
return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
|
| 160 |
|
| 161 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
if process_url_clicked:
|
| 163 |
with main_container:
|
| 164 |
if not url.strip():
|
|
@@ -181,31 +221,45 @@ if process_url_clicked:
|
|
| 181 |
|
| 182 |
# Store content for summarization
|
| 183 |
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
st.stop()
|
| 195 |
-
st.text(f"Split into {len(docs)} document chunks.")
|
| 196 |
|
| 197 |
-
|
|
|
|
|
|
|
| 198 |
embeddings = st.session_state.embeddings
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
st.
|
| 205 |
-
st.
|
| 206 |
-
time.sleep(2)
|
| 207 |
except Exception as e:
|
| 208 |
-
st.error(f"Error processing
|
| 209 |
|
| 210 |
# Summary button
|
| 211 |
with main_container:
|
|
@@ -216,31 +270,31 @@ with main_container:
|
|
| 216 |
# Display summary if generated
|
| 217 |
if st.session_state.summary:
|
| 218 |
with main_container:
|
| 219 |
-
st.header("Summary of the
|
| 220 |
st.write(st.session_state.summary)
|
| 221 |
|
| 222 |
# Query input with Ask button
|
| 223 |
with main_container:
|
| 224 |
st.header("Ask a Question")
|
| 225 |
-
query = st.text_input("Question", placeholder="e.g., What is the article about?")
|
| 226 |
ask_clicked = st.button("Ask")
|
| 227 |
|
| 228 |
if ask_clicked and query:
|
| 229 |
with main_container:
|
| 230 |
if not st.session_state.index_created or st.session_state.vectorstore is None:
|
| 231 |
-
st.error("No
|
| 232 |
else:
|
| 233 |
with st.spinner("Processing your question..."):
|
| 234 |
try:
|
| 235 |
chain = RetrievalQAWithSourcesChain.from_llm(
|
| 236 |
llm=llm,
|
| 237 |
-
retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 2}),
|
| 238 |
question_prompt=qa_prompt
|
| 239 |
)
|
| 240 |
result = chain({"question": query}, return_only_outputs=True)
|
| 241 |
|
| 242 |
if not result.get("answer"):
|
| 243 |
-
st.warning("No answer generated. Try a different question or
|
| 244 |
st.stop()
|
| 245 |
|
| 246 |
st.header("Answer")
|
|
|
|
| 4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 5 |
from langchain_community.vectorstores.faiss import FAISS
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_core.documents import Document
|
| 8 |
import os
|
| 9 |
import time
|
| 10 |
from langchain_groq import ChatGroq
|
| 11 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 12 |
from langchain.prompts import PromptTemplate
|
| 13 |
from bs4 import SoupStrainer
|
| 14 |
+
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
| 15 |
|
| 16 |
# Load environment variables (optional)
|
| 17 |
load_dotenv()
|
|
|
|
| 102 |
# Display large logo at the top of the main page
|
| 103 |
st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
|
| 104 |
|
| 105 |
+
# Set Streamlit app title
|
| 106 |
+
st.title("WebChatter π¬")
|
| 107 |
|
| 108 |
# Initialize session state
|
| 109 |
if "index_created" not in st.session_state:
|
|
|
|
| 119 |
if "embeddings" not in st.session_state:
|
| 120 |
st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 121 |
|
| 122 |
+
# Sidebar for URL and YouTube input
|
| 123 |
with st.sidebar:
|
| 124 |
st.header("Enter Web URL")
|
| 125 |
url = st.text_input("URL", placeholder="e.g., https://mahatirtusher.com/astronomy-mythology/")
|
| 126 |
process_url_clicked = st.button("Process URL")
|
| 127 |
|
| 128 |
+
st.header("Enter YouTube URL")
|
| 129 |
+
youtube_url = st.text_input("YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ")
|
| 130 |
+
process_youtube_clicked = st.button("Process YouTube Video")
|
| 131 |
+
|
| 132 |
# Main content container
|
| 133 |
main_container = st.container()
|
| 134 |
|
|
|
|
| 141 |
|
| 142 |
# Custom prompt for detailed answers
|
| 143 |
qa_prompt = PromptTemplate(
|
| 144 |
+
template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
|
| 145 |
|
| 146 |
Context: {context}
|
| 147 |
|
|
|
|
| 150 |
Answer with sources: """
|
| 151 |
)
|
| 152 |
|
| 153 |
+
# Function to summarize content
|
| 154 |
def summarize_content(content, llm):
|
| 155 |
summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
|
| 156 |
|
|
|
|
| 166 |
def load_faiss_index(path, embeddings):
    """Reload a FAISS index previously written by save_faiss_index.

    NOTE(review): allow_dangerous_deserialization opts into pickle-based
    loading — only point *path* at indexes this app wrote itself.
    """
    store = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
    return store
|
| 168 |
|
| 169 |
+
# Function to extract YouTube video ID from URL
|
| 170 |
+
def get_youtube_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supports the common URL shapes:
      - https://www.youtube.com/watch?v=VIDEO_ID (extra query params allowed)
      - https://youtu.be/VIDEO_ID
      - https://www.youtube.com/embed/VIDEO_ID, /shorts/VIDEO_ID, /v/VIDEO_ID

    Returns the video ID string, or None when no ID can be found.
    """
    from urllib.parse import urlparse, parse_qs

    parsed = urlparse(url)
    # Tolerate scheme-less input ("www.youtube.com/...") like the old
    # substring-based parser did.
    if not parsed.netloc:
        parsed = urlparse("https://" + url)
    host = parsed.netloc.lower()

    # Short-link form: the ID is the first path segment.
    if "youtu.be" in host:
        segment = parsed.path.lstrip("/").split("/")[0]
        return segment or None

    if "youtube.com" in host:
        # Standard watch URL: the ID lives in the ?v= query parameter.
        params = parse_qs(parsed.query)
        if params.get("v"):
            return params["v"][0]
        # Embed / shorts / legacy /v/ forms: ID follows the prefix segment.
        parts = [p for p in parsed.path.split("/") if p]
        if len(parts) >= 2 and parts[0] in ("embed", "shorts", "v"):
            return parts[1]

    return None
|
| 176 |
+
|
| 177 |
+
# Function to process content (web or YouTube)
|
| 178 |
+
def process_content(docs, embeddings):
    """Split *docs* into chunks, embed them, and persist a FAISS index.

    Shared pipeline for both web pages and YouTube transcripts.

    Side effects: writes status text to the Streamlit page, saves the index
    to ./faiss_index, and caches the vectorstore plus an index_created flag
    in st.session_state. Stops the script run when splitting yields nothing.
    """
    st.text("Text Splitter...Started...✅✅✅")
    # Split on paragraph, line, sentence, then clause boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    docs = text_splitter.split_documents(docs)

    if not docs:
        st.error("No document chunks created. Try a different URL or video.")
        st.stop()
    st.text(f"Split into {len(docs)} document chunks.")

    st.text("Embedding Vector Started Building...✅✅✅")
    vectorstore = FAISS.from_documents(docs, embeddings)

    faiss_index_path = "faiss_index"
    save_faiss_index(vectorstore, faiss_index_path)
    st.session_state.vectorstore = vectorstore  # Cache the vectorstore
    st.session_state.index_created = True
    st.text("FAISS index saved successfully! ✅✅✅")
    time.sleep(2)  # Brief pause so the success message stays visible
|
| 200 |
+
|
| 201 |
+
# Process Web URL
|
| 202 |
if process_url_clicked:
|
| 203 |
with main_container:
|
| 204 |
if not url.strip():
|
|
|
|
| 221 |
|
| 222 |
# Store content for summarization
|
| 223 |
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
| 224 |
+
embeddings = st.session_state.embeddings
|
| 225 |
+
process_content(data, embeddings)
|
| 226 |
+
except Exception as e:
|
| 227 |
+
st.error(f"Error processing URL: {str(e)}")
|
| 228 |
|
| 229 |
+
# Process YouTube Video: fetch a transcript, wrap it as a Document, and
# feed it through the same split/embed/index pipeline as web URLs.
if process_youtube_clicked:
    with main_container:
        if not youtube_url.strip():
            st.error("Please provide a valid YouTube URL.")
        else:
            with st.spinner("Processing YouTube Video..."):
                try:
                    video_id = get_youtube_video_id(youtube_url)
                    if not video_id:
                        st.error("Invalid YouTube URL. Please provide a URL like https://www.youtube.com/watch?v=VIDEO_ID.")
                        st.stop()

                    st.text("Fetching Transcript...Started...✅✅✅")
                    # Prefer English, fall back to Bengali — matches the
                    # app's bilingual QA prompt.
                    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'bn'])
                    transcript_text = " ".join([entry['text'] for entry in transcript])

                    if not transcript_text.strip():
                        st.error("No transcript available for this video. Try a different video.")
                        st.stop()

                    # Wrap the transcript as a Document so it flows through
                    # the shared pipeline; keep raw text for summarization.
                    doc = Document(page_content=transcript_text, metadata={"source": youtube_url})
                    st.session_state.url_content = transcript_text
                    embeddings = st.session_state.embeddings
                    process_content([doc], embeddings)
                # Consistency fix: like the generic handler below, show the
                # error without st.stop() so the rest of the page (summary /
                # Q&A controls) still renders after a failure.
                except TranscriptsDisabled:
                    st.error("Transcripts are disabled for this video. Try a different video.")
                except NoTranscriptFound:
                    st.error("No transcript found in the supported languages (English or Bengali). Try a different video.")
                except Exception as e:
                    st.error(f"Error processing YouTube video: {str(e)}")
|
| 263 |
|
| 264 |
# Summary button
|
| 265 |
with main_container:
|
|
|
|
| 270 |
# Display summary if generated
|
| 271 |
if st.session_state.summary:
|
| 272 |
with main_container:
|
| 273 |
+
st.header("Summary of the Content")
|
| 274 |
st.write(st.session_state.summary)
|
| 275 |
|
| 276 |
# Query input with Ask button
|
| 277 |
with main_container:
|
| 278 |
st.header("Ask a Question")
|
| 279 |
+
query = st.text_input("Question", placeholder="e.g., What is the video or article about?")
|
| 280 |
ask_clicked = st.button("Ask")
|
| 281 |
|
| 282 |
if ask_clicked and query:
|
| 283 |
with main_container:
|
| 284 |
if not st.session_state.index_created or st.session_state.vectorstore is None:
|
| 285 |
+
st.error("No content processed. Please process a URL or YouTube video first.")
|
| 286 |
else:
|
| 287 |
with st.spinner("Processing your question..."):
|
| 288 |
try:
|
| 289 |
chain = RetrievalQAWithSourcesChain.from_llm(
|
| 290 |
llm=llm,
|
| 291 |
+
retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 2}),
|
| 292 |
question_prompt=qa_prompt
|
| 293 |
)
|
| 294 |
result = chain({"question": query}, return_only_outputs=True)
|
| 295 |
|
| 296 |
if not result.get("answer"):
|
| 297 |
+
st.warning("No answer generated. Try a different question or content.")
|
| 298 |
st.stop()
|
| 299 |
|
| 300 |
st.header("Answer")
|