Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -4,12 +4,14 @@ from langchain_community.document_loaders import WebBaseLoader
|
|
| 4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 5 |
from langchain_community.vectorstores.faiss import FAISS
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
|
|
|
| 7 |
import os
|
| 8 |
import time
|
| 9 |
from langchain_groq import ChatGroq
|
| 10 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 11 |
from langchain.prompts import PromptTemplate
|
| 12 |
from bs4 import SoupStrainer
|
|
|
|
| 13 |
|
| 14 |
# Load environment variables (optional)
|
| 15 |
load_dotenv()
|
|
@@ -100,6 +102,8 @@ st.markdown("""
|
|
| 100 |
# Display large logo at the top of the main page
|
| 101 |
st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
|
| 102 |
|
|
|
|
|
|
|
| 103 |
|
| 104 |
# Initialize session state
|
| 105 |
if "index_created" not in st.session_state:
|
|
@@ -115,12 +119,16 @@ if "summary" not in st.session_state:
|
|
| 115 |
if "embeddings" not in st.session_state:
|
| 116 |
st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 117 |
|
| 118 |
-
# Sidebar for URL input
|
| 119 |
with st.sidebar:
|
| 120 |
st.header("Enter Web URL")
|
| 121 |
url = st.text_input("URL", placeholder="e.g., https://mahatirtusher.com/astronomy-mythology/")
|
| 122 |
process_url_clicked = st.button("Process URL")
|
| 123 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 124 |
# Main content container
|
| 125 |
main_container = st.container()
|
| 126 |
|
|
@@ -133,7 +141,7 @@ llm = ChatGroq(
|
|
| 133 |
|
| 134 |
# Custom prompt for detailed answers
|
| 135 |
qa_prompt = PromptTemplate(
|
| 136 |
-
template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own
|
| 137 |
|
| 138 |
Context: {context}
|
| 139 |
|
|
@@ -142,7 +150,7 @@ Question: {question}
|
|
| 142 |
Answer with sources: """
|
| 143 |
)
|
| 144 |
|
| 145 |
-
# Function to summarize
|
| 146 |
def summarize_content(content, llm):
|
| 147 |
summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
|
| 148 |
|
|
@@ -158,7 +166,39 @@ def save_faiss_index(vectorstore, path):
|
|
| 158 |
def load_faiss_index(path, embeddings):
|
| 159 |
return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
|
| 160 |
|
| 161 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
if process_url_clicked:
|
| 163 |
with main_container:
|
| 164 |
if not url.strip():
|
|
@@ -181,31 +221,45 @@ if process_url_clicked:
|
|
| 181 |
|
| 182 |
# Store content for summarization
|
| 183 |
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
|
| 188 |
-
|
| 189 |
-
|
| 190 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 191 |
|
| 192 |
-
|
| 193 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
st.stop()
|
| 195 |
-
st.text(f"Split into {len(docs)} document chunks.")
|
| 196 |
|
| 197 |
-
|
|
|
|
|
|
|
| 198 |
embeddings = st.session_state.embeddings
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
st.
|
| 205 |
-
st.
|
| 206 |
-
time.sleep(2)
|
| 207 |
except Exception as e:
|
| 208 |
-
st.error(f"Error processing
|
| 209 |
|
| 210 |
# Summary button
|
| 211 |
with main_container:
|
|
@@ -216,31 +270,31 @@ with main_container:
|
|
| 216 |
# Display summary if generated
|
| 217 |
if st.session_state.summary:
|
| 218 |
with main_container:
|
| 219 |
-
st.header("Summary of the
|
| 220 |
st.write(st.session_state.summary)
|
| 221 |
|
| 222 |
# Query input with Ask button
|
| 223 |
with main_container:
|
| 224 |
st.header("Ask a Question")
|
| 225 |
-
query = st.text_input("Question", placeholder="e.g., What is the article about?")
|
| 226 |
ask_clicked = st.button("Ask")
|
| 227 |
|
| 228 |
if ask_clicked and query:
|
| 229 |
with main_container:
|
| 230 |
if not st.session_state.index_created or st.session_state.vectorstore is None:
|
| 231 |
-
st.error("No
|
| 232 |
else:
|
| 233 |
with st.spinner("Processing your question..."):
|
| 234 |
try:
|
| 235 |
chain = RetrievalQAWithSourcesChain.from_llm(
|
| 236 |
llm=llm,
|
| 237 |
-
retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 2}),
|
| 238 |
question_prompt=qa_prompt
|
| 239 |
)
|
| 240 |
result = chain({"question": query}, return_only_outputs=True)
|
| 241 |
|
| 242 |
if not result.get("answer"):
|
| 243 |
-
st.warning("No answer generated. Try a different question or
|
| 244 |
st.stop()
|
| 245 |
|
| 246 |
st.header("Answer")
|
|
|
|
| 4 |
from langchain.embeddings import HuggingFaceEmbeddings
|
| 5 |
from langchain_community.vectorstores.faiss import FAISS
|
| 6 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 7 |
+
from langchain_core.documents import Document
|
| 8 |
import os
|
| 9 |
import time
|
| 10 |
from langchain_groq import ChatGroq
|
| 11 |
from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
|
| 12 |
from langchain.prompts import PromptTemplate
|
| 13 |
from bs4 import SoupStrainer
|
| 14 |
+
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
| 15 |
|
| 16 |
# Load environment variables (optional)
|
| 17 |
load_dotenv()
|
|
|
|
| 102 |
# Display large logo at the top of the main page
|
| 103 |
st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
|
| 104 |
|
| 105 |
+
# Set Streamlit app title
|
| 106 |
+
st.title("WebChatter π¬")
|
| 107 |
|
| 108 |
# Initialize session state
|
| 109 |
if "index_created" not in st.session_state:
|
|
|
|
| 119 |
if "embeddings" not in st.session_state:
|
| 120 |
st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
|
| 121 |
|
| 122 |
+
# Sidebar for URL and YouTube input
|
| 123 |
with st.sidebar:
|
| 124 |
st.header("Enter Web URL")
|
| 125 |
url = st.text_input("URL", placeholder="e.g., https://mahatirtusher.com/astronomy-mythology/")
|
| 126 |
process_url_clicked = st.button("Process URL")
|
| 127 |
|
| 128 |
+
st.header("Enter YouTube URL")
|
| 129 |
+
youtube_url = st.text_input("YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ")
|
| 130 |
+
process_youtube_clicked = st.button("Process YouTube Video")
|
| 131 |
+
|
| 132 |
# Main content container
|
| 133 |
main_container = st.container()
|
| 134 |
|
|
|
|
| 141 |
|
| 142 |
# Custom prompt for detailed answers
|
| 143 |
qa_prompt = PromptTemplate(
|
| 144 |
+
template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
|
| 145 |
|
| 146 |
Context: {context}
|
| 147 |
|
|
|
|
| 150 |
Answer with sources: """
|
| 151 |
)
|
| 152 |
|
| 153 |
+
# Function to summarize content
|
| 154 |
def summarize_content(content, llm):
|
| 155 |
summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
|
| 156 |
|
|
|
|
| 166 |
def load_faiss_index(path, embeddings):
    """Reload a FAISS index previously written by save_faiss_index.

    NOTE(review): allow_dangerous_deserialization opts into pickle-based
    loading — only point *path* at indexes this app wrote itself.
    """
    store = FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
    return store
|
| 168 |
|
| 169 |
+
# Function to extract YouTube video ID from URL
|
| 170 |
+
def get_youtube_video_id(url):
    """Extract the video ID from a YouTube URL.

    Supports the common URL shapes:
      - https://www.youtube.com/watch?v=VIDEO_ID (extra query params allowed)
      - https://youtu.be/VIDEO_ID
      - https://www.youtube.com/embed/VIDEO_ID, /shorts/VIDEO_ID, /v/VIDEO_ID

    Returns the video ID string, or None when no ID can be found.
    """
    from urllib.parse import urlparse, parse_qs

    parsed = urlparse(url)
    # Tolerate scheme-less input ("www.youtube.com/...") like the old
    # substring-based parser did.
    if not parsed.netloc:
        parsed = urlparse("https://" + url)
    host = parsed.netloc.lower()

    # Short-link form: the ID is the first path segment.
    if "youtu.be" in host:
        segment = parsed.path.lstrip("/").split("/")[0]
        return segment or None

    if "youtube.com" in host:
        # Standard watch URL: the ID lives in the ?v= query parameter.
        params = parse_qs(parsed.query)
        if params.get("v"):
            return params["v"][0]
        # Embed / shorts / legacy /v/ forms: ID follows the prefix segment.
        parts = [p for p in parsed.path.split("/") if p]
        if len(parts) >= 2 and parts[0] in ("embed", "shorts", "v"):
            return parts[1]

    return None
|
| 176 |
+
|
| 177 |
+
# Function to process content (web or YouTube)
|
| 178 |
+
def process_content(docs, embeddings):
    """Split *docs* into chunks, embed them, and persist a FAISS index.

    Shared pipeline for both web pages and YouTube transcripts.

    Side effects: writes status text to the Streamlit page, saves the index
    to ./faiss_index, and caches the vectorstore plus an index_created flag
    in st.session_state. Stops the script run when splitting yields nothing.
    """
    st.text("Text Splitter...Started...✅✅✅")
    # Split on paragraph, line, sentence, then clause boundaries.
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    docs = text_splitter.split_documents(docs)

    if not docs:
        st.error("No document chunks created. Try a different URL or video.")
        st.stop()
    st.text(f"Split into {len(docs)} document chunks.")

    st.text("Embedding Vector Started Building...✅✅✅")
    vectorstore = FAISS.from_documents(docs, embeddings)

    faiss_index_path = "faiss_index"
    save_faiss_index(vectorstore, faiss_index_path)
    st.session_state.vectorstore = vectorstore  # Cache the vectorstore
    st.session_state.index_created = True
    st.text("FAISS index saved successfully! ✅✅✅")
    time.sleep(2)  # Brief pause so the success message stays visible
|
| 200 |
+
|
| 201 |
+
# Process Web URL
|
| 202 |
if process_url_clicked:
|
| 203 |
with main_container:
|
| 204 |
if not url.strip():
|
|
|
|
| 221 |
|
| 222 |
# Store content for summarization
|
| 223 |
st.session_state.url_content = "\n".join([doc.page_content for doc in data])
|
| 224 |
+
embeddings = st.session_state.embeddings
|
| 225 |
+
process_content(data, embeddings)
|
| 226 |
+
except Exception as e:
|
| 227 |
+
st.error(f"Error processing URL: {str(e)}")
|
| 228 |
|
| 229 |
+
# Process YouTube Video: fetch a transcript, wrap it as a Document, and
# feed it through the same split/embed/index pipeline as web URLs.
if process_youtube_clicked:
    with main_container:
        if not youtube_url.strip():
            st.error("Please provide a valid YouTube URL.")
        else:
            with st.spinner("Processing YouTube Video..."):
                try:
                    video_id = get_youtube_video_id(youtube_url)
                    if not video_id:
                        st.error("Invalid YouTube URL. Please provide a URL like https://www.youtube.com/watch?v=VIDEO_ID.")
                        st.stop()

                    st.text("Fetching Transcript...Started...✅✅✅")
                    # Prefer English, fall back to Bengali — matches the
                    # app's bilingual QA prompt.
                    transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en', 'bn'])
                    transcript_text = " ".join([entry['text'] for entry in transcript])

                    if not transcript_text.strip():
                        st.error("No transcript available for this video. Try a different video.")
                        st.stop()

                    # Wrap the transcript as a Document so it flows through
                    # the shared pipeline; keep raw text for summarization.
                    doc = Document(page_content=transcript_text, metadata={"source": youtube_url})
                    st.session_state.url_content = transcript_text
                    embeddings = st.session_state.embeddings
                    process_content([doc], embeddings)
                # Consistency fix: like the generic handler below, show the
                # error without st.stop() so the rest of the page (summary /
                # Q&A controls) still renders after a failure.
                except TranscriptsDisabled:
                    st.error("Transcripts are disabled for this video. Try a different video.")
                except NoTranscriptFound:
                    st.error("No transcript found in the supported languages (English or Bengali). Try a different video.")
                except Exception as e:
                    st.error(f"Error processing YouTube video: {str(e)}")
|
| 263 |
|
| 264 |
# Summary button
|
| 265 |
with main_container:
|
|
|
|
| 270 |
# Display summary if generated
|
| 271 |
if st.session_state.summary:
|
| 272 |
with main_container:
|
| 273 |
+
st.header("Summary of the Content")
|
| 274 |
st.write(st.session_state.summary)
|
| 275 |
|
| 276 |
# Query input with Ask button
|
| 277 |
with main_container:
|
| 278 |
st.header("Ask a Question")
|
| 279 |
+
query = st.text_input("Question", placeholder="e.g., What is the video or article about?")
|
| 280 |
ask_clicked = st.button("Ask")
|
| 281 |
|
| 282 |
if ask_clicked and query:
|
| 283 |
with main_container:
|
| 284 |
if not st.session_state.index_created or st.session_state.vectorstore is None:
|
| 285 |
+
st.error("No content processed. Please process a URL or YouTube video first.")
|
| 286 |
else:
|
| 287 |
with st.spinner("Processing your question..."):
|
| 288 |
try:
|
| 289 |
chain = RetrievalQAWithSourcesChain.from_llm(
|
| 290 |
llm=llm,
|
| 291 |
+
retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 2}),
|
| 292 |
question_prompt=qa_prompt
|
| 293 |
)
|
| 294 |
result = chain({"question": query}, return_only_outputs=True)
|
| 295 |
|
| 296 |
if not result.get("answer"):
|
| 297 |
+
st.warning("No answer generated. Try a different question or content.")
|
| 298 |
st.stop()
|
| 299 |
|
| 300 |
st.header("Answer")
|