MahatirTusher committed on
Commit
fa9a363
Β·
verified Β·
1 Parent(s): 9352b95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -94
app.py CHANGED
@@ -14,6 +14,7 @@ from bs4 import SoupStrainer
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
 
17
 
18
  # Load environment variables (optional)
19
  load_dotenv()
@@ -104,6 +105,8 @@ st.markdown("""
104
  # Display large logo at the top of the main page
105
  st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
106
 
 
 
107
 
108
  # Initialize session state
109
  if "index_created" not in st.session_state:
@@ -119,6 +122,14 @@ if "summary" not in st.session_state:
119
  if "embeddings" not in st.session_state:
120
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
121
 
 
 
 
 
 
 
 
 
122
  # Sidebar for URL and YouTube input
123
  with st.sidebar:
124
  st.header("Enter Web URL")
@@ -126,19 +137,12 @@ with st.sidebar:
126
  process_url_clicked = st.button("Process URL")
127
 
128
  st.header("Enter YouTube URL")
129
- youtube_url = st.text_input("YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=pxiP-HJLCx0")
130
  process_youtube_clicked = st.button("Process YouTube Video")
131
 
132
  # Main content container
133
  main_container = st.container()
134
 
135
- # Initialize the Groq LLM
136
- llm = ChatGroq(
137
- api_key=GROQ_API_KEY,
138
- model="llama3-70b-8192",
139
- max_tokens=2048 # Increased for detailed answers
140
- )
141
-
142
  # Custom prompt for detailed answers
143
  qa_prompt = PromptTemplate(
144
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
@@ -160,52 +164,73 @@ Summary: """
160
  summary = llm.invoke(summary_prompt).content
161
  return summary
162
 
163
- def save_faiss_index(vectorstore, path):
164
- vectorstore.save_local(path)
165
-
166
- def load_faiss_index(path, embeddings):
167
- return FAISS.load_local(path, embeddings, allow_dangerous_deserialization=True)
168
-
169
  # Function to extract YouTube video ID from URL
170
- def get_youtube_video_id(url):
171
  if "youtube.com/watch?v=" in url:
172
  return url.split("v=")[1].split("&")[0]
173
  elif "youtu.be/" in url:
174
  return url.split("youtu.be/")[1].split("?")[0]
175
  return None
176
 
177
- # Function to extract subtitles using yt-dlp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
178
  def extract_subtitles_with_ytdlp(video_url):
179
  ydl_opts = {
180
  'writesubtitles': True,
181
  'writeautomaticsub': True,
182
- 'subtitleslangs': ['all'], # Fetch subtitles in all available languages
183
  'skip_download': True,
184
  'subtitlesformat': 'vtt',
185
  'outtmpl': 'subtitle.%(ext)s',
 
 
 
 
 
 
186
  }
187
  try:
188
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
189
  info = ydl.extract_info(video_url, download=False)
190
  available_subs = info.get('subtitles', {})
191
  auto_subs = info.get('automatic_captions', {})
192
-
193
  # Log available subtitles for debugging
194
  st.text(f"Available subtitles: {list(available_subs.keys())}")
195
  st.text(f"Available auto-captions: {list(auto_subs.keys())}")
196
 
197
  # Download the first available subtitle or auto-caption
198
- ydl.params['subtitleslangs'] = list(available_subs.keys()) or list(auto_subs.keys()) or ['en']
 
 
 
 
199
  ydl.download([video_url])
200
 
201
  # Look for the subtitle file
202
  subtitle_file = None
203
- for lang in available_subs.keys() or auto_subs.keys():
204
  possible_file = f"subtitle.{lang}.vtt"
205
  if os.path.exists(possible_file):
206
  subtitle_file = possible_file
207
  break
208
-
209
  if not subtitle_file:
210
  return None
211
 
@@ -220,7 +245,6 @@ def extract_subtitles_with_ytdlp(video_url):
220
  lines = subtitle_text.split('\n')
221
  text_lines = []
222
  for line in lines:
223
- # Skip WEBVTT header, timestamps, and empty lines
224
  if line.strip() and not line.startswith('WEBVTT') and not line.startswith('Kind:') and not line.startswith('Language:') and not re.match(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', line):
225
  text_lines.append(line.strip())
226
 
@@ -229,29 +253,27 @@ def extract_subtitles_with_ytdlp(video_url):
229
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
230
  return None
231
 
232
- # Function to process content (web or YouTube)
233
- def process_content(docs, embeddings):
234
- st.text("Text Splitter...Started...βœ…βœ…βœ…")
235
  text_splitter = RecursiveCharacterTextSplitter(
236
- separators=['\n\n', '\n', '.', ','],
237
- chunk_size=1000
 
238
  )
239
- docs = text_splitter.split_documents(docs)
240
-
241
- if not docs:
242
- st.error("No document chunks created. Try a different URL or video.")
243
- st.stop()
244
- st.text(f"Split into {len(docs)} document chunks.")
245
-
246
- st.text("Embedding Vector Started Building...βœ…βœ…βœ…")
247
  vectorstore = FAISS.from_documents(docs, embeddings)
248
-
249
- faiss_index_path = "faiss_index"
250
- save_faiss_index(vectorstore, faiss_index_path)
251
- st.session_state.vectorstore = vectorstore # Cache the vectorstore
252
- st.session_state.index_created = True
253
- st.text("FAISS index saved successfully! βœ…βœ…βœ…")
254
- time.sleep(2)
 
 
 
 
 
255
 
256
  # Process Web URL
257
  if process_url_clicked:
@@ -277,7 +299,9 @@ if process_url_clicked:
277
  # Store content for summarization
278
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
279
  embeddings = st.session_state.embeddings
280
- process_content(data, embeddings)
 
 
281
  except Exception as e:
282
  st.error(f"Error processing URL: {str(e)}")
283
 
@@ -289,52 +313,17 @@ if process_youtube_clicked:
289
  else:
290
  with st.spinner("Processing YouTube Video..."):
291
  try:
292
- video_id = get_youtube_video_id(youtube_url)
293
  if not video_id:
294
  st.error("Invalid YouTube URL. Please provide a URL like https://www.youtube.com/watch?v=VIDEO_ID.")
295
  st.stop()
296
 
297
  transcript_text = None
298
  st.text("Fetching Transcript...Started...βœ…βœ…βœ…")
299
- try:
300
- # Get the list of available transcripts
301
- transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
302
-
303
- # Log available transcripts for debugging
304
- available_languages = [t.language for t in transcript_list]
305
- st.text(f"Available transcript languages: {available_languages}")
306
-
307
- transcript = None
308
- # Try to find a manually created transcript in any language
309
- for lang in available_languages:
310
- try:
311
- transcript = transcript_list.find_manually_created_transcript([lang])
312
- break
313
- except NoTranscriptFound:
314
- continue
315
-
316
- # If no manual transcript, try an auto-generated one
317
- if not transcript:
318
- for lang in available_languages:
319
- try:
320
- transcript = transcript_list.find_generated_transcript([lang])
321
- break
322
- except NoTranscriptFound:
323
- continue
324
-
325
- # If a transcript is found and it's not in English, translate to English
326
- if transcript:
327
- if transcript.language_code != 'en' and transcript.is_translatable:
328
- transcript = transcript.translate('en')
329
- transcript_data = transcript.fetch()
330
- transcript_text = " ".join([entry['text'] for entry in transcript_data])
331
- except TranscriptsDisabled:
332
- st.warning("Transcripts are disabled for this video. Attempting to fetch closed captions...")
333
- except NoTranscriptFound:
334
- st.warning("No transcript found in any language. Attempting to fetch closed captions...")
335
-
336
- # Fallback to yt-dlp for closed captions
337
  if not transcript_text:
 
338
  st.text("Fetching Closed Captions...Started...βœ…βœ…βœ…")
339
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
340
  if not transcript_text:
@@ -345,11 +334,12 @@ if process_youtube_clicked:
345
  st.error("Transcript or captions are empty. Try a different video.")
346
  st.stop()
347
 
348
- # Create a Document object from the transcript
349
- doc = Document(page_content=transcript_text, metadata={"source": youtube_url})
350
- st.session_state.url_content = transcript_text # Store for summarization
351
  embeddings = st.session_state.embeddings
352
- process_content([doc], embeddings)
 
 
353
  except Exception as e:
354
  st.error(f"Error processing YouTube video: {str(e)}")
355
 
@@ -357,7 +347,7 @@ if process_youtube_clicked:
357
  with main_container:
358
  if st.session_state.url_content and st.button("Generate Summary"):
359
  with st.spinner("Generating summary..."):
360
- st.session_state.summary = summarize_content(st.session_state.url_content, llm)
361
 
362
  # Display summary if generated
363
  if st.session_state.summary:
@@ -378,12 +368,10 @@ if ask_clicked and query:
378
  else:
379
  with st.spinner("Processing your question..."):
380
  try:
381
- chain = RetrievalQAWithSourcesChain.from_llm(
382
- llm=llm,
383
- retriever=st.session_state.vectorstore.as_retriever(search_kwargs={"k": 2}),
384
- question_prompt=qa_prompt
385
- )
386
- result = chain({"question": query}, return_only_outputs=True)
387
 
388
  if not result.get("answer"):
389
  st.warning("No answer generated. Try a different question or content.")
@@ -407,7 +395,7 @@ if ask_clicked and query:
407
  st.markdown(
408
  """
409
  <div class="footer">
410
- <img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="130">
411
  WebChatter Β© 2025 | Developed by Mahatir Ahmed Tusher
412
  </div>
413
  """,
 
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
17
+ import requests
18
 
19
  # Load environment variables (optional)
20
  load_dotenv()
 
105
  # Display large logo at the top of the main page
106
  st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
107
 
108
+ # Set Streamlit app title
109
+ st.title("WebChatter πŸ’¬")
110
 
111
  # Initialize session state
112
  if "index_created" not in st.session_state:
 
122
  if "embeddings" not in st.session_state:
123
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
124
 
125
+ # Initialize LLM once at the start
126
+ if "llm" not in st.session_state:
127
+ st.session_state.llm = ChatGroq(
128
+ api_key=GROQ_API_KEY,
129
+ model="llama3-70b-8192",
130
+ max_tokens=2048
131
+ )
132
+
133
  # Sidebar for URL and YouTube input
134
  with st.sidebar:
135
  st.header("Enter Web URL")
 
137
  process_url_clicked = st.button("Process URL")
138
 
139
  st.header("Enter YouTube URL")
140
+ youtube_url = st.text_input("YouTube URL", placeholder="e.g., https://www.youtube.com/watch?v=DJO_9auJhJQ")
141
  process_youtube_clicked = st.button("Process YouTube Video")
142
 
143
  # Main content container
144
  main_container = st.container()
145
 
 
 
 
 
 
 
 
146
  # Custom prompt for detailed answers
147
  qa_prompt = PromptTemplate(
148
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
 
164
  summary = llm.invoke(summary_prompt).content
165
  return summary
166
 
 
 
 
 
 
 
167
  # Function to extract YouTube video ID from URL
168
def get_video_id(url):
    """Extract the 11-character video ID from a YouTube URL.

    Supports standard ``watch?v=`` URLs and short ``youtu.be`` links
    (original behavior, preserved byte-for-byte), and additionally
    ``/shorts/`` and ``/embed/`` paths.

    Args:
        url: The YouTube URL entered by the user.

    Returns:
        The video ID string, or None if no ID could be extracted.
    """
    if "youtube.com/watch?v=" in url:
        return url.split("v=")[1].split("&")[0]
    elif "youtu.be/" in url:
        return url.split("youtu.be/")[1].split("?")[0]
    # Generalization: also accept Shorts and embed URLs. Canonical video
    # IDs are exactly 11 characters from [A-Za-z0-9_-].
    match = re.search(r"youtube\.com/(?:shorts|embed)/([A-Za-z0-9_-]{11})", url)
    if match:
        return match.group(1)
    return None
174
 
175
+ # Function to fetch YouTube transcript (ChatGPT-inspired)
176
def fetch_youtube_transcript(video_id):
    """Return the full transcript text for a YouTube video, or None.

    Prefers an English transcript; when none exists, translates the
    first translatable transcript to English.

    Args:
        video_id: The 11-character YouTube video ID.

    Returns:
        The joined transcript text, or None when transcripts are
        disabled or no usable transcript could be found.
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
    except TranscriptsDisabled:
        return None
    try:
        transcript = transcript_list.find_transcript(['en']).fetch()
        return " ".join(item['text'] for item in transcript)
    except NoTranscriptFound:
        # No English track: fall back to translating another language.
        for candidate in transcript_list:
            if candidate.is_translatable:
                # Bug fix: translation/fetch can fail per-track (e.g. the
                # track is listed as translatable but the request errors);
                # previously one failure crashed the whole fallback.
                try:
                    translated = candidate.translate('en').fetch()
                except Exception:
                    continue  # try the next available track
                return " ".join(item['text'] for item in translated)
        return None
191
+
192
+ # Function to extract subtitles using yt-dlp with bot detection bypass
193
  def extract_subtitles_with_ytdlp(video_url):
194
  ydl_opts = {
195
  'writesubtitles': True,
196
  'writeautomaticsub': True,
197
+ 'subtitleslangs': ['all'],
198
  'skip_download': True,
199
  'subtitlesformat': 'vtt',
200
  'outtmpl': 'subtitle.%(ext)s',
201
+ 'http_headers': {
202
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
203
+ 'Accept-Language': 'en-US,en;q=0.9',
204
+ },
205
+ 'retries': 3,
206
+ 'retry_sleep': 5,
207
  }
208
  try:
209
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
210
  info = ydl.extract_info(video_url, download=False)
211
  available_subs = info.get('subtitles', {})
212
  auto_subs = info.get('automatic_captions', {})
213
+
214
  # Log available subtitles for debugging
215
  st.text(f"Available subtitles: {list(available_subs.keys())}")
216
  st.text(f"Available auto-captions: {list(auto_subs.keys())}")
217
 
218
  # Download the first available subtitle or auto-caption
219
+ subtitle_langs = list(available_subs.keys()) or list(auto_subs.keys())
220
+ if not subtitle_langs:
221
+ return None
222
+
223
+ ydl.params['subtitleslangs'] = subtitle_langs
224
  ydl.download([video_url])
225
 
226
  # Look for the subtitle file
227
  subtitle_file = None
228
+ for lang in subtitle_langs:
229
  possible_file = f"subtitle.{lang}.vtt"
230
  if os.path.exists(possible_file):
231
  subtitle_file = possible_file
232
  break
233
+
234
  if not subtitle_file:
235
  return None
236
 
 
245
  lines = subtitle_text.split('\n')
246
  text_lines = []
247
  for line in lines:
 
248
  if line.strip() and not line.startswith('WEBVTT') and not line.startswith('Kind:') and not line.startswith('Language:') and not re.match(r'\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}', line):
249
  text_lines.append(line.strip())
250
 
 
253
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
254
  return None
255
 
256
+ # Function to process and chunk text (web or YouTube)
257
def process_content(text, embeddings):
    """Split raw text into chunks and build a FAISS vector index.

    Args:
        text: The full document text (web page content or transcript).
        embeddings: A LangChain embeddings object used to vectorize chunks.

    Returns:
        A FAISS vectorstore built over the chunked text.

    Raises:
        ValueError: If no chunks could be created from the input text
            (e.g. empty or whitespace-only input).
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=1000,
        chunk_overlap=200,
        separators=["\n\n", "\n", ".", " ", ""]
    )
    docs = text_splitter.create_documents([text])
    if not docs:
        # Guard: FAISS.from_documents fails opaquely on an empty list;
        # surface a clear error instead (callers wrap this in try/except
        # and show it via st.error).
        raise ValueError("No document chunks created from the provided text.")
    vectorstore = FAISS.from_documents(docs, embeddings)
    return vectorstore
266
+
267
+ # Function to create QA chain
268
def create_qa_chain(vectorstore, llm):
    """Build a retrieval QA chain (with source attribution) over the index.

    Uses the module-level ``qa_prompt`` with a "stuff" documents chain and
    a top-2 similarity retriever.

    Args:
        vectorstore: The FAISS vectorstore to retrieve context from.
        llm: The chat model that answers questions.

    Returns:
        A configured RetrievalQAWithSourcesChain.
    """
    return RetrievalQAWithSourcesChain.from_chain_type(
        llm=llm,
        retriever=vectorstore.as_retriever(search_kwargs={"k": 2}),
        chain_type="stuff",
        chain_type_kwargs={"prompt": qa_prompt},
    )
277
 
278
  # Process Web URL
279
  if process_url_clicked:
 
299
  # Store content for summarization
300
  st.session_state.url_content = "\n".join([doc.page_content for doc in data])
301
  embeddings = st.session_state.embeddings
302
+ st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings)
303
+ st.session_state.index_created = True
304
+ st.text("Content processed successfully! βœ…βœ…βœ…")
305
  except Exception as e:
306
  st.error(f"Error processing URL: {str(e)}")
307
 
 
313
  else:
314
  with st.spinner("Processing YouTube Video..."):
315
  try:
316
+ video_id = get_video_id(youtube_url)
317
  if not video_id:
318
  st.error("Invalid YouTube URL. Please provide a URL like https://www.youtube.com/watch?v=VIDEO_ID.")
319
  st.stop()
320
 
321
  transcript_text = None
322
  st.text("Fetching Transcript...Started...βœ…βœ…βœ…")
323
+ transcript_text = fetch_youtube_transcript(video_id)
324
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
325
  if not transcript_text:
326
+ st.warning("Transcripts are disabled or unavailable. Attempting to fetch closed captions...")
327
  st.text("Fetching Closed Captions...Started...βœ…βœ…βœ…")
328
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
329
  if not transcript_text:
 
334
  st.error("Transcript or captions are empty. Try a different video.")
335
  st.stop()
336
 
337
+ # Process the transcript
338
+ st.session_state.url_content = transcript_text
 
339
  embeddings = st.session_state.embeddings
340
+ st.session_state.vectorstore = process_content(transcript_text, embeddings)
341
+ st.session_state.index_created = True
342
+ st.text("YouTube video processed successfully! βœ…βœ…βœ…")
343
  except Exception as e:
344
  st.error(f"Error processing YouTube video: {str(e)}")
345
 
 
347
  with main_container:
348
  if st.session_state.url_content and st.button("Generate Summary"):
349
  with st.spinner("Generating summary..."):
350
+ st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
351
 
352
  # Display summary if generated
353
  if st.session_state.summary:
 
368
  else:
369
  with st.spinner("Processing your question..."):
370
  try:
371
+ if "qa_chain" not in st.session_state:
372
+ st.session_state.qa_chain = create_qa_chain(st.session_state.vectorstore, st.session_state.llm)
373
+
374
+ result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
 
 
375
 
376
  if not result.get("answer"):
377
  st.warning("No answer generated. Try a different question or content.")
 
395
  st.markdown(
396
  """
397
  <div class="footer">
398
+ <img src="https://i.postimg.cc/2j0QWF3Z/Removal-575.png" width="80">
399
  WebChatter Β© 2025 | Developed by Mahatir Ahmed Tusher
400
  </div>
401
  """,