MahatirTusher committed on
Commit
3f006bc
·
verified ·
1 Parent(s): c3e31e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -101
app.py CHANGED
@@ -6,7 +6,6 @@ from langchain_community.vectorstores.faiss import FAISS
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import os
9
- import time
10
  from langchain_groq import ChatGroq
11
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
12
  from langchain.prompts import PromptTemplate
@@ -14,16 +13,12 @@ from bs4 import SoupStrainer
14
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
15
  import yt_dlp
16
  import re
17
- from googleapiclient.discovery import build
18
- from googleapiclient.errors import HttpError
19
 
20
  # Load environment variables (optional)
21
  load_dotenv()
22
 
23
  # Hardcoded Groq API key
24
  GROQ_API_KEY = "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"
25
- # YouTube API key (to be set in Hugging Face Spaces secrets)
26
- YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
27
 
28
  # Custom CSS
29
  st.markdown("""
@@ -112,18 +107,16 @@ st.image("https://i.postimg.cc/2j0QWF3Z/Removal-575.png", width=390)
112
  st.title("WebChatter 💬")
113
 
114
  # Initialize session state
115
- if "index_created" not in st.session_state:
116
- st.session_state.index_created = False
117
  if "url_content" not in st.session_state:
118
  st.session_state.url_content = None
119
- if "vectorstore" not in st.session_state:
120
- st.session_state.vectorstore = None
121
  if "summary" not in st.session_state:
122
  st.session_state.summary = None
123
- if "qa_chain" not in st.session_state:
124
- st.session_state.qa_chain = None
 
 
125
 
126
- # Initialize embeddings once at the start
127
  if "embeddings" not in st.session_state:
128
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
129
 
@@ -132,23 +125,9 @@ if "llm" not in st.session_state:
132
  st.session_state.llm = ChatGroq(
133
  api_key=GROQ_API_KEY,
134
  model="llama3-70b-8192",
135
- max_tokens=2048
136
  )
137
 
138
- # Custom logger for yt-dlp to redirect logs to Streamlit
139
- class StreamlitLogger:
140
- def debug(self, msg):
141
- st.text(f"[yt-dlp DEBUG] {msg}")
142
-
143
- def info(self, msg):
144
- st.info(f"[yt-dlp INFO] {msg}")
145
-
146
- def warning(self, msg):
147
- st.warning(f"[yt-dlp WARNING] {msg}")
148
-
149
- def error(self, msg):
150
- st.error(f"[yt-dlp ERROR] {msg}")
151
-
152
  # Sidebar for URL and YouTube input
153
  with st.sidebar:
154
  st.header("Enter Web URL")
@@ -162,7 +141,7 @@ with st.sidebar:
162
  # Main content container
163
  main_container = st.container()
164
 
165
- # Custom prompt for detailed answers
166
  qa_prompt = PromptTemplate(
167
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
168
 
@@ -174,8 +153,17 @@ Answer with sources: """
174
  )
175
 
176
  # Function to summarize content
177
- def summarize_content(content, llm):
178
- summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
 
 
 
 
 
 
 
 
 
179
 
180
  {content}
181
 
@@ -207,51 +195,12 @@ def fetch_youtube_transcript(video_id):
207
  return " ".join([item['text'] for item in translated_transcript])
208
  return None
209
 
210
- # Function to fetch captions using YouTube Data API (limited to listing with API key)
211
- def fetch_youtube_captions_api(video_id, api_key):
212
- if not api_key:
213
- st.warning("YOUTUBE_API_KEY not set. Skipping YouTube Data API fallback.")
214
- return None
215
- try:
216
- youtube = build('youtube', 'v3', developerKey=api_key)
217
- captions = youtube.captions().list(
218
- part='snippet',
219
- videoId=video_id
220
- ).execute()
221
-
222
- caption_id = None
223
- for item in captions.get('items', []):
224
- if item['snippet']['language'] == 'en':
225
- caption_id = item['id']
226
- break
227
- elif item['snippet']['language'] in ['en-US', 'en-GB']:
228
- caption_id = item['id']
229
- break
230
-
231
- if not caption_id:
232
- st.warning("No English captions found via YouTube Data API.")
233
- return None
234
-
235
- # Note: Downloading captions requires OAuth 2.0 authentication
236
- st.warning(
237
- "English captions are available for this video but cannot be fetched with an API key alone. "
238
- "Downloading captions requires OAuth 2.0 authentication, which is not supported in Hugging Face Spaces without user interaction. "
239
- "To fetch captions:\n"
240
- "- Test locally with OAuth 2.0 setup (see instructions in the documentation).\n"
241
- "- Or try a video with transcripts available (e.g., https://www.youtube.com/watch?v=dQw4w9WgXcQ)."
242
- )
243
- return None
244
-
245
- except HttpError as e:
246
- st.error(f"Error fetching captions with YouTube Data API: {str(e)}")
247
- return None
248
-
249
  # Function to extract subtitles using yt-dlp with cookies
250
  def extract_subtitles_with_ytdlp(video_url):
251
  ydl_opts = {
252
  'writesubtitles': True,
253
  'writeautomaticsub': True,
254
- 'subtitleslangs': ['all', '-live_chat'], # Match previous version, exclude live chat
255
  'skip_download': True,
256
  'subtitlesformat': 'vtt',
257
  'outtmpl': 'subtitle.%(ext)s',
@@ -260,8 +209,6 @@ def extract_subtitles_with_ytdlp(video_url):
260
  'Accept-Language': 'en-US,en;q=0.9',
261
  },
262
  'cookiefile': 'cookies.txt',
263
- 'retries': 3,
264
- 'retry_sleep': 5,
265
  }
266
  try:
267
  if not os.path.exists('cookies.txt'):
@@ -276,7 +223,6 @@ def extract_subtitles_with_ytdlp(video_url):
276
  return None
277
 
278
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
279
- ydl.params['logger'] = StreamlitLogger()
280
  info = ydl.extract_info(video_url, download=False)
281
  available_subs = info.get('subtitles', {})
282
  auto_subs = info.get('automatic_captions', {})
@@ -317,7 +263,7 @@ def extract_subtitles_with_ytdlp(video_url):
317
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
318
  return None
319
 
320
- # Function to process and chunk text (web or YouTube) with source metadata
321
  def process_content(text, embeddings, source):
322
  text_splitter = RecursiveCharacterTextSplitter(
323
  chunk_size=1000,
@@ -325,16 +271,13 @@ def process_content(text, embeddings, source):
325
  separators=["\n\n", "\n", ".", " ", ""]
326
  )
327
  docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
328
- if docs:
329
- st.text(f"Document metadata: {docs[0].metadata}")
330
- else:
331
  st.error("No documents created from the content.")
332
  return None
333
  vectorstore = FAISS.from_documents(docs, embeddings)
334
- st.text(f"Vector store created with {len(docs)} documents.")
335
  return vectorstore
336
 
337
- # Function to create QA chain
338
  def create_qa_chain(vectorstore, llm):
339
  if vectorstore is None:
340
  st.error("Vector store is not initialized. Cannot create QA chain.")
@@ -349,7 +292,6 @@ def create_qa_chain(vectorstore, llm):
349
  "document_variable_name": "context"
350
  }
351
  )
352
- st.text("QA chain created successfully.")
353
  return qa_chain
354
 
355
  # Process Web URL
@@ -376,10 +318,11 @@ if process_url_clicked:
376
  embeddings = st.session_state.embeddings
377
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
378
  st.session_state.index_created = True
379
- st.session_state.qa_chain = None
380
  st.text("Content processed successfully! ✅✅✅")
381
  except Exception as e:
382
  st.error(f"Error processing URL: {str(e)}")
 
383
 
384
  # Process YouTube Video
385
  if process_youtube_clicked:
@@ -403,10 +346,6 @@ if process_youtube_clicked:
403
  st.text("Fetching Closed Captions...Started...✅✅✅")
404
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
405
 
406
- if not transcript_text and YOUTUBE_API_KEY:
407
- st.text("Fetching Captions via YouTube Data API...Started...✅✅✅")
408
- transcript_text = fetch_youtube_captions_api(video_id, YOUTUBE_API_KEY)
409
-
410
  if not transcript_text:
411
  st.error(
412
  "No transcripts or closed captions available. "
@@ -426,19 +365,23 @@ if process_youtube_clicked:
426
  st.stop()
427
 
428
  st.session_state.url_content = transcript_text
429
- embeddings = st.session_state.embeddings
430
- st.session_state.vectorstore = process_content(transcript_text, embeddings, source=youtube_url.strip())
431
- st.session_state.index_created = True
432
- st.session_state.qa_chain = None
433
  st.text("YouTube video processed successfully! ✅✅✅")
434
  except Exception as e:
435
  st.error(f"Error processing YouTube video: {str(e)}")
 
436
 
437
  # Summary button
438
  with main_container:
439
  if st.session_state.url_content and st.button("Generate Summary"):
440
  with st.spinner("Generating summary..."):
441
- st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm)
 
 
 
442
 
443
  # Display summary if generated
444
  if st.session_state.summary:
@@ -446,17 +389,14 @@ if st.session_state.summary:
446
  st.header("Summary of the Content")
447
  st.write(st.session_state.summary)
448
 
449
- # Query input with Ask button
450
- with main_container:
451
- st.header("Ask a Question")
452
- query = st.text_input("Question", placeholder="e.g., What is the video or article about?")
453
- ask_clicked = st.button("Ask")
454
-
455
- if ask_clicked and query:
456
  with main_container:
457
- if not st.session_state.index_created or st.session_state.vectorstore is None:
458
- st.error("No content processed. Please process a URL or YouTube video first.")
459
- else:
 
 
460
  with st.spinner("Processing your question..."):
461
  try:
462
  if "qa_chain" not in st.session_state or st.session_state.qa_chain is None:
@@ -465,7 +405,6 @@ if ask_clicked and query:
465
  st.error("Failed to create QA chain.")
466
  st.stop()
467
 
468
- st.text(f"Querying with question: {query}")
469
  result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
470
 
471
  if not result.get("answer"):
@@ -485,6 +424,7 @@ if ask_clicked and query:
485
  st.write("No sources found.")
486
  except Exception as e:
487
  st.error(f"Error answering query: {str(e)}")
 
488
 
489
  # Footer with tiny logo and text
490
  st.markdown(
 
6
  from langchain.text_splitter import RecursiveCharacterTextSplitter
7
  from langchain_core.documents import Document
8
  import os
 
9
  from langchain_groq import ChatGroq
10
  from langchain.chains.qa_with_sources.retrieval import RetrievalQAWithSourcesChain
11
  from langchain.prompts import PromptTemplate
 
13
  from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
14
  import yt_dlp
15
  import re
 
 
16
 
17
  # Load environment variables (optional)
18
  load_dotenv()
19
 
20
  # Hardcoded Groq API key
21
  GROQ_API_KEY = "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"
 
 
22
 
23
  # Custom CSS
24
  st.markdown("""
 
107
  st.title("WebChatter 💬")
108
 
109
  # Initialize session state
 
 
110
  if "url_content" not in st.session_state:
111
  st.session_state.url_content = None
 
 
112
  if "summary" not in st.session_state:
113
  st.session_state.summary = None
114
+ if "vectorstore" not in st.session_state:
115
+ st.session_state.vectorstore = None
116
+ if "index_created" not in st.session_state:
117
+ st.session_state.index_created = False
118
 
119
+ # Initialize embeddings once at the start (only for web URLs)
120
  if "embeddings" not in st.session_state:
121
  st.session_state.embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
122
 
 
125
  st.session_state.llm = ChatGroq(
126
  api_key=GROQ_API_KEY,
127
  model="llama3-70b-8192",
128
+ max_tokens=1024 # Reduced to lower resource usage
129
  )
130
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  # Sidebar for URL and YouTube input
132
  with st.sidebar:
133
  st.header("Enter Web URL")
 
141
  # Main content container
142
  main_container = st.container()
143
 
144
+ # Custom prompt for detailed answers (for web URLs only)
145
  qa_prompt = PromptTemplate(
146
  template="""You are an expert assistant tasked with providing detailed, extensive, and comprehensive answers. Use the provided context to answer the question thoroughly, including explanations, examples, and additional relevant information. If the context is limited, expand on the topic with your knowledge to ensure a complete response. In case of explaining anything, break the topic and explain step by step. Sometimes use your own reasoning and knowledge to explain anything to the users. If the users ask any question in Bengali, you too will answer it in fine and detailed Bengali.
147
 
 
153
  )
154
 
155
  # Function to summarize content
156
+ def summarize_content(content, llm, is_youtube=False):
157
+ if is_youtube:
158
+ # Extensive summary for YouTube videos (15-20 sentences)
159
+ summary_prompt = f"""You are an expert summarizer tasked with providing a very detailed and extensive summary of the following YouTube video transcript. Capture all key points, main ideas, and significant details in 15-20 sentences. Include specific examples, quotes, or moments from the transcript to make the summary comprehensive and vivid. Ensure the summary is well-organized, flowing naturally from one point to the next, and provides a thorough overview of the video's content.
160
+
161
+ Transcript: {content}
162
+
163
+ Extensive Summary: """
164
+ else:
165
+ # Shorter summary for web URLs (5-10 sentences)
166
+ summary_prompt = f"""Summarize the following content in 5-10 sentences, capturing the main points and key details in easy expression:
167
 
168
  {content}
169
 
 
195
  return " ".join([item['text'] for item in translated_transcript])
196
  return None
197
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
198
  # Function to extract subtitles using yt-dlp with cookies
199
  def extract_subtitles_with_ytdlp(video_url):
200
  ydl_opts = {
201
  'writesubtitles': True,
202
  'writeautomaticsub': True,
203
+ 'subtitleslangs': ['all', '-live_chat'],
204
  'skip_download': True,
205
  'subtitlesformat': 'vtt',
206
  'outtmpl': 'subtitle.%(ext)s',
 
209
  'Accept-Language': 'en-US,en;q=0.9',
210
  },
211
  'cookiefile': 'cookies.txt',
 
 
212
  }
213
  try:
214
  if not os.path.exists('cookies.txt'):
 
223
  return None
224
 
225
  with yt_dlp.YoutubeDL(ydl_opts) as ydl:
 
226
  info = ydl.extract_info(video_url, download=False)
227
  available_subs = info.get('subtitles', {})
228
  auto_subs = info.get('automatic_captions', {})
 
263
  st.error(f"Error fetching captions with yt-dlp: {str(e)}")
264
  return None
265
 
266
+ # Function to process and chunk text (for web URLs only)
267
  def process_content(text, embeddings, source):
268
  text_splitter = RecursiveCharacterTextSplitter(
269
  chunk_size=1000,
 
271
  separators=["\n\n", "\n", ".", " ", ""]
272
  )
273
  docs = text_splitter.create_documents([text], metadatas=[{"source": source}])
274
+ if not docs:
 
 
275
  st.error("No documents created from the content.")
276
  return None
277
  vectorstore = FAISS.from_documents(docs, embeddings)
 
278
  return vectorstore
279
 
280
+ # Function to create QA chain (for web URLs only)
281
  def create_qa_chain(vectorstore, llm):
282
  if vectorstore is None:
283
  st.error("Vector store is not initialized. Cannot create QA chain.")
 
292
  "document_variable_name": "context"
293
  }
294
  )
 
295
  return qa_chain
296
 
297
  # Process Web URL
 
318
  embeddings = st.session_state.embeddings
319
  st.session_state.vectorstore = process_content(st.session_state.url_content, embeddings, source=url.strip())
320
  st.session_state.index_created = True
321
+ st.session_state.summary = None
322
  st.text("Content processed successfully! ✅✅✅")
323
  except Exception as e:
324
  st.error(f"Error processing URL: {str(e)}")
325
+ st.stop()
326
 
327
  # Process YouTube Video
328
  if process_youtube_clicked:
 
346
  st.text("Fetching Closed Captions...Started...✅✅✅")
347
  transcript_text = extract_subtitles_with_ytdlp(youtube_url)
348
 
 
 
 
 
349
  if not transcript_text:
350
  st.error(
351
  "No transcripts or closed captions available. "
 
365
  st.stop()
366
 
367
  st.session_state.url_content = transcript_text
368
+ # No vector store for YouTube videos since we're not doing QA
369
+ st.session_state.vectorstore = None
370
+ st.session_state.index_created = False
371
+ st.session_state.summary = None
372
  st.text("YouTube video processed successfully! ✅✅✅")
373
  except Exception as e:
374
  st.error(f"Error processing YouTube video: {str(e)}")
375
+ st.stop()
376
 
377
  # Summary button
378
  with main_container:
379
  if st.session_state.url_content and st.button("Generate Summary"):
380
  with st.spinner("Generating summary..."):
381
+ # Check if the content is from a YouTube video (based on last processed input)
382
+ is_youtube = youtube_url.strip() and youtube_url == (st.session_state.get('last_processed_url', ''))
383
+ st.session_state.summary = summarize_content(st.session_state.url_content, st.session_state.llm, is_youtube=is_youtube)
384
+ st.session_state.last_processed_url = youtube_url if is_youtube else url
385
 
386
  # Display summary if generated
387
  if st.session_state.summary:
 
389
  st.header("Summary of the Content")
390
  st.write(st.session_state.summary)
391
 
392
+ # Query input with Ask button (only for web URLs)
393
+ if st.session_state.url_content and not youtube_url.strip():
 
 
 
 
 
394
  with main_container:
395
+ st.header("Ask a Question")
396
+ query = st.text_input("Question", placeholder="e.g., What is the article about?")
397
+ ask_clicked = st.button("Ask")
398
+
399
+ if ask_clicked and query:
400
  with st.spinner("Processing your question..."):
401
  try:
402
  if "qa_chain" not in st.session_state or st.session_state.qa_chain is None:
 
405
  st.error("Failed to create QA chain.")
406
  st.stop()
407
 
 
408
  result = st.session_state.qa_chain({"question": query}, return_only_outputs=True)
409
 
410
  if not result.get("answer"):
 
424
  st.write("No sources found.")
425
  except Exception as e:
426
  st.error(f"Error answering query: {str(e)}")
427
+ st.stop()
428
 
429
  # Footer with tiny logo and text
430
  st.markdown(