Spaces:

neel692
/

Youtube_Transcript_RAG

Sleeping

App Files Files Community

NeelTA committed on Jun 21, 2025

Commit

be2cf45

1 Parent(s): bbc4b89

transcript api changed

Browse files

Files changed (2) hide show

app.py +35 -39
requirements.txt +3 -2

app.py CHANGED Viewed

@@ -2,13 +2,12 @@ from dotenv import load_dotenv
 load_dotenv()
 import os
-import requests
 if not os.environ.get("GOOGLE_API_KEY"):
     raise RuntimeError("Please set the GOOGLE_API_KEY environment variable with your Google API key.")
 import gradio as gr
-from youtube_transcript_api import YouTubeTranscriptApi
-from youtube_transcript_api._api import TranscriptListFetcher  # ✅ Add this line
 from langchain_core.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
@@ -17,26 +16,8 @@ from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_core.messages import HumanMessage
 from langchain_google_genai import GoogleGenerativeAIEmbeddings
-# Load proxy credentials from Hugging Face Secrets
-PROXY_USER = os.environ.get("PROXY_USER")
-PROXY_PASS = os.environ.get("PP")
-PROXY_HOST = os.environ.get("PROXY_HOST")
-PROXY_PORT = os.environ.get("PROXY_PORT")
-if not all([PROXY_USER, PROXY_PASS, PROXY_HOST, PROXY_PORT]):
-    raise RuntimeError("Proxy credentials not fully set in Hugging Face Secrets.")
-PROXY_URL = f"http://{PROXY_USER}:{PROXY_PASS}@{PROXY_HOST}:{PROXY_PORT}"
-# Create a session with proxy
-proxy_session = requests.Session()
-proxy_session.proxies = {
-    "http": PROXY_URL,
-    "https": PROXY_URL
-}
-# Patch youtube_transcript_api to use this proxy session
-TranscriptListFetcher._session = proxy_session
 # Initialize the text splitter
 text_splitter = RecursiveCharacterTextSplitter(
@@ -73,13 +54,24 @@ chat = ChatGoogleGenerativeAI(
 # Define the prompt template
 prompt = PromptTemplate(
     template="""
-      You are a helpful assistant.
-      Answer ONLY from the provided transcript context.
-      If the context is insufficient, just say you don't know.
-      {context}
-      Question: {question}
-    """,
     input_variables=['context', 'question']
 )
@@ -107,12 +99,15 @@ def process_video_url(video_url_or_id):
         if current_video_id == video_id and current_retriever is not None:
             return f"✅ Video already processed: {video_id}"
-        # Get transcript
-        transcript = YouTubeTranscriptApi.get_transcript(video_id)
-        # Extract text segments
-        list_of_text_segments = [item['text'] for item in transcript]
-        full_transcript_text = " ".join(list_of_text_segments)
         # Create chunks
         chunks = text_splitter.create_documents([full_transcript_text])
@@ -120,7 +115,7 @@ def process_video_url(video_url_or_id):
         # Create vector store
         vector_store = FAISS.from_documents(chunks, embeddings)
         current_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 8})
-        print(f"✅ Current Retreiver : {current_retriever}")
         current_video_id = video_id
         return f"✅ Video processed successfully: {video_id}"
@@ -147,7 +142,7 @@ def answer_question(question):
         # Build context and reply as before
         context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
-        print("\nContext text:\n", context_text)
         final_prompt = prompt.invoke({"context": context_text, "question": question})
         answer = chat.invoke(final_prompt)
         return answer.content
@@ -213,9 +208,10 @@ def main():
         # Example inputs
         gr.Examples(
             examples=[
-                ["https://www.youtube.com/watch?v=JaRGJVrJBQ8", "What is this video about?"],
-                ["JaRGJVrJBQ8", "What are the main topics discussed?"],
-                ["https://www.youtube.com/watch?v=JaRGJVrJBQ8", "Summarize the key points"]
             ],
             inputs=[video_input, question_input]
         )

 load_dotenv()
 import os
 if not os.environ.get("GOOGLE_API_KEY"):
     raise RuntimeError("Please set the GOOGLE_API_KEY environment variable with your Google API key.")
 import gradio as gr
+from supadata import Supadata
 from langchain_core.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
 from langchain_core.messages import HumanMessage
 from langchain_google_genai import GoogleGenerativeAIEmbeddings
+# Initialize Supadata
+supadata = Supadata(api_key=os.environ.get("SUPADATA_API_KEY"))
 # Initialize the text splitter
 text_splitter = RecursiveCharacterTextSplitter(
 # Define the prompt template
 prompt = PromptTemplate(
     template="""
+You are an intelligent AI assistant specialized in analyzing YouTube video transcripts. Your task is to provide accurate, detailed, and helpful answers based solely on the provided transcript content.
+IMPORTANT GUIDELINES:
+- Answer ONLY from the provided transcript context
+- If the context is insufficient to answer the question, clearly state "I don't have enough information from the transcript to answer this question"
+- Provide specific details and examples from the transcript when possible
+- Be concise but comprehensive in your responses
+- If asked for a summary, organize the information logically
+- If asked about specific topics, focus on what was actually discussed in the video
+- Maintain a helpful and informative tone
+TRANSCRIPT CONTEXT:
+{context}
+QUESTION: {question}
+Please provide a clear and detailed answer based on the transcript above:
+""",
     input_variables=['context', 'question']
 )
         if current_video_id == video_id and current_retriever is not None:
             return f"✅ Video already processed: {video_id}"
+        # Get transcript using Supadata
+        transcript_response = supadata.youtube.transcript(
+            video_id=video_id,
+            text=True  # Get plain text transcript
+        )
+        # Extract the transcript text
+        full_transcript_text = transcript_response.content
+        # print(f"✅ Full transcript text: {full_transcript_text}")
         # Create chunks
         chunks = text_splitter.create_documents([full_transcript_text])
         # Create vector store
         vector_store = FAISS.from_documents(chunks, embeddings)
         current_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 8})
+        # print(f"✅ Current Retriever : {current_retriever}")
         current_video_id = video_id
         return f"✅ Video processed successfully: {video_id}"
         # Build context and reply as before
         context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
+        # print("\nContext text:\n", context_text)
         final_prompt = prompt.invoke({"context": context_text, "question": question})
         answer = chat.invoke(final_prompt)
         return answer.content
         # Example inputs
         gr.Examples(
             examples=[
+                ["https://www.youtube.com/watch?v=-moW9jvvMr4&t=1s", "What is this video about?"],
+                ["https://www.youtube.com/watch?v=-moW9jvvMr4&t=1s", "What are the main topics discussed in this video?"],
+                ["https://www.youtube.com/watch?v=-moW9jvvMr4&t=1s", "Can you summarize the key points from this video?"],
+                ["-moW9jvvMr4", "What are the most important takeaways from this content?"]
             ],
             inputs=[video_input, question_input]
         )

requirements.txt CHANGED Viewed

@@ -1,10 +1,11 @@
-youtube-transcript-api>=1.1.0
 langchain-core>=0.3.65
 langchain-community>=0.3.25
 langchain-huggingface>=0.3.0
 faiss-cpu>=1.11.0
 gradio>=5.34.0
 huggingface-hub>=0.33.0
 sentence-transformers>=4.1.0
 tf_keras>=2.18.0
-langchain_google_genai>=2.1.5

+supadata>=1.0.0
 langchain-core>=0.3.65
 langchain-community>=0.3.25
 langchain-huggingface>=0.3.0
+langchain-google-genai>=2.0.0
 faiss-cpu>=1.11.0
 gradio>=5.34.0
 huggingface-hub>=0.33.0
 sentence-transformers>=4.1.0
 tf_keras>=2.18.0
+google-generativeai>=0.8.0