NeelTA committed on
Commit be2cf45 · 1 Parent(s): bbc4b89

transcript api changed

Files changed (2):
  1. app.py +35 -39
  2. requirements.txt +3 -2
app.py CHANGED

@@ -2,13 +2,12 @@ from dotenv import load_dotenv
 load_dotenv()
 
 import os
-import requests
+
 if not os.environ.get("GOOGLE_API_KEY"):
     raise RuntimeError("Please set the GOOGLE_API_KEY environment variable with your Google API key.")
 
 import gradio as gr
-from youtube_transcript_api import YouTubeTranscriptApi
-from youtube_transcript_api._api import TranscriptListFetcher  # ✅ Add this line
+from supadata import Supadata
 from langchain_core.prompts import PromptTemplate
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.vectorstores import FAISS
@@ -17,26 +16,8 @@ from langchain_google_genai import ChatGoogleGenerativeAI
 from langchain_core.messages import HumanMessage
 from langchain_google_genai import GoogleGenerativeAIEmbeddings
 
-# Load proxy credentials from Hugging Face Secrets
-PROXY_USER = os.environ.get("PROXY_USER")
-PROXY_PASS = os.environ.get("PP")
-PROXY_HOST = os.environ.get("PROXY_HOST")
-PROXY_PORT = os.environ.get("PROXY_PORT")
-
-if not all([PROXY_USER, PROXY_PASS, PROXY_HOST, PROXY_PORT]):
-    raise RuntimeError("Proxy credentials not fully set in Hugging Face Secrets.")
-
-PROXY_URL = f"http://{PROXY_USER}:{PROXY_PASS}@{PROXY_HOST}:{PROXY_PORT}"
-
-# Create a session with proxy
-proxy_session = requests.Session()
-proxy_session.proxies = {
-    "http": PROXY_URL,
-    "https": PROXY_URL
-}
-
-# Patch youtube_transcript_api to use this proxy session
-TranscriptListFetcher._session = proxy_session
+# Initialize Supadata
+supadata = Supadata(api_key=os.environ.get("SUPADATA_API_KEY"))
 
 # Initialize the text splitter
 text_splitter = RecursiveCharacterTextSplitter(
@@ -73,13 +54,24 @@ chat = ChatGoogleGenerativeAI(
 # Define the prompt template
 prompt = PromptTemplate(
     template="""
-    You are a helpful assistant.
-    Answer ONLY from the provided transcript context.
-    If the context is insufficient, just say you don't know.
+    You are an intelligent AI assistant specialized in analyzing YouTube video transcripts. Your task is to provide accurate, detailed, and helpful answers based solely on the provided transcript content.
 
-    {context}
-    Question: {question}
-    """,
+    IMPORTANT GUIDELINES:
+    - Answer ONLY from the provided transcript context
+    - If the context is insufficient to answer the question, clearly state "I don't have enough information from the transcript to answer this question"
+    - Provide specific details and examples from the transcript when possible
+    - Be concise but comprehensive in your responses
+    - If asked for a summary, organize the information logically
+    - If asked about specific topics, focus on what was actually discussed in the video
+    - Maintain a helpful and informative tone
+
+    TRANSCRIPT CONTEXT:
+    {context}
+
+    QUESTION: {question}
+
+    Please provide a clear and detailed answer based on the transcript above:
+    """,
     input_variables=['context', 'question']
 )
 
@@ -107,12 +99,15 @@ def process_video_url(video_url_or_id):
     if current_video_id == video_id and current_retriever is not None:
         return f"✅ Video already processed: {video_id}"
 
-    # Get transcript
-    transcript = YouTubeTranscriptApi.get_transcript(video_id)
+    # Get transcript using Supadata
+    transcript_response = supadata.youtube.transcript(
+        video_id=video_id,
+        text=True  # Get plain text transcript
+    )
 
-    # Extract text segments
-    list_of_text_segments = [item['text'] for item in transcript]
-    full_transcript_text = " ".join(list_of_text_segments)
+    # Extract the transcript text
+    full_transcript_text = transcript_response.content
+    # print(f"✅ Full transcript text: {full_transcript_text}")
 
     # Create chunks
     chunks = text_splitter.create_documents([full_transcript_text])
@@ -120,7 +115,7 @@ def process_video_url(video_url_or_id):
     # Create vector store
     vector_store = FAISS.from_documents(chunks, embeddings)
     current_retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 8})
-    print(f"✅ Current Retreiver : {current_retriever}")
+    # print(f"✅ Current Retreiver : {current_retriever}")
     current_video_id = video_id
 
     return f"✅ Video processed successfully: {video_id}"
@@ -147,7 +142,7 @@ def answer_question(question):
 
     # Build context and reply as before
     context_text = "\n\n".join(doc.page_content for doc in retrieved_docs)
-    print("\nContext text:\n", context_text)
+    # print("\nContext text:\n", context_text)
     final_prompt = prompt.invoke({"context": context_text, "question": question})
     answer = chat.invoke(final_prompt)
     return answer.content
@@ -213,9 +208,10 @@ def main():
     # Example inputs
     gr.Examples(
         examples=[
-            ["https://www.youtube.com/watch?v=JaRGJVrJBQ8", "What is this video about?"],
-            ["JaRGJVrJBQ8", "What are the main topics discussed?"],
-            ["https://www.youtube.com/watch?v=JaRGJVrJBQ8", "Summarize the key points"]
+            ["https://www.youtube.com/watch?v=-moW9jvvMr4&t=1s", "What is this video about?"],
+            ["https://www.youtube.com/watch?v=-moW9jvvMr4&t=1s", "What are the main topics discussed in this video?"],
+            ["https://www.youtube.com/watch?v=-moW9jvvMr4&t=1s", "Can you summarize the key points from this video?"],
+            ["-moW9jvvMr4", "What are the most important takeaways from this content?"]
         ],
         inputs=[video_input, question_input]
     )
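For reference, the new transcript path introduced above can be condensed into a standalone sketch like the one below. This is not code from the repo: the Supadata call, the plain-text `.content` access, and the retriever settings mirror the lines added in app.py, while the video ID, chunk sizes, and embedding model name are illustrative assumptions.

    import os
    from supadata import Supadata
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    from langchain_community.vectorstores import FAISS
    from langchain_google_genai import GoogleGenerativeAIEmbeddings

    # Both keys are read from environment variables (Hugging Face Secrets in the Space)
    supadata = Supadata(api_key=os.environ["SUPADATA_API_KEY"])

    # Fetch the transcript as plain text, as the new process_video_url() does
    transcript_response = supadata.youtube.transcript(video_id="-moW9jvvMr4", text=True)
    full_transcript_text = transcript_response.content

    # Chunk the transcript and build the FAISS retriever (chunk sizes are assumed values)
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    chunks = text_splitter.create_documents([full_transcript_text])
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")  # assumed model name
    vector_store = FAISS.from_documents(chunks, embeddings)
    retriever = vector_store.as_retriever(search_type="similarity", search_kwargs={"k": 8})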
requirements.txt CHANGED

@@ -1,10 +1,11 @@
-youtube-transcript-api>=1.1.0
+supadata>=1.0.0
 langchain-core>=0.3.65
 langchain-community>=0.3.25
 langchain-huggingface>=0.3.0
+langchain-google-genai>=2.0.0
 faiss-cpu>=1.11.0
 gradio>=5.34.0
 huggingface-hub>=0.33.0
 sentence-transformers>=4.1.0
 tf_keras>=2.18.0
-langchain_google_genai>=2.1.5
+google-generativeai>=0.8.0
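Since this commit drops the proxy credentials in favor of a Supadata key, a small pre-flight check along these lines can confirm the Space has the secrets it now needs. It is a sketch rather than code from the repo, reusing the same environment variable names as app.py.

    import os

    # Secrets required after this change: the Gemini key plus the new Supadata key
    missing = [name for name in ("GOOGLE_API_KEY", "SUPADATA_API_KEY") if not os.environ.get(name)]
    if missing:
        raise RuntimeError(f"Missing required secrets: {', '.join(missing)}")
    print("✅ All required secrets are set")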