Spaces:

agnixcode
/

youtube_chatbot_transcriber

Sleeping

App Files Files Community

agnixcode committed on 26 days ago

Commit

e34d257

verified ·

1 Parent(s): 784b49b

Update app.py

Browse files

Files changed (1) hide show

app.py +34 -25

app.py CHANGED Viewed

@@ -1,7 +1,12 @@
 # ================================
 # IMPORTS
 # ================================
-from youtube_transcript_api import YouTubeTranscriptApi
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
@@ -13,9 +18,10 @@ import os
 # ================================
 # CONFIG
 # ================================
-GROQ_API_KEY = os.getenv("GROQ_API_KEY")  # 🔐 Use HF secrets
-client = Groq(api_key=GROQ_API_KEY)
 embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 # Global store
@@ -31,6 +37,7 @@ def extract_video_id(url):
 # ================================
 # STEP 1: GET TRANSCRIPT
 # ================================
 def get_transcript(url):
     video_id = extract_video_id(url)
@@ -38,11 +45,28 @@ def get_transcript(url):
         return "❌ Invalid YouTube URL"
     try:
-        api = YouTubeTranscriptApi()
-        transcript = api.fetch(video_id)
-        full_text = " ".join([t.text for t in transcript])
-        return full_text
     except Exception as e:
         return f"❌ Transcript Error: {str(e)}"
@@ -53,11 +77,9 @@ def get_transcript(url):
 def chunk_text(text, chunk_size=300):
     """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

     Words are whitespace-delimited and each chunk is re-joined with single
     spaces. The final chunk may hold fewer words. Empty or whitespace-only
     text produces an empty list.
     """
     tokens = text.split()
     starts = range(0, len(tokens), chunk_size)
     return [" ".join(tokens[start:start + chunk_size]) for start in starts]
 # ================================
@@ -65,13 +87,10 @@ def chunk_text(text, chunk_size=300):
 # ================================
 def create_vector_store(chunks):
     """Embed ``chunks`` and publish a fresh FAISS index for retrieval.

     Rebinds the module-level ``vector_store`` to a new ``IndexFlatL2`` filled
     with the chunk embeddings, and ``stored_chunks`` to ``chunks`` itself, so
     whatever was stored before is replaced.
     """
     global vector_store, stored_chunks
     chunk_vectors = embed_model.encode(chunks)
     # The index width is read from the second axis of the encoder output.
     flat_index = faiss.IndexFlatL2(chunk_vectors.shape[1])
     flat_index.add(np.array(chunk_vectors))
     vector_store, stored_chunks = flat_index, chunks
@@ -81,16 +100,14 @@ def create_vector_store(chunks):
 def retrieve(query, top_k=3):
     """Return the stored chunks closest to ``query``, joined by newlines.

     The query is embedded with ``embed_model`` and searched against the
     module-level ``vector_store``; up to ``top_k`` matching entries of
     ``stored_chunks`` are returned in the order the search reports them.
     Fewer than ``top_k`` chunks are returned when the index is smaller.
     """
     query_embedding = embed_model.encode([query])
     _distances, indices = vector_store.search(np.array(query_embedding), top_k)
     # FAISS pads the result with -1 when the index holds fewer than top_k
     # vectors. Indexing stored_chunks with -1 would silently repeat the last
     # chunk, so the padding slots are dropped.
     results = [stored_chunks[i] for i in indices[0] if i >= 0]
     return "\n".join(results)
 # ================================
-# STEP 5: LLM
 # ================================
 def generate_answer(query, context):
-    prompt = f"""
-You are a helpful assistant.
 Use ONLY the context below to answer the question.
@@ -100,15 +117,13 @@ Context:
 Question:
 {query}
-Answer:
-"""
     response = client.chat.completions.create(
         model="llama-3.3-70b-versatile",
         messages=[{"role": "user", "content": prompt}],
         temperature=0.3
     )
     return response.choices[0].message.content
 # ================================
@@ -116,24 +131,18 @@ Answer:
 # ================================
 def handle_process(url):
     """Fetch, chunk and index the transcript of the video at ``url``.

     Returns a ``(status, preview, history)`` triple: a status message, the
     first 500 characters of the transcript (empty on failure), and an empty
     list (presumably the reset chat history; confirm against the UI wiring).
     """
     transcript = get_transcript(url)
     # get_transcript reports failures as a message that starts with "❌".
     if transcript.startswith("❌"):
         return transcript, "", []
     chunks = chunk_text(transcript)
     # A blank transcript yields no chunks; create_vector_store cannot size an
     # index from an empty embedding batch, so report the problem instead.
     if not chunks:
         return "❌ Transcript is empty", "", []
     create_vector_store(chunks)
     preview = transcript[:500]
     return "✅ Video processed successfully!", preview, []
 def handle_chat(query, chat_history):
     """Answer ``query`` from the indexed transcript and record the exchange.

     Returns ``("", history)``. When a video has been indexed, the
     ``(query, answer)`` pair is appended to ``chat_history`` in place and that
     same list is returned. Otherwise a new list carrying a reminder message is
     returned and ``chat_history`` is left untouched.
     """
     if vector_store is not None:
         answer = generate_answer(query, retrieve(query))
         chat_history.append((query, answer))
         return "", chat_history
     return "", chat_history + [(query, "❌ Process a video first")]

+# ================================
+# INSTALL DEPENDENCIES
+# ================================
+# pip install sentence-transformers faiss-cpu gradio groq requests
 # ================================
 # IMPORTS
 # ================================
+import requests
 from sentence_transformers import SentenceTransformer
 import faiss
 import numpy as np
 # ================================
 # CONFIG
 # ================================
+GROQ_API_KEY = os.getenv("GROQ_API_KEY")
+SUPADATA_API_KEY = os.getenv("SUPADATA_API_KEY")
+client = Groq(api_key=GROQ_API_KEY)
 embed_model = SentenceTransformer("all-MiniLM-L6-v2")
 # Global store
 # ================================
 # STEP 1: GET TRANSCRIPT
+# Using Supadata API — works from any cloud server (no IP blocks)
 # ================================
 def get_transcript(url):
     video_id = extract_video_id(url)
         return "❌ Invalid YouTube URL"
     try:
+        response = requests.get(
+            "https://api.supadata.ai/v1/youtube/transcript",
+            params={"videoId": video_id, "text": "true"},
+            headers={"x-api-key": SUPADATA_API_KEY},
+            timeout=30
+        )
+        if response.status_code == 401:
+            return "❌ Invalid Supadata API key. Check your HF secret: SUPADATA_API_KEY"
+        if response.status_code == 404:
+            return "❌ No transcript found for this video (it may have captions disabled)"
+        if response.status_code != 200:
+            return f"❌ Supadata API error {response.status_code}: {response.text}"
+        data = response.json()
+        # text=true returns content as a plain string
+        content = data.get("content", "")
+        if not content:
+            return "❌ Transcript is empty"
+        return content
     except Exception as e:
         return f"❌ Transcript Error: {str(e)}"
 def chunk_text(text, chunk_size=300):
     """Split ``text`` into consecutive chunks of at most ``chunk_size`` words.

     Words are whitespace-delimited and each chunk is re-joined with single
     spaces. The final chunk may hold fewer words. Empty or whitespace-only
     text produces an empty list.
     """
     tokens = text.split()
     starts = range(0, len(tokens), chunk_size)
     return [" ".join(tokens[start:start + chunk_size]) for start in starts]
 # ================================
 # ================================
 def create_vector_store(chunks):
     """Embed ``chunks`` and publish a fresh FAISS index for retrieval.

     Rebinds the module-level ``vector_store`` to a new ``IndexFlatL2`` filled
     with the chunk embeddings, and ``stored_chunks`` to ``chunks`` itself, so
     whatever was stored before is replaced.
     """
     global vector_store, stored_chunks
     chunk_vectors = embed_model.encode(chunks)
     # The index width is read from the second axis of the encoder output.
     flat_index = faiss.IndexFlatL2(chunk_vectors.shape[1])
     flat_index.add(np.array(chunk_vectors))
     vector_store, stored_chunks = flat_index, chunks
 def retrieve(query, top_k=3):
     """Return the stored chunks closest to ``query``, joined by newlines.

     The query is embedded with ``embed_model`` and searched against the
     module-level ``vector_store``; up to ``top_k`` matching entries of
     ``stored_chunks`` are returned in the order the search reports them.
     Fewer than ``top_k`` chunks are returned when the index is smaller.
     """
     query_embedding = embed_model.encode([query])
     _distances, indices = vector_store.search(np.array(query_embedding), top_k)
     # FAISS pads the result with -1 when the index holds fewer than top_k
     # vectors. Indexing stored_chunks with -1 would silently repeat the last
     # chunk, so the padding slots are dropped.
     results = [stored_chunks[i] for i in indices[0] if i >= 0]
     return "\n".join(results)
 # ================================
+# STEP 5: LLM (GROQ)
 # ================================
 def generate_answer(query, context):
+    prompt = f"""You are a helpful assistant.
 Use ONLY the context below to answer the question.
 Question:
 {query}
+Answer:"""
     response = client.chat.completions.create(
         model="llama-3.3-70b-versatile",
         messages=[{"role": "user", "content": prompt}],
         temperature=0.3
     )
     return response.choices[0].message.content
 # ================================
 # ================================
 def handle_process(url):
     """Fetch, chunk and index the transcript of the video at ``url``.

     Returns a ``(status, preview, history)`` triple: a status message, the
     first 500 characters of the transcript (empty on failure), and an empty
     list (presumably the reset chat history; confirm against the UI wiring).
     """
     transcript = get_transcript(url)
     # get_transcript reports failures as a message that starts with "❌".
     if transcript.startswith("❌"):
         return transcript, "", []
     chunks = chunk_text(transcript)
     # A whitespace-only transcript passes get_transcript's emptiness check but
     # yields no chunks; create_vector_store cannot size an index from an empty
     # embedding batch, so report the problem instead.
     if not chunks:
         return "❌ Transcript is empty", "", []
     create_vector_store(chunks)
     preview = transcript[:500]
     return "✅ Video processed successfully!", preview, []
 def handle_chat(query, chat_history):
     """Answer ``query`` from the indexed transcript and record the exchange.

     Returns ``("", history)``. When a video has been indexed, the
     ``(query, answer)`` pair is appended to ``chat_history`` in place and that
     same list is returned. Otherwise a new list carrying a reminder message is
     returned and ``chat_history`` is left untouched.
     """
     if vector_store is not None:
         answer = generate_answer(query, retrieve(query))
         chat_history.append((query, answer))
         return "", chat_history
     return "", chat_history + [(query, "❌ Process a video first")]