agnixcode committed on
Commit
e34d257
·
verified ·
1 Parent(s): 784b49b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -25
app.py CHANGED
@@ -1,7 +1,12 @@
 
 
 
 
 
1
  # ================================
2
  # IMPORTS
3
  # ================================
4
- from youtube_transcript_api import YouTubeTranscriptApi
5
  from sentence_transformers import SentenceTransformer
6
  import faiss
7
  import numpy as np
@@ -13,9 +18,10 @@ import os
13
  # ================================
14
  # CONFIG
15
  # ================================
16
- GROQ_API_KEY = os.getenv("GROQ_API_KEY") # 🔐 Use HF secrets
17
- client = Groq(api_key=GROQ_API_KEY)
18
 
 
19
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
20
 
21
  # Global store
@@ -31,6 +37,7 @@ def extract_video_id(url):
31
 
32
  # ================================
33
  # STEP 1: GET TRANSCRIPT
 
34
  # ================================
35
  def get_transcript(url):
36
  video_id = extract_video_id(url)
@@ -38,11 +45,28 @@ def get_transcript(url):
38
  return "❌ Invalid YouTube URL"
39
 
40
  try:
41
- api = YouTubeTranscriptApi()
42
- transcript = api.fetch(video_id)
 
 
 
 
43
 
44
- full_text = " ".join([t.text for t in transcript])
45
- return full_text
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
  except Exception as e:
48
  return f"❌ Transcript Error: {str(e)}"
@@ -53,11 +77,9 @@ def get_transcript(url):
53
  def chunk_text(text, chunk_size=300):
54
  words = text.split()
55
  chunks = []
56
-
57
  for i in range(0, len(words), chunk_size):
58
  chunk = " ".join(words[i:i + chunk_size])
59
  chunks.append(chunk)
60
-
61
  return chunks
62
 
63
  # ================================
@@ -65,13 +87,10 @@ def chunk_text(text, chunk_size=300):
65
  # ================================
66
  def create_vector_store(chunks):
67
  global vector_store, stored_chunks
68
-
69
  embeddings = embed_model.encode(chunks)
70
-
71
  dim = embeddings.shape[1]
72
  index = faiss.IndexFlatL2(dim)
73
  index.add(np.array(embeddings))
74
-
75
  vector_store = index
76
  stored_chunks = chunks
77
 
@@ -81,16 +100,14 @@ def create_vector_store(chunks):
81
  def retrieve(query, top_k=3):
82
  query_embedding = embed_model.encode([query])
83
  distances, indices = vector_store.search(np.array(query_embedding), top_k)
84
-
85
  results = [stored_chunks[i] for i in indices[0]]
86
  return "\n".join(results)
87
 
88
  # ================================
89
- # STEP 5: LLM
90
  # ================================
91
  def generate_answer(query, context):
92
- prompt = f"""
93
- You are a helpful assistant.
94
 
95
  Use ONLY the context below to answer the question.
96
 
@@ -100,15 +117,13 @@ Context:
100
  Question:
101
  {query}
102
 
103
- Answer:
104
- """
105
 
106
  response = client.chat.completions.create(
107
  model="llama-3.3-70b-versatile",
108
  messages=[{"role": "user", "content": prompt}],
109
  temperature=0.3
110
  )
111
-
112
  return response.choices[0].message.content
113
 
114
  # ================================
@@ -116,24 +131,18 @@ Answer:
116
  # ================================
117
  def handle_process(url):
118
  transcript = get_transcript(url)
119
-
120
  if transcript.startswith("❌"):
121
  return transcript, "", []
122
-
123
  chunks = chunk_text(transcript)
124
  create_vector_store(chunks)
125
-
126
  preview = transcript[:500]
127
-
128
  return "✅ Video processed successfully!", preview, []
129
 
130
  def handle_chat(query, chat_history):
131
  if vector_store is None:
132
  return "", chat_history + [(query, "❌ Process a video first")]
133
-
134
  context = retrieve(query)
135
  answer = generate_answer(query, context)
136
-
137
  chat_history.append((query, answer))
138
  return "", chat_history
139
 
 
1
+ # ================================
2
+ # INSTALL DEPENDENCIES
3
+ # ================================
4
+ # pip install sentence-transformers faiss-cpu gradio groq requests
5
+
6
  # ================================
7
  # IMPORTS
8
  # ================================
9
+ import requests
10
  from sentence_transformers import SentenceTransformer
11
  import faiss
12
  import numpy as np
 
18
  # ================================
19
  # CONFIG
20
  # ================================
21
+ GROQ_API_KEY = os.getenv("GROQ_API_KEY")
22
+ SUPADATA_API_KEY = os.getenv("SUPADATA_API_KEY")
23
 
24
+ client = Groq(api_key=GROQ_API_KEY)
25
  embed_model = SentenceTransformer("all-MiniLM-L6-v2")
26
 
27
  # Global store
 
37
 
38
  # ================================
39
  # STEP 1: GET TRANSCRIPT
40
+ # Using Supadata API — works from any cloud server (no IP blocks)
41
  # ================================
42
  def get_transcript(url):
43
  video_id = extract_video_id(url)
 
45
  return "❌ Invalid YouTube URL"
46
 
47
  try:
48
+ response = requests.get(
49
+ "https://api.supadata.ai/v1/youtube/transcript",
50
+ params={"videoId": video_id, "text": "true"},
51
+ headers={"x-api-key": SUPADATA_API_KEY},
52
+ timeout=30
53
+ )
54
 
55
+ if response.status_code == 401:
56
+ return "❌ Invalid Supadata API key. Check your HF secret: SUPADATA_API_KEY"
57
+ if response.status_code == 404:
58
+ return "❌ No transcript found for this video (it may have captions disabled)"
59
+ if response.status_code != 200:
60
+ return f"❌ Supadata API error {response.status_code}: {response.text}"
61
+
62
+ data = response.json()
63
+
64
+ # text=true returns content as a plain string
65
+ content = data.get("content", "")
66
+ if not content:
67
+ return "❌ Transcript is empty"
68
+
69
+ return content
70
 
71
  except Exception as e:
72
  return f"❌ Transcript Error: {str(e)}"
 
77
  def chunk_text(text, chunk_size=300):
78
  words = text.split()
79
  chunks = []
 
80
  for i in range(0, len(words), chunk_size):
81
  chunk = " ".join(words[i:i + chunk_size])
82
  chunks.append(chunk)
 
83
  return chunks
84
 
85
  # ================================
 
87
  # ================================
88
  def create_vector_store(chunks):
89
  global vector_store, stored_chunks
 
90
  embeddings = embed_model.encode(chunks)
 
91
  dim = embeddings.shape[1]
92
  index = faiss.IndexFlatL2(dim)
93
  index.add(np.array(embeddings))
 
94
  vector_store = index
95
  stored_chunks = chunks
96
 
 
100
  def retrieve(query, top_k=3):
101
  query_embedding = embed_model.encode([query])
102
  distances, indices = vector_store.search(np.array(query_embedding), top_k)
 
103
  results = [stored_chunks[i] for i in indices[0]]
104
  return "\n".join(results)
105
 
106
  # ================================
107
+ # STEP 5: LLM (GROQ)
108
  # ================================
109
  def generate_answer(query, context):
110
+ prompt = f"""You are a helpful assistant.
 
111
 
112
  Use ONLY the context below to answer the question.
113
 
 
117
  Question:
118
  {query}
119
 
120
+ Answer:"""
 
121
 
122
  response = client.chat.completions.create(
123
  model="llama-3.3-70b-versatile",
124
  messages=[{"role": "user", "content": prompt}],
125
  temperature=0.3
126
  )
 
127
  return response.choices[0].message.content
128
 
129
  # ================================
 
131
  # ================================
132
  def handle_process(url):
133
  transcript = get_transcript(url)
 
134
  if transcript.startswith("❌"):
135
  return transcript, "", []
 
136
  chunks = chunk_text(transcript)
137
  create_vector_store(chunks)
 
138
  preview = transcript[:500]
 
139
  return "✅ Video processed successfully!", preview, []
140
 
141
  def handle_chat(query, chat_history):
142
  if vector_store is None:
143
  return "", chat_history + [(query, "❌ Process a video first")]
 
144
  context = retrieve(query)
145
  answer = generate_answer(query, context)
 
146
  chat_history.append((query, answer))
147
  return "", chat_history
148