FocusFlow Assistant committed on
Commit
068aa2f
·
1 Parent(s): dcf165e

feat: use Invidious API for YouTube transcripts

Browse files

Bypass YouTube CORS/blocking by fetching transcripts through
the public Invidious API server-side.

Changes:
- Added get_youtube_transcript using Invidious to rag_engine.py
- Added /ingest_youtube endpoint to backend/main.py
- Refactored app.py URL parsing to hit backend directly
- Removed browser-side JS component and youtube_transcript.html

Files changed (4) hide show
  1. app.py +20 -31
  2. backend/main.py +35 -0
  3. backend/rag_engine.py +113 -0
  4. frontend/youtube_transcript.html +0 -137
app.py CHANGED
@@ -1137,7 +1137,7 @@ if not st.session_state.focus_mode:
1137
  st.error(f"Error: {e}")
1138
 
1139
 
1140
- # URL Input - YouTube uses browser-side fetch, web pages use server-side
1141
  with st.expander("+ Add URL / YouTube"):
1142
  url_input = st.text_input("URL", placeholder="https://youtube.com/... or any webpage", label_visibility="collapsed")
1143
  if st.button("Process URL", use_container_width=True):
@@ -1154,8 +1154,25 @@ if not st.session_state.focus_mode:
1154
  st.error("❌ Invalid YouTube URL format. Supported: youtube.com/watch?v=ID, youtu.be/ID, or youtube.com/shorts/ID")
1155
  else:
1156
  video_id = vid_match.group(1)
1157
- st.session_state["yt_processing"] = video_id
1158
- st.rerun()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1159
  else:
1160
  # Non-YouTube URL: use server-side ingestion
1161
  with st.spinner("Fetching content..."):
@@ -1172,34 +1189,6 @@ if not st.session_state.focus_mode:
1172
  st.error("⏱️ Request timed out. Please try again.")
1173
  except Exception as e:
1174
  st.error(f"Error: {str(e)}")
1175
-
1176
- # Browser-side YouTube transcript fetch
1177
- if st.session_state.get("yt_processing"):
1178
- video_id = st.session_state["yt_processing"]
1179
- st.info(f"⏳ Fetching transcript for video `{video_id}` from your browser...")
1180
-
1181
- # Load and inject the YouTube transcript component with video ID
1182
- try:
1183
- import os as os_mod
1184
- html_path = os_mod.path.join(os_mod.path.dirname(__file__), "frontend", "youtube_transcript.html")
1185
- with open(html_path) as f:
1186
- yt_html = f.read()
1187
-
1188
- # Inject the video ID so the component auto-starts
1189
- yt_html = yt_html.replace(
1190
- '<body>',
1191
- f'<body data-video-id="{video_id}">'
1192
- )
1193
-
1194
- components.html(yt_html, height=40)
1195
-
1196
- # Auto-clear after a delay (user can click Process URL again if needed)
1197
- if st.button("🔄 Done / Refresh Sources", use_container_width=True):
1198
- st.session_state.pop("yt_processing", None)
1199
- st.rerun()
1200
- except FileNotFoundError:
1201
- st.error("YouTube component not found. Please check frontend/youtube_transcript.html exists.")
1202
- st.session_state.pop("yt_processing", None)
1203
 
1204
  # --- FOCUS MODE UI ---
1205
  if st.session_state.focus_mode:
 
1137
  st.error(f"Error: {e}")
1138
 
1139
 
1140
+ # URL Input
1141
  with st.expander("+ Add URL / YouTube"):
1142
  url_input = st.text_input("URL", placeholder="https://youtube.com/... or any webpage", label_visibility="collapsed")
1143
  if st.button("Process URL", use_container_width=True):
 
1154
  st.error("❌ Invalid YouTube URL format. Supported: youtube.com/watch?v=ID, youtu.be/ID, or youtube.com/shorts/ID")
1155
  else:
1156
  video_id = vid_match.group(1)
1157
+ with st.spinner(" Fetching transcript via Invidious..."):
1158
+ try:
1159
+ resp = requests.post(f"{API_URL}/ingest_youtube", json={"video_id": video_id}, headers=get_headers(), timeout=120)
1160
+ if resp.status_code == 200:
1161
+ st.success("✅ YouTube transcript processed successfully!")
1162
+ time.sleep(1)
1163
+ st.rerun()
1164
+ else:
1165
+ error_detail = resp.json().get('detail', resp.text)
1166
+ if "No captions available" in str(error_detail):
1167
+ st.error("❌ No captions found. Try a video with CC enabled.")
1168
+ elif "Could not reach any transcript" in str(error_detail):
1169
+ st.error("⚠️ Transcript service unavailable. Try again later.")
1170
+ else:
1171
+ st.error(f"Failed: {error_detail}")
1172
+ except requests.Timeout:
1173
+ st.error("⏱️ Request timed out. Please try again.")
1174
+ except Exception as e:
1175
+ st.error(f"Error: {str(e)}")
1176
  else:
1177
  # Non-YouTube URL: use server-side ingestion
1178
  with st.spinner("Fetching content..."):
 
1189
  st.error("⏱️ Request timed out. Please try again.")
1190
  except Exception as e:
1191
  st.error(f"Error: {str(e)}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1192
 
1193
  # --- FOCUS MODE UI ---
1194
  if st.session_state.focus_mode:
backend/main.py CHANGED
@@ -138,6 +138,41 @@ def ingest_text_endpoint(request: TextIngestionRequest, db: Session = Depends(ge
138
  except Exception as e:
139
  raise HTTPException(status_code=500, detail=str(e))
140
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  @app.get("/sources", response_model=List[SourceItem])
142
  def get_sources(db: Session = Depends(get_db)):
143
  sources = db.query(Source).filter(Source.is_active == True).all()
 
138
  except Exception as e:
139
  raise HTTPException(status_code=500, detail=str(e))
140
 
141
class YouTubeIngestionRequest(BaseModel):
    """Request body for ``POST /ingest_youtube``."""

    # Bare YouTube video identifier (not a full URL).
    video_id: str


@app.post("/ingest_youtube")
def ingest_youtube(request: YouTubeIngestionRequest, db: Session = Depends(get_db)):
    """Fetch a YouTube transcript, ingest it, and record it as a source.

    The transcript is fetched with ``get_youtube_transcript``, passed through
    ``ingest_text`` and then stored as a ``Source`` row of type ``"youtube"``.

    Raises:
        HTTPException: 400 when the video ID is malformed or the transcript
            helper raises ``ValueError`` (its message is passed through as the
            ``detail``); 500 for any other failure.
    """
    import re

    # The ID ends up inside outbound request URLs, so reject anything that
    # could alter the path or query string. IDs are currently 11 characters
    # from this alphabet; the length bound is deliberately looser than that.
    video_id = request.video_id.strip()
    if not re.fullmatch(r"[A-Za-z0-9_-]{1,64}", video_id):
        raise HTTPException(status_code=400, detail="Invalid YouTube video ID.")

    try:
        from backend.rag_engine import get_youtube_transcript, ingest_text

        # Fetch transcript using Invidious
        transcript_text = get_youtube_transcript(video_id)

        # Run through existing ingestion pipeline
        source_name = f"YouTube: {video_id}"
        title = ingest_text(
            text=transcript_text,
            source_name=source_name,
            source_type="youtube"
        )

        # Save to DB
        new_source = Source(filename=title, type="youtube", file_path=source_name, is_active=True)
        db.add(new_source)
        db.commit()
        db.refresh(new_source)

        return {"status": "success", "message": f"Successfully added: {title}", "source": source_name, "id": new_source.id}

    except ValueError as e:
        # Expected, user-correctable failures; the UI matches on this text.
        db.rollback()
        raise HTTPException(status_code=400, detail=str(e)) from e
    except Exception as e:
        # Leave the session clean if add/commit (or anything else) failed.
        db.rollback()
        raise HTTPException(
            status_code=500,
            detail=f"Failed to process YouTube video: {str(e)}"
        ) from e
174
+
175
+
176
  @app.get("/sources", response_model=List[SourceItem])
177
  def get_sources(db: Session = Depends(get_db)):
178
  sources = db.query(Source).filter(Source.is_active == True).all()
backend/rag_engine.py CHANGED
@@ -9,14 +9,127 @@ from langchain_core.documents import Document
9
  import logging
10
  import time
11
  import re
 
 
12
 
13
  # Configure logger FIRST
14
  logger = logging.getLogger(__name__)
15
 
16
  CACHE_DIR = "./chroma_db"
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
 
 
 
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
 
22
  def ingest_document(file_path: str):
 
9
  import logging
10
  import time
11
  import re
12
+ import requests
13
+ import xml.etree.ElementTree as ET
14
 
15
  # Configure logger FIRST
16
  logger = logging.getLogger(__name__)
17
 
18
  CACHE_DIR = "./chroma_db"
19
 
20
INVIDIOUS_INSTANCES = [
    "https://inv.nadeko.net",
    "https://invidious.slipfox.xyz",
    "https://invidious.privacydev.net",
    "https://yt.artemislena.eu"
]


def _fetch_caption_index(video_id: str) -> tuple:
    """Return ``(instance, captions_json)`` from the first instance that answers."""
    from urllib.parse import quote

    safe_id = quote(video_id, safe="")
    last_error = None
    for instance in INVIDIOUS_INSTANCES:
        url = f"{instance}/api/v1/captions/{safe_id}"
        try:
            response = requests.get(url, timeout=10)
            if response.status_code != 200:
                last_error = f"HTTP {response.status_code}"
                continue
            data = response.json()
        except Exception as e:
            # Network failure or non-JSON body: fall through to the next instance.
            last_error = str(e)
            continue
        if isinstance(data, dict) and data:
            return instance, data
        last_error = "empty or malformed response"

    raise ValueError(
        "Could not reach any transcript service. "
        f"Last error: {last_error}. "
        "Please try again later or upload a PDF instead."
    )


def _select_caption_track(captions: list) -> dict:
    """Pick manual English, else any English, else the first listed track."""
    english = [cap for cap in captions if cap.get("languageCode", "") == "en"]
    for cap in english:
        if not cap.get("autoGenerated", False):
            return cap
    return english[0] if english else captions[0]


def _extract_caption_text(content: str) -> str:
    """Pull the spoken text out of an XML, WebVTT or SRT caption payload."""
    try:
        root = ET.fromstring(content)
    except ET.ParseError:
        pass
    else:
        # itertext() also yields text that follows nested tags (tail text).
        return " ".join(t.strip() for t in root.itertext() if t.strip())

    lines = content.splitlines()
    # A WebVTT file opens with a header block ("WEBVTT", "Kind: ...",
    # "Language: ...") that ends at the first blank line.
    in_header = bool(lines) and lines[0].lstrip("\ufeff").startswith("WEBVTT")
    text_parts = []
    for raw_line in lines:
        line = raw_line.strip()
        # Cue timings: SRT uses "," and WebVTT uses "." before the millis.
        is_timing = re.match(r'^[\d:.,]+\s+-->\s+[\d:.,]+', line) is not None
        if in_header:
            if not line or is_timing:
                in_header = False
            if not is_timing:
                continue
        if not line or is_timing:
            continue
        if re.match(r'^\d+$', line):
            # SRT cue counter.
            continue
        if line in ['WEBVTT', 'NOTE']:
            continue
        text_parts.append(line)
    return " ".join(text_parts)


def _clean_transcript(text: str) -> str:
    """Strip markup and bracketed annotations, decode entities, squeeze spaces."""
    import html

    text = re.sub(r'<[^>]+>', '', text)
    text = re.sub(r'\[.*?\]', '', text)
    text = html.unescape(text)
    return re.sub(r'\s+', ' ', text).strip()


def get_youtube_transcript(video_id: str) -> str:
    """Fetch a YouTube transcript through the public Invidious API.

    Each entry of ``INVIDIOUS_INSTANCES`` is tried in order until one returns
    a caption index. The best track is then downloaded from that same
    instance, parsed (XML, WebVTT or SRT) and cleaned into plain text.

    Args:
        video_id: Bare YouTube video identifier.

    Returns:
        The transcript as a single whitespace-normalised string.

    Raises:
        ValueError: If no instance responds, the video has no captions, the
            caption download fails, or the resulting text is shorter than
            50 characters.
    """
    from urllib.parse import quote, urlencode, urljoin

    # Step 1: Try each Invidious instance until one works
    instance, captions_data = _fetch_caption_index(video_id)

    # Step 2: Get list of caption tracks
    captions = captions_data.get("captions", [])
    if not captions:
        raise ValueError(
            "No captions available for this video. "
            "The creator may have disabled captions. "
            "Try a video with the CC button visible, "
            "or upload a PDF instead."
        )

    # Step 3: Pick best caption track in priority order
    selected = _select_caption_track(captions)

    # Step 4: Download the caption track from the instance that answered.
    caption_url = selected.get("url")
    if caption_url:
        # The track URL may be relative to the instance; urljoin leaves an
        # absolute URL untouched.
        caption_url = urljoin(f"{instance}/", caption_url)
    else:
        query = urlencode(
            {
                "label": selected.get("label", "English"),
                "lang": selected.get("languageCode", "en"),
            },
            quote_via=quote,
        )
        caption_url = (
            f"{instance}/api/v1/captions/{quote(video_id, safe='')}?{query}"
        )

    cap_response = requests.get(caption_url, timeout=15)
    if cap_response.status_code != 200:
        raise ValueError("Failed to download caption track.")

    # Step 5: Parse the caption format (XML/VTT/SRT)
    transcript_text = _extract_caption_text(cap_response.text)

    # Step 6: Clean the text
    transcript_text = _clean_transcript(transcript_text)

    if len(transcript_text) < 50:
        raise ValueError("Transcript is too short or empty. Try a different video.")

    return transcript_text
133
 
134
 
135
  def ingest_document(file_path: str):
frontend/youtube_transcript.html DELETED
@@ -1,137 +0,0 @@
1
- <!DOCTYPE html>
2
- <html>
3
-
4
- <head>
5
- <meta charset="utf-8">
6
- </head>
7
-
8
- <body>
9
- <div id="status" style="font-family: 'Inter', sans-serif; font-size: 14px; padding: 8px 0; color: #6B7280;"></div>
10
- <script>
11
- const BACKEND_URL = "http://localhost:8000";
12
- const statusEl = document.getElementById("status");
13
-
14
- function setStatus(msg, color) {
15
- statusEl.textContent = msg;
16
- statusEl.style.color = color || "#6B7280";
17
- }
18
-
19
- async function fetchAndIngest(videoId) {
20
- try {
21
- setStatus("⏳ Fetching transcript from YouTube...", "#2563EB");
22
-
23
- // Step 1: Fetch the YouTube page from the user's browser
24
- const response = await fetch(
25
- `https://www.youtube.com/watch?v=${videoId}`,
26
- { headers: { "Accept-Language": "en-US,en;q=0.9" } }
27
- );
28
-
29
- if (!response.ok) {
30
- throw new Error("Could not access this YouTube video. It may be private or region-restricted.");
31
- }
32
-
33
- const html = await response.text();
34
-
35
- // Step 2: Extract caption tracks
36
- const match = html.match(/"captionTracks":(\[.*?\])/);
37
- if (!match) {
38
- throw new Error("No captions available for this video. Try a video with the CC button visible.");
39
- }
40
-
41
- const tracks = JSON.parse(match[1]);
42
-
43
- // Step 3: Pick best caption track
44
- let selected = null;
45
- for (const t of tracks) {
46
- if (t.languageCode === 'en' && t.kind !== 'asr') { selected = t; break; }
47
- }
48
- if (!selected) {
49
- for (const t of tracks) {
50
- if (t.languageCode === 'en') { selected = t; break; }
51
- }
52
- }
53
- if (!selected) selected = tracks[0];
54
-
55
- if (!selected || !selected.baseUrl) {
56
- throw new Error("Could not find a usable caption track.");
57
- }
58
-
59
- setStatus("⏳ Downloading captions...", "#2563EB");
60
-
61
- // Step 4: Download captions
62
- const capResponse = await fetch(selected.baseUrl + "&fmt=json3");
63
- const capData = await capResponse.json();
64
-
65
- // Step 5: Parse transcript
66
- const events = capData.events || [];
67
- let parts = [];
68
- for (const event of events) {
69
- for (const seg of (event.segs || [])) {
70
- if (seg.utf8 && seg.utf8 !== '\n') {
71
- parts.push(seg.utf8);
72
- }
73
- }
74
- }
75
-
76
- let text = parts.join(' ')
77
- .replace(/\[.*?\]/g, '')
78
- .replace(/\s+/g, ' ')
79
- .trim();
80
-
81
- if (text.length < 50) {
82
- throw new Error("Transcript too short or empty. Try a video with more spoken content.");
83
- }
84
-
85
- setStatus(`⏳ Processing ${text.length} characters...`, "#2563EB");
86
-
87
- // Step 6: Send transcript directly to backend
88
- const ingestResponse = await fetch(`${BACKEND_URL}/ingest_text`, {
89
- method: "POST",
90
- headers: { "Content-Type": "application/json" },
91
- body: JSON.stringify({
92
- text: text,
93
- source_name: `YouTube: ${videoId}`,
94
- source_type: "youtube"
95
- })
96
- });
97
-
98
- if (!ingestResponse.ok) {
99
- const errData = await ingestResponse.json().catch(() => ({}));
100
- throw new Error(errData.detail || "Failed to process transcript in backend.");
101
- }
102
-
103
- setStatus("✅ YouTube transcript processed successfully!", "#16A34A");
104
-
105
- // Notify Streamlit parent to rerun
106
- window.parent.postMessage({
107
- type: "youtube_transcript",
108
- status: "success",
109
- videoId: videoId
110
- }, "*");
111
-
112
- } catch (err) {
113
- setStatus("❌ " + err.message, "#DC2626");
114
- window.parent.postMessage({
115
- type: "youtube_transcript",
116
- status: "error",
117
- message: err.message
118
- }, "*");
119
- }
120
- }
121
-
122
- // Listen for trigger from Streamlit
123
- window.addEventListener("message", function (event) {
124
- if (event.data && event.data.type === "fetch_youtube" && event.data.videoId) {
125
- fetchAndIngest(event.data.videoId);
126
- }
127
- });
128
-
129
- // Auto-start if video ID is embedded in the page
130
- const autoVideoId = document.body.getAttribute("data-video-id");
131
- if (autoVideoId) {
132
- fetchAndIngest(autoVideoId);
133
- }
134
- </script>
135
- </body>
136
-
137
- </html>