Spaces:

noodledom
/

focusflow

Sleeping

FocusFlow Assistant committed on Feb 28

Commit

068aa2f

1 Parent(s): dcf165e

feat: use Invidious API for YouTube transcripts

Bypass YouTube CORS/blocking by fetching transcripts through
the public Invidious API server-side.

Changes:
- Added get_youtube_transcript using Invidious to rag_engine.py
- Added /ingest_youtube endpoint to backend/main.py
- Refactored app.py URL parsing to hit backend directly
- Removed browser-side JS component and youtube_transcript.html

Files changed (4) hide show

app.py +20 -31
backend/main.py +35 -0
backend/rag_engine.py +113 -0
frontend/youtube_transcript.html +0 -137

app.py CHANGED Viewed

@@ -1137,7 +1137,7 @@ if not st.session_state.focus_mode:
                             st.error(f"Error: {e}")
-            # URL Input - YouTube uses browser-side fetch, web pages use server-side
             with st.expander("+ Add URL / YouTube"):
                 url_input = st.text_input("URL", placeholder="https://youtube.com/... or any webpage", label_visibility="collapsed")
                 if st.button("Process URL", use_container_width=True):
@@ -1154,8 +1154,25 @@ if not st.session_state.focus_mode:
                                 st.error("❌ Invalid YouTube URL format. Supported: youtube.com/watch?v=ID, youtu.be/ID, or youtube.com/shorts/ID")
                             else:
                                 video_id = vid_match.group(1)
-                                st.session_state["yt_processing"] = video_id
-                                st.rerun()
                         else:
                             # Non-YouTube URL: use server-side ingestion
                             with st.spinner("Fetching content..."):
@@ -1172,34 +1189,6 @@ if not st.session_state.focus_mode:
                                     st.error("⏱️ Request timed out. Please try again.")
                                 except Exception as e:
                                     st.error(f"Error: {str(e)}")
-                # Browser-side YouTube transcript fetch
-                if st.session_state.get("yt_processing"):
-                    video_id = st.session_state["yt_processing"]
-                    st.info(f"⏳ Fetching transcript for video `{video_id}` from your browser...")
-                    # Load and inject the YouTube transcript component with video ID
-                    try:
-                        import os as os_mod
-                        html_path = os_mod.path.join(os_mod.path.dirname(__file__), "frontend", "youtube_transcript.html")
-                        with open(html_path) as f:
-                            yt_html = f.read()
-                        # Inject the video ID so the component auto-starts
-                        yt_html = yt_html.replace(
-                            '<body>',
-                            f'<body data-video-id="{video_id}">'
-                        )
-                        components.html(yt_html, height=40)
-                        # Auto-clear after a delay (user can click Process URL again if needed)
-                        if st.button("🔄 Done / Refresh Sources", use_container_width=True):
-                            st.session_state.pop("yt_processing", None)
-                            st.rerun()
-                    except FileNotFoundError:
-                        st.error("YouTube component not found. Please check frontend/youtube_transcript.html exists.")
-                        st.session_state.pop("yt_processing", None)
 # --- FOCUS MODE UI ---
 if st.session_state.focus_mode:

                             st.error(f"Error: {e}")
+            # URL Input
             with st.expander("+ Add URL / YouTube"):
                 url_input = st.text_input("URL", placeholder="https://youtube.com/... or any webpage", label_visibility="collapsed")
                 if st.button("Process URL", use_container_width=True):
                                 st.error("❌ Invalid YouTube URL format. Supported: youtube.com/watch?v=ID, youtu.be/ID, or youtube.com/shorts/ID")
                             else:
                                 video_id = vid_match.group(1)
+                                with st.spinner("⏳ Fetching transcript via Invidious..."):
+                                    try:
+                                        resp = requests.post(f"{API_URL}/ingest_youtube", json={"video_id": video_id}, headers=get_headers(), timeout=120)
+                                        if resp.status_code == 200:
+                                            st.success("✅ YouTube transcript processed successfully!")
+                                            time.sleep(1)
+                                            st.rerun()
+                                        else:
+                                            error_detail = resp.json().get('detail', resp.text)
+                                            if "No captions available" in str(error_detail):
+                                                st.error("❌ No captions found. Try a video with CC enabled.")
+                                            elif "Could not reach any transcript" in str(error_detail):
+                                                st.error("⚠️ Transcript service unavailable. Try again later.")
+                                            else:
+                                                st.error(f"Failed: {error_detail}")
+                                    except requests.Timeout:
+                                        st.error("⏱️ Request timed out. Please try again.")
+                                    except Exception as e:
+                                        st.error(f"Error: {str(e)}")
                         else:
                             # Non-YouTube URL: use server-side ingestion
                             with st.spinner("Fetching content..."):
                                     st.error("⏱️ Request timed out. Please try again.")
                                 except Exception as e:
                                     st.error(f"Error: {str(e)}")
 # --- FOCUS MODE UI ---
 if st.session_state.focus_mode:

backend/main.py CHANGED Viewed

@@ -138,6 +138,41 @@ def ingest_text_endpoint(request: TextIngestionRequest, db: Session = Depends(ge
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 @app.get("/sources", response_model=List[SourceItem])
 def get_sources(db: Session = Depends(get_db)):
     sources = db.query(Source).filter(Source.is_active == True).all()

     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+class YouTubeIngestionRequest(BaseModel):
+    """Request body accepted by the POST /ingest_youtube endpoint."""
+    # Video identifier; forwarded unchanged to get_youtube_transcript and
+    # embedded in the stored source name ("YouTube: <video_id>").
+    video_id: str
+@app.post("/ingest_youtube")
+def ingest_youtube(request: YouTubeIngestionRequest, db: Session = Depends(get_db)):
+    """Fetch a YouTube transcript, ingest it, and record it as a source.
+
+    Args:
+        request: Body carrying the ``video_id`` to ingest.
+        db: Database session injected through ``get_db``.
+
+    Returns:
+        A dict with ``status``, ``message``, ``source`` and the new row's ``id``.
+
+    Raises:
+        HTTPException: 400 when a ``ValueError`` is raised while fetching or
+            ingesting the transcript; 500 for any other exception.
+    """
+    try:
+        # Imported at call time rather than at module import.
+        from backend.rag_engine import get_youtube_transcript, ingest_text
+        # Fetch transcript using Invidious
+        transcript_text = get_youtube_transcript(request.video_id)
+        # Run through existing ingestion pipeline
+        source_name = f"YouTube: {request.video_id}"
+        # The value returned by ingest_text is used as the stored filename below.
+        title = ingest_text(
+            text=transcript_text,
+            source_name=source_name,
+            source_type="youtube"
+        )
+        # Save to DB
+        # NOTE(review): ingest_text runs before the commit; if the commit fails,
+        # whatever ingest_text persisted is presumably not undone - confirm.
+        new_source = Source(filename=title, type="youtube", file_path=source_name, is_active=True)
+        db.add(new_source)
+        db.commit()
+        # Reload the row so the generated id is available for the response.
+        db.refresh(new_source)
+        return {"status": "success", "message": f"Successfully added: {title}", "source": source_name, "id": new_source.id}
+    except ValueError as e:
+        # ValueError messages are passed through verbatim as the 400 detail.
+        raise HTTPException(status_code=400, detail=str(e))
+    except Exception as e:
+        raise HTTPException(
+            status_code=500,
+            detail=f"Failed to process YouTube video: {str(e)}"
+        )
 @app.get("/sources", response_model=List[SourceItem])
 def get_sources(db: Session = Depends(get_db)):
     sources = db.query(Source).filter(Source.is_active == True).all()

backend/rag_engine.py CHANGED Viewed

@@ -9,14 +9,127 @@ from langchain_core.documents import Document
 import logging
 import time
 import re
 # Configure logger FIRST
 logger = logging.getLogger(__name__)
 CACHE_DIR = "./chroma_db"
 def ingest_document(file_path: str):

 import logging
 import time
 import re
+import requests
+import xml.etree.ElementTree as ET
 # Configure logger FIRST
 logger = logging.getLogger(__name__)
 CACHE_DIR = "./chroma_db"
+# Base URLs of Invidious servers queried by get_youtube_transcript.
+# They are tried in list order, so put the most reliable instance first.
+INVIDIOUS_INSTANCES = [
+    "https://inv.nadeko.net",
+    "https://invidious.slipfox.xyz",
+    "https://invidious.privacydev.net",
+    "https://yt.artemislena.eu"
+]
+def get_youtube_transcript(video_id: str) -> str:
+    """Fetch a YouTube video's captions through Invidious and return plain text.
+
+    Each entry of INVIDIOUS_INSTANCES is tried in order until one returns a
+    usable caption listing. The best track is then downloaded from that same
+    instance, parsed (XML, WebVTT or SRT) and cleaned.
+
+    Args:
+        video_id: YouTube video identifier. It is placed in a URL path, so
+            only letters, digits, ``-`` and ``_`` are accepted.
+
+    Returns:
+        The transcript as a single whitespace-normalised string.
+
+    Raises:
+        ValueError: If the ID is malformed, no instance answers, the video has
+            no captions, the track cannot be downloaded, or the resulting
+            text is shorter than 50 characters.
+    """
+    # Function-scope imports keep this block self-contained.
+    import html
+    from urllib.parse import urljoin
+    # Reject anything that could alter the request path or query string.
+    if not video_id or not re.fullmatch(r'[A-Za-z0-9_-]+', video_id):
+        raise ValueError("Invalid YouTube video ID.")
+    # Step 1: Try each Invidious instance until one works
+    captions_data = None
+    working_instance = None
+    last_error = None
+    for instance in INVIDIOUS_INSTANCES:
+        try:
+            response = requests.get(f"{instance}/api/v1/captions/{video_id}", timeout=10)
+            if response.status_code != 200:
+                last_error = f"HTTP {response.status_code}"
+                continue
+            payload = response.json()
+        except (requests.RequestException, ValueError) as e:
+            # ValueError covers a 200 response whose body is not valid JSON.
+            last_error = str(e)
+            logger.warning("Invidious instance %s failed: %s", instance, e)
+            continue
+        if isinstance(payload, dict) and payload:
+            captions_data = payload
+            working_instance = instance
+            break
+        last_error = "unexpected response body"
+    if not captions_data:
+        raise ValueError(
+            "Could not reach any transcript service. "
+            f"Last error: {last_error}. "
+            "Please try again later or upload a PDF instead."
+        )
+    # Step 2: Get list of caption tracks
+    captions = captions_data.get("captions", [])
+    if not captions:
+        raise ValueError(
+            "No captions available for this video. "
+            "The creator may have disabled captions. "
+            "Try a video with the CC button visible, "
+            "or upload a PDF instead."
+        )
+    # Step 3: Pick best caption track in priority order:
+    # manual "en", any "en", regional English ("en-GB", ...), then the first track.
+    english = [cap for cap in captions if cap.get("languageCode", "") == "en"]
+    selected = next((cap for cap in english if not cap.get("autoGenerated", False)), None)
+    if selected is None and english:
+        selected = english[0]
+    if selected is None:
+        selected = next(
+            (cap for cap in captions if cap.get("languageCode", "").startswith("en-")),
+            captions[0],
+        )
+    # Step 4: Download the caption track from the instance that answered
+    caption_url = selected.get("url")
+    params = None
+    if caption_url:
+        # The listing may give a path relative to the instance; make it absolute.
+        caption_url = urljoin(f"{working_instance}/", caption_url)
+    else:
+        caption_url = f"{working_instance}/api/v1/captions/{video_id}"
+        # Passing params lets requests URL-encode labels containing spaces.
+        params = {
+            "label": selected.get("label", "English"),
+            "lang": selected.get("languageCode", "en"),
+        }
+    try:
+        cap_response = requests.get(caption_url, params=params, timeout=15)
+    except requests.RequestException as e:
+        raise ValueError("Failed to download caption track.") from e
+    if cap_response.status_code != 200:
+        raise ValueError("Failed to download caption track.")
+    # Step 5: Parse the caption format (XML/VTT/SRT)
+    transcript_text = _caption_payload_to_text(cap_response.text)
+    # Step 6: Clean the text
+    transcript_text = re.sub(r'<[^>]+>', '', transcript_text)
+    transcript_text = re.sub(r'\[.*?\]', '', transcript_text)
+    # Decode every HTML entity, not just a hand-picked few.
+    transcript_text = html.unescape(transcript_text)
+    transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()
+    if len(transcript_text) < 50:
+        raise ValueError("Transcript is too short or empty. Try a different video.")
+    return transcript_text
+def _caption_payload_to_text(content: str) -> str:
+    """Return the spoken text of an XML, WebVTT or SRT caption payload."""
+    try:
+        root = ET.fromstring(content)
+    except ET.ParseError:
+        pass
+    else:
+        # itertext() also yields text that follows nested inline elements.
+        return " ".join(t.strip() for t in root.itertext() if t.strip())
+    lines = content.splitlines()
+    if lines and lines[0].lstrip('\ufeff').startswith('WEBVTT'):
+        # The WebVTT header block (e.g. "Kind:", "Language:") ends at the first blank line.
+        header_end = 0
+        while header_end < len(lines) and lines[header_end].strip():
+            header_end += 1
+        lines = lines[header_end:]
+    text_parts = []
+    for line in lines:
+        line = line.strip()
+        if not line:
+            continue
+        # SRT cue index
+        if re.match(r'^\d+$', line):
+            continue
+        # Cue timing line; SRT uses "," and WebVTT uses "." before the milliseconds.
+        if re.match(r'[\d:.,]+\s+-->\s+[\d:.,]+', line):
+            continue
+        if line in ('WEBVTT', 'NOTE'):
+            continue
+        text_parts.append(line)
+    return " ".join(text_parts)
 def ingest_document(file_path: str):

frontend/youtube_transcript.html DELETED Viewed

@@ -1,137 +0,0 @@
-<!DOCTYPE html>
-<html>
-<head>
-    <meta charset="utf-8">
-</head>
-<body>
-    <div id="status" style="font-family: 'Inter', sans-serif; font-size: 14px; padding: 8px 0; color: #6B7280;"></div>
-    <script>
-        const BACKEND_URL = "http://localhost:8000";
-        const statusEl = document.getElementById("status");
-        function setStatus(msg, color) {
-            statusEl.textContent = msg;
-            statusEl.style.color = color || "#6B7280";
-        }
-        async function fetchAndIngest(videoId) {
-            try {
-                setStatus("⏳ Fetching transcript from YouTube...", "#2563EB");
-                // Step 1: Fetch the YouTube page from the user's browser
-                const response = await fetch(
-                    `https://www.youtube.com/watch?v=${videoId}`,
-                    { headers: { "Accept-Language": "en-US,en;q=0.9" } }
-                );
-                if (!response.ok) {
-                    throw new Error("Could not access this YouTube video. It may be private or region-restricted.");
-                }
-                const html = await response.text();
-                // Step 2: Extract caption tracks
-                const match = html.match(/"captionTracks":(\[.*?\])/);
-                if (!match) {
-                    throw new Error("No captions available for this video. Try a video with the CC button visible.");
-                }
-                const tracks = JSON.parse(match[1]);
-                // Step 3: Pick best caption track
-                let selected = null;
-                for (const t of tracks) {
-                    if (t.languageCode === 'en' && t.kind !== 'asr') { selected = t; break; }
-                }
-                if (!selected) {
-                    for (const t of tracks) {
-                        if (t.languageCode === 'en') { selected = t; break; }
-                    }
-                }
-                if (!selected) selected = tracks[0];
-                if (!selected || !selected.baseUrl) {
-                    throw new Error("Could not find a usable caption track.");
-                }
-                setStatus("⏳ Downloading captions...", "#2563EB");
-                // Step 4: Download captions
-                const capResponse = await fetch(selected.baseUrl + "&fmt=json3");
-                const capData = await capResponse.json();
-                // Step 5: Parse transcript
-                const events = capData.events || [];
-                let parts = [];
-                for (const event of events) {
-                    for (const seg of (event.segs || [])) {
-                        if (seg.utf8 && seg.utf8 !== '\n') {
-                            parts.push(seg.utf8);
-                        }
-                    }
-                }
-                let text = parts.join(' ')
-                    .replace(/\[.*?\]/g, '')
-                    .replace(/\s+/g, ' ')
-                    .trim();
-                if (text.length < 50) {
-                    throw new Error("Transcript too short or empty. Try a video with more spoken content.");
-                }
-                setStatus(`⏳ Processing ${text.length} characters...`, "#2563EB");
-                // Step 6: Send transcript directly to backend
-                const ingestResponse = await fetch(`${BACKEND_URL}/ingest_text`, {
-                    method: "POST",
-                    headers: { "Content-Type": "application/json" },
-                    body: JSON.stringify({
-                        text: text,
-                        source_name: `YouTube: ${videoId}`,
-                        source_type: "youtube"
-                    })
-                });
-                if (!ingestResponse.ok) {
-                    const errData = await ingestResponse.json().catch(() => ({}));
-                    throw new Error(errData.detail || "Failed to process transcript in backend.");
-                }
-                setStatus("✅ YouTube transcript processed successfully!", "#16A34A");
-                // Notify Streamlit parent to rerun
-                window.parent.postMessage({
-                    type: "youtube_transcript",
-                    status: "success",
-                    videoId: videoId
-                }, "*");
-            } catch (err) {
-                setStatus("❌ " + err.message, "#DC2626");
-                window.parent.postMessage({
-                    type: "youtube_transcript",
-                    status: "error",
-                    message: err.message
-                }, "*");
-            }
-        }
-        // Listen for trigger from Streamlit
-        window.addEventListener("message", function (event) {
-            if (event.data && event.data.type === "fetch_youtube" && event.data.videoId) {
-                fetchAndIngest(event.data.videoId);
-            }
-        });
-        // Auto-start if video ID is embedded in the page
-        const autoVideoId = document.body.getAttribute("data-video-id");
-        if (autoVideoId) {
-            fetchAndIngest(autoVideoId);
-        }
-    </script>
-</body>
-</html>