FocusFlow Assistant committed on
Commit
cecb03b
·
1 Parent(s): f723af8

Replace YouTube fetch with timedtext API - no API keys needed, works from any server

Browse files
Files changed (2) hide show
  1. app.py +1 -7
  2. backend/rag_engine.py +119 -212
app.py CHANGED
@@ -1149,13 +1149,7 @@ if not st.session_state.focus_mode:
1149
  resp = requests.post(f"{API_URL}/ingest_url", json={"url": url_input}, headers=get_headers(), timeout=120)
1150
  if resp.status_code == 200:
1151
  data = resp.json()
1152
- message = data.get('message', 'Success')
1153
- # Show caption type info if available
1154
- if "auto-generated" in message:
1155
- st.success(f"✅ {message}")
1156
- st.info("ℹ️ Transcript extracted using auto-generated captions. Quality may vary — auto-captions can have errors.")
1157
- else:
1158
- st.success(f"✅ {message}")
1159
  time.sleep(1)
1160
  st.rerun()
1161
  else:
 
1149
  resp = requests.post(f"{API_URL}/ingest_url", json={"url": url_input}, headers=get_headers(), timeout=120)
1150
  if resp.status_code == 200:
1151
  data = resp.json()
1152
+ st.success(f"✅ YouTube transcript extracted successfully.")
 
 
 
 
 
 
1153
  time.sleep(1)
1154
  st.rerun()
1155
  else:
backend/rag_engine.py CHANGED
@@ -1,9 +1,11 @@
1
  import os
 
 
2
  from langchain_community.document_loaders import PyPDFLoader
3
  from langchain_text_splitters import RecursiveCharacterTextSplitter
4
  from langchain_chroma import Chroma
5
  from langchain_community.llms import Ollama
6
- from backend.config import get_llm, get_embeddings, has_youtube_api_key, YOUTUBE_API_KEY
7
  from langchain_core.documents import Document
8
  import logging
9
  import time
@@ -12,206 +14,125 @@ import re
12
  # Configure logger FIRST
13
  logger = logging.getLogger(__name__)
14
 
15
- # YouTube transcript support
16
- try:
17
- from youtube_transcript_api import YouTubeTranscriptApi
18
- from youtube_transcript_api import TranscriptsDisabled, NoTranscriptFound, InvalidVideoId
19
- HAS_YOUTUBE_API = True
20
- except ImportError:
21
- HAS_YOUTUBE_API = False
22
- logger.warning("youtube-transcript-api not installed - YouTube local fallback will not work")
23
-
24
  CACHE_DIR = "./chroma_db"
25
 
26
 
27
- def _parse_srt_to_text(srt_content: str) -> str:
28
- """Parse SRT subtitle format to plain text."""
29
- lines = srt_content.split('\n')
30
- text_lines = []
31
- for line in lines:
32
- line = line.strip()
33
- # Skip empty lines, sequence numbers, and timestamp lines
34
- if not line:
35
- continue
36
- if line.isdigit():
37
- continue
38
- if re.match(r'\d{2}:\d{2}:\d{2}', line):
39
- continue
40
- text_lines.append(line)
41
- return ' '.join(text_lines)
42
-
43
-
44
- def _fetch_youtube_transcript(video_id: str) -> tuple:
45
  """
46
- Fetch YouTube transcript using the best available method.
47
- Returns (transcript_text, caption_type) tuple.
48
-
49
- Method A: YouTube Data API v3 (when YOUTUBE_API_KEY is set - reliable in cloud)
50
- Method B: youtube-transcript-api fallback (local mode)
51
  """
52
-
53
- # --- METHOD A: YouTube Data API v3 (cloud-reliable) ---
54
- if has_youtube_api_key():
55
- logger.info("Using YouTube Data API v3 (API key found)")
56
- try:
57
- from googleapiclient.discovery import build
58
- from googleapiclient.errors import HttpError
59
-
60
- youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
61
-
62
- # Get available caption tracks
63
- captions_response = youtube.captions().list(
64
- part='snippet',
65
- videoId=video_id
66
- ).execute()
67
-
68
- caption_items = captions_response.get('items', [])
69
-
70
- if not caption_items:
71
- raise ValueError(
72
- "No captions found for this video. "
73
- "The video may have captions disabled by the creator."
74
- )
75
-
76
- # Select best caption track by priority
77
- selected_track = None
78
- caption_type = "manual"
79
-
80
- # Priority 1: Manual English
81
- for item in caption_items:
82
- snippet = item['snippet']
83
- if snippet.get('language') == 'en' and snippet.get('trackKind') == 'standard':
84
- selected_track = item
85
- caption_type = "manual"
86
- break
87
-
88
- # Priority 2: Auto-generated English
89
- if not selected_track:
90
- for item in caption_items:
91
- snippet = item['snippet']
92
- if snippet.get('language') == 'en' and snippet.get('trackKind') == 'asr':
93
- selected_track = item
94
- caption_type = "auto-generated"
95
- break
96
-
97
- # Priority 3: Any manual caption
98
- if not selected_track:
99
- for item in caption_items:
100
- snippet = item['snippet']
101
- if snippet.get('trackKind') == 'standard':
102
- selected_track = item
103
- caption_type = "manual"
104
- break
105
-
106
- # Priority 4: Any auto-generated caption
107
- if not selected_track:
108
- for item in caption_items:
109
- selected_track = item
110
- caption_type = "auto-generated"
111
- break
112
-
113
- if not selected_track:
114
- raise ValueError(
115
- "No captions found for this video. "
116
- "The video may have captions disabled by the creator."
117
- )
118
-
119
- logger.info(f"Selected caption track: {selected_track['snippet'].get('language')} ({caption_type})")
120
-
121
- # Download the caption track
122
- caption_id = selected_track['id']
123
- subtitle_response = youtube.captions().download(
124
- id=caption_id,
125
- tfmt='srt'
126
- ).execute()
127
-
128
- # Parse SRT to plain text
129
- srt_text = subtitle_response.decode('utf-8') if isinstance(subtitle_response, bytes) else str(subtitle_response)
130
- transcript_text = _parse_srt_to_text(srt_text)
131
-
132
- if not transcript_text or len(transcript_text) < 50:
133
- raise ValueError(
134
- "Transcript is too short or empty. "
135
- "Try a different video with more spoken content."
136
- )
137
-
138
- logger.info(f"YouTube Data API: extracted {len(transcript_text)} chars ({caption_type})")
139
- return transcript_text, caption_type
140
-
141
- except HttpError as e:
142
- if e.resp.status == 403:
143
- if 'quotaExceeded' in str(e):
144
- raise ValueError(
145
- "YouTube API quota exceeded. "
146
- "Please try again later or upload a PDF instead."
147
- )
148
- # captions().download requires OAuth for third-party videos
149
- # Fall through to Method B
150
- logger.warning(f"YouTube Data API forbidden (likely needs OAuth for captions download): {e}")
151
- logger.info("Falling back to youtube-transcript-api...")
152
- elif e.resp.status == 404:
153
- raise ValueError(
154
- "Could not access this video. "
155
- "It may be private, deleted, or region-restricted."
156
- )
157
- else:
158
- logger.error(f"YouTube Data API error: {e}")
159
- logger.info("Falling back to youtube-transcript-api...")
160
- except ValueError:
161
- raise
162
- except Exception as e:
163
- logger.error(f"YouTube Data API unexpected error: {e}")
164
- logger.info("Falling back to youtube-transcript-api...")
165
-
166
- # --- METHOD B: youtube-transcript-api fallback (local / API fallback) ---
167
- if not HAS_YOUTUBE_API:
168
  raise ValueError(
169
- "YouTube transcript libraries not available. "
170
- "Please upload a PDF instead."
171
  )
172
-
173
- logger.info("Using youtube-transcript-api (local fallback)")
174
- ytt = YouTubeTranscriptApi()
175
-
 
 
 
 
 
 
 
176
  try:
177
- # PRIORITY 1: Try manual English captions
178
- logger.info("Trying manual English captions...")
179
- transcript = ytt.fetch(video_id, languages=['en'])
180
- transcript_text = ' '.join([t.text for t in transcript])
181
- logger.info(f"Got manual English transcript ({len(transcript_text)} chars)")
182
- return transcript_text, "manual"
183
- except Exception as e1:
184
- logger.info(f"Manual English not available: {e1}")
185
- try:
186
- # PRIORITY 2: Try any available transcript (includes auto-generated)
187
- logger.info("Trying any available transcript...")
188
- transcript_list = ytt.list(video_id)
189
- first_available = next(iter(transcript_list))
190
- transcript = first_available.fetch()
191
- transcript_text = ' '.join([t.text for t in transcript])
192
- logger.info(f"Got fallback transcript ({len(transcript_text)} chars)")
193
- return transcript_text, "auto-generated"
194
- except (TranscriptsDisabled, NoTranscriptFound):
195
- raise ValueError(
196
- "No captions found for this video. "
197
- "This video may have captions disabled entirely. "
198
- "Try a different video or upload a PDF instead."
199
- )
200
- except InvalidVideoId:
201
- raise ValueError(f"Invalid YouTube video ID: {video_id}. Please check the URL.")
202
- except StopIteration:
203
- raise ValueError(
204
- "No captions found for this video. "
205
- "This video may have captions disabled entirely. "
206
- "Try a different video or upload a PDF instead."
207
- )
208
- except Exception as e2:
209
- logger.error(f"All transcript fetch attempts failed: {e2}")
210
- raise ValueError(
211
- "No captions found for this video. "
212
- "This video may have captions disabled entirely. "
213
- "Try a different video or upload a PDF instead."
214
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
 
217
 
@@ -302,13 +223,10 @@ def ingest_document(file_path: str):
302
  def ingest_url(url: str):
303
  """
304
  Ingests content from a URL (YouTube or Web).
305
- Uses YouTube Data API v3 in cloud mode, youtube-transcript-api locally.
306
  """
307
  from langchain_community.document_loaders import WebBaseLoader
308
 
309
- if not HAS_YOUTUBE_API and not has_youtube_api_key() and ("youtube.com" in url or "youtu.be" in url):
310
- raise ValueError("YouTube support not available - no API key or transcript library found")
311
-
312
  docs = []
313
  title = url
314
 
@@ -331,18 +249,8 @@ def ingest_url(url: str):
331
 
332
  logger.info(f"Extracted video ID: {video_id}")
333
 
334
- # Fetch transcript using the appropriate method
335
- transcript_text, caption_type = _fetch_youtube_transcript(video_id)
336
-
337
- # Clean up transcript text
338
- transcript_text = re.sub(r'\[.*?\]', '', transcript_text) # Remove [Music], [Applause], etc.
339
- transcript_text = re.sub(r'\s+', ' ', transcript_text).strip() # Normalize whitespace
340
-
341
- if len(transcript_text) < 50:
342
- raise ValueError(
343
- "Transcript is too short or empty after cleanup. "
344
- "Try a different video with more spoken content."
345
- )
346
 
347
  # Create a document from the transcript
348
  docs = [Document(
@@ -350,11 +258,10 @@ def ingest_url(url: str):
350
  metadata={
351
  "source": url,
352
  "title": f"YouTube: {video_id}",
353
- "type": "youtube",
354
- "caption_type": caption_type
355
  }
356
  )]
357
- title = f"YouTube Video: {video_id} ({caption_type} captions)"
358
 
359
  else:
360
  # Regular web page
 
1
  import os
2
+ import json
3
+ import requests as http_requests
4
  from langchain_community.document_loaders import PyPDFLoader
5
  from langchain_text_splitters import RecursiveCharacterTextSplitter
6
  from langchain_chroma import Chroma
7
  from langchain_community.llms import Ollama
8
+ from backend.config import get_llm, get_embeddings
9
  from langchain_core.documents import Document
10
  import logging
11
  import time
 
14
  # Configure logger FIRST
15
  logger = logging.getLogger(__name__)
16
 
 
 
 
 
 
 
 
 
 
17
  CACHE_DIR = "./chroma_db"
18
 
19
 
20
def _fetch_youtube_transcript(video_id: str) -> str:
    """
    Fetch a YouTube transcript via the public timedtext endpoint.
    Works from any server — no API keys or OAuth required.

    Args:
        video_id: The YouTube video ID (the ``v=`` URL parameter).

    Returns:
        Cleaned transcript text (bracketed cues removed, whitespace
        normalized), guaranteed to be at least 50 characters long.

    Raises:
        ValueError: If the video is inaccessible, has no captions, the
            caption data cannot be parsed, a network request fails, or
            the resulting transcript is too short to be useful.
    """
    # Step 1: Fetch the YouTube watch page to discover caption tracks.
    page_url = f"https://www.youtube.com/watch?v={video_id}"
    # Browser-like headers: presumably YouTube serves a reduced page to
    # clients that look like bots — TODO confirm which headers are required.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                      "AppleWebKit/537.36 (KHTML, like Gecko) "
                      "Chrome/120.0.0.0 Safari/537.36",
        "Accept-Language": "en-US,en;q=0.9"
    }

    try:
        response = http_requests.get(page_url, headers=headers, timeout=30)
    except http_requests.RequestException as e:
        # Preserve the function's error contract: callers handle ValueError,
        # so network failures must not leak raw requests exceptions.
        raise ValueError(f"Network error while contacting YouTube: {e}") from e

    if response.status_code != 200:
        raise ValueError(
            "Could not access this YouTube video. "
            "It may be private or region-restricted."
        )

    # Step 2: Extract captionTracks from the embedded player JSON.
    # NOTE(review): the non-greedy match assumes no literal ']' appears
    # inside the array's string values — holds for YouTube's current
    # page format, but is inherently fragile against markup changes.
    match = re.search(r'"captionTracks":(\[.*?\])', response.text)

    if not match:
        raise ValueError(
            "No captions available for this video. "
            "The creator may have disabled captions. "
            "Try a different video or upload a PDF instead."
        )

    try:
        caption_tracks = json.loads(match.group(1))
    except json.JSONDecodeError:
        raise ValueError(
            "Failed to parse caption data. "
            "Try a different video or upload a PDF instead."
        )

    if not caption_tracks:
        raise ValueError(
            "No captions available for this video. "
            "The creator may have disabled captions. "
            "Try a different video or upload a PDF instead."
        )

    # Step 3: Pick the best caption track by priority:
    #   1) manual English  2) any English (incl. auto-generated 'asr')
    #   3) any available track.
    selected = None

    # Priority 1: Manual English captions ('asr' marks auto-generated).
    for track in caption_tracks:
        if track.get('languageCode', '') == 'en' and track.get('kind', '') != 'asr':
            selected = track
            break

    # Priority 2: Auto-generated English captions.
    if not selected:
        for track in caption_tracks:
            if track.get('languageCode', '') == 'en':
                selected = track
                break

    # Priority 3: Any available track.
    if not selected:
        selected = caption_tracks[0]

    logger.info(f"Selected caption track: {selected.get('languageCode')} (kind={selected.get('kind', 'standard')})")

    # Step 4: Download the selected caption track in JSON3 format.
    caption_url = selected.get('baseUrl')
    if not caption_url:
        raise ValueError("Could not retrieve caption URL.")

    try:
        caption_response = http_requests.get(
            caption_url + "&fmt=json3",
            headers=headers,
            timeout=30
        )
    except http_requests.RequestException as e:
        raise ValueError(f"Network error while downloading captions: {e}") from e

    if caption_response.status_code != 200:
        raise ValueError("Failed to download captions.")

    # Step 5: Parse the JSON3 caption format.
    # Catch ValueError (json.JSONDecodeError's base class) so both the
    # stdlib and requests' own decode-error variants are handled across
    # requests versions.
    try:
        caption_data = caption_response.json()
    except ValueError:
        raise ValueError("Failed to parse caption data.")

    # JSON3 payload: a list of timed 'events', each holding text 'segs'.
    text_parts = []
    for event in caption_data.get('events', []):
        for seg in event.get('segs', []):
            utf8 = seg.get('utf8', '')
            # Skip bare newline segments; real text is joined with spaces.
            if utf8 and utf8 != '\n':
                text_parts.append(utf8)

    transcript_text = ' '.join(text_parts)

    # Step 6: Clean the text — drop cues like [Music] / [Applause] and
    # collapse runs of whitespace into single spaces.
    transcript_text = re.sub(r'\[.*?\]', '', transcript_text)
    transcript_text = re.sub(r'\s+', ' ', transcript_text).strip()

    if len(transcript_text) < 50:
        raise ValueError(
            "Transcript is too short or empty. "
            "Try a different video with more spoken content."
        )

    logger.info(f"Timedtext API: extracted {len(transcript_text)} chars")
    return transcript_text
 
137
 
138
 
 
223
  def ingest_url(url: str):
224
  """
225
  Ingests content from a URL (YouTube or Web).
226
+ Uses YouTube's timedtext API for transcripts — no API keys needed.
227
  """
228
  from langchain_community.document_loaders import WebBaseLoader
229
 
 
 
 
230
  docs = []
231
  title = url
232
 
 
249
 
250
  logger.info(f"Extracted video ID: {video_id}")
251
 
252
+ # Fetch transcript using timedtext API
253
+ transcript_text = _fetch_youtube_transcript(video_id)
 
 
 
 
 
 
 
 
 
 
254
 
255
  # Create a document from the transcript
256
  docs = [Document(
 
258
  metadata={
259
  "source": url,
260
  "title": f"YouTube: {video_id}",
261
+ "type": "youtube"
 
262
  }
263
  )]
264
+ title = f"YouTube Video: {video_id}"
265
 
266
  else:
267
  # Regular web page