Spaces:

gabejavitt
/

agentCourse

Sleeping

App Files Community

gabejavitt committed on Nov 5, 2025

Commit

1b187a0

verified ·

1 Parent(s): 4dbb5ae

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -44

app.py CHANGED Viewed

@@ -1,5 +1,6 @@
 import os
 import io
 import json
 import re
 import traceback
@@ -557,7 +558,7 @@ class YoutubeInput(BaseModel):
 @tool(args_schema=YoutubeInput)
 def get_youtube_transcript(video_url: str) -> str:
-    """Fetches YouTube video transcript using official API."""
     if not video_url:
         return "Error: Invalid URL."
@@ -574,54 +575,78 @@ def get_youtube_transcript(video_url: str) -> str:
         if not video_id:
             return f"Error: Could not extract video ID."
-        # Get API key
-        YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
-        if not YOUTUBE_API_KEY:
-            return "Error: YOUTUBE_API_KEY not set in environment."
-        # Build YouTube API client
-        youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
-        # Get captions list
-        captions_response = youtube.captions().list(
-            part='snippet',
-            videoId=video_id
-        ).execute()
-        if not captions_response.get('items'):
-            return "Error: No captions available for this video."
-        # Find English caption track
-        caption_id = None
-        for caption in captions_response['items']:
-            if caption['snippet']['language'] == 'en':
-                caption_id = caption['id']
-                break
-        if not caption_id:
-            # Try first available caption
-            caption_id = captions_response['items'][0]['id']
-        # Download caption
-        caption_download = youtube.captions().download(
-            id=caption_id,
-            tfmt='srt'  # or 'vtt'
-        ).execute()
-        # Parse SRT format to plain text
-        import re
-        text_lines = []
-        for line in caption_download.decode('utf-8').split('\n'):
-            # Skip timestamp lines and sequence numbers
-            if not re.match(r'^\d+$', line) and not re.match(r'\d{2}:\d{2}:\d{2}', line) and line.strip():
-                text_lines.append(line.strip())
-        full_transcript = " ".join(text_lines)
         return f"Transcript:\n{truncate_if_needed(full_transcript)}"
-    except HttpError as e:
-        return f"YouTube API error: {e}"
     except Exception as e:
         return f"Transcript error: {str(e)}"

 import os
 import io
+import subprocess
 import json
 import re
 import traceback
 @tool(args_schema=YoutubeInput)
 def get_youtube_transcript(video_url: str) -> str:
+    """Fetches YouTube video transcript using yt-dlp."""
     if not video_url:
         return "Error: Invalid URL."
         if not video_id:
             return f"Error: Could not extract video ID."
+        # Use yt-dlp to get subtitles
+        subtitle_file = f'{video_id}.en.vtt'
+        cmd = [
+            'yt-dlp',
+            '--skip-download',
+            '--write-auto-subs',
+            '--write-subs',
+            '--sub-lang', 'en',
+            '--sub-format', 'vtt',
+            '--output', video_id,
+            video_url
+        ]
+        print(f"🔧 Running: {' '.join(cmd)}")
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
+        if result.returncode != 0:
+            print(f"⚠️ yt-dlp stderr: {result.stderr}")
+            return f"Error: Could not fetch subtitles - {result.stderr[:200]}"
+        # Try to find the subtitle file (might have different naming)
+        import glob
+        vtt_files = glob.glob(f"{video_id}*.vtt")
+        if not vtt_files:
+            return "Error: No English subtitles found for this video."
+        subtitle_file = vtt_files[0]
+        print(f"✓ Found subtitle file: {subtitle_file}")
+        # Read and parse VTT file
+        with open(subtitle_file, 'r', encoding='utf-8') as f:
+            content = f.read()
+        # Remove VTT headers and timestamps
+        lines = content.split('\n')
+        transcript_parts = []
+        for line in lines:
+            line = line.strip()
+            # Skip WEBVTT header, timestamps, and empty lines
+            if (line and
+                not line.startswith('WEBVTT') and
+                not '-->' in line and
+                not line.isdigit() and
+                not line.startswith('Kind:') and
+                not line.startswith('Language:')):
+                transcript_parts.append(line)
+        full_transcript = " ".join(transcript_parts)
+        # Cleanup subtitle files
+        for vtt_file in vtt_files:
+            try:
+                os.remove(vtt_file)
+            except:
+                pass
+        if not full_transcript:
+            return "Error: Transcript was empty."
+        print(f"✓ Transcript extracted: {len(full_transcript)} chars")
         return f"Transcript:\n{truncate_if_needed(full_transcript)}"
+    except subprocess.TimeoutExpired:
+        return "Error: yt-dlp timed out after 45 seconds."
+    except FileNotFoundError:
+        return "Error: yt-dlp not installed. Add 'yt-dlp' to requirements.txt"
     except Exception as e:
+        print(f"❌ Error: {str(e)}")
+        print(traceback.format_exc())
         return f"Transcript error: {str(e)}"