gabejavitt committed on
Commit
1b187a0
·
verified ·
1 Parent(s): 4dbb5ae

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -44
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import os
2
  import io
 
3
  import json
4
  import re
5
  import traceback
@@ -557,7 +558,7 @@ class YoutubeInput(BaseModel):
557
 
558
  @tool(args_schema=YoutubeInput)
559
  def get_youtube_transcript(video_url: str) -> str:
560
- """Fetches YouTube video transcript using official API."""
561
  if not video_url:
562
  return "Error: Invalid URL."
563
 
@@ -574,54 +575,78 @@ def get_youtube_transcript(video_url: str) -> str:
574
  if not video_id:
575
  return f"Error: Could not extract video ID."
576
 
577
- # Get API key
578
- YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
579
- if not YOUTUBE_API_KEY:
580
- return "Error: YOUTUBE_API_KEY not set in environment."
581
-
582
- # Build YouTube API client
583
- youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
584
-
585
- # Get captions list
586
- captions_response = youtube.captions().list(
587
- part='snippet',
588
- videoId=video_id
589
- ).execute()
590
-
591
- if not captions_response.get('items'):
592
- return "Error: No captions available for this video."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
593
 
594
- # Find English caption track
595
- caption_id = None
596
- for caption in captions_response['items']:
597
- if caption['snippet']['language'] == 'en':
598
- caption_id = caption['id']
599
- break
600
 
601
- if not caption_id:
602
- # Try first available caption
603
- caption_id = captions_response['items'][0]['id']
604
-
605
- # Download caption
606
- caption_download = youtube.captions().download(
607
- id=caption_id,
608
- tfmt='srt' # or 'vtt'
609
- ).execute()
610
-
611
- # Parse SRT format to plain text
612
- import re
613
- text_lines = []
614
- for line in caption_download.decode('utf-8').split('\n'):
615
- # Skip timestamp lines and sequence numbers
616
- if not re.match(r'^\d+$', line) and not re.match(r'\d{2}:\d{2}:\d{2}', line) and line.strip():
617
- text_lines.append(line.strip())
618
-
619
- full_transcript = " ".join(text_lines)
620
  return f"Transcript:\n{truncate_if_needed(full_transcript)}"
621
 
622
- except HttpError as e:
623
- return f"YouTube API error: {e}"
 
 
624
  except Exception as e:
 
 
625
  return f"Transcript error: {str(e)}"
626
 
627
 
 
1
  import os
2
  import io
3
+ import subprocess
4
  import json
5
  import re
6
  import traceback
 
558
 
559
  @tool(args_schema=YoutubeInput)
560
  def get_youtube_transcript(video_url: str) -> str:
561
+ """Fetches YouTube video transcript using yt-dlp."""
562
  if not video_url:
563
  return "Error: Invalid URL."
564
 
 
575
  if not video_id:
576
  return f"Error: Could not extract video ID."
577
 
578
+ # Use yt-dlp to get subtitles
579
+ subtitle_file = f'{video_id}.en.vtt'
580
+
581
+ cmd = [
582
+ 'yt-dlp',
583
+ '--skip-download',
584
+ '--write-auto-subs',
585
+ '--write-subs',
586
+ '--sub-lang', 'en',
587
+ '--sub-format', 'vtt',
588
+ '--output', video_id,
589
+ video_url
590
+ ]
591
+
592
+ print(f"πŸ”§ Running: {' '.join(cmd)}")
593
+ result = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
594
+
595
+ if result.returncode != 0:
596
+ print(f"⚠️ yt-dlp stderr: {result.stderr}")
597
+ return f"Error: Could not fetch subtitles - {result.stderr[:200]}"
598
+
599
+ # Try to find the subtitle file (might have different naming)
600
+ import glob
601
+ vtt_files = glob.glob(f"{video_id}*.vtt")
602
+
603
+ if not vtt_files:
604
+ return "Error: No English subtitles found for this video."
605
+
606
+ subtitle_file = vtt_files[0]
607
+ print(f"βœ“ Found subtitle file: {subtitle_file}")
608
+
609
+ # Read and parse VTT file
610
+ with open(subtitle_file, 'r', encoding='utf-8') as f:
611
+ content = f.read()
612
+
613
+ # Remove VTT headers and timestamps
614
+ lines = content.split('\n')
615
+ transcript_parts = []
616
+
617
+ for line in lines:
618
+ line = line.strip()
619
+ # Skip WEBVTT header, timestamps, and empty lines
620
+ if (line and
621
+ not line.startswith('WEBVTT') and
622
+ not '-->' in line and
623
+ not line.isdigit() and
624
+ not line.startswith('Kind:') and
625
+ not line.startswith('Language:')):
626
+ transcript_parts.append(line)
627
+
628
+ full_transcript = " ".join(transcript_parts)
629
+
630
+ # Cleanup subtitle files
631
+ for vtt_file in vtt_files:
632
+ try:
633
+ os.remove(vtt_file)
634
+ except:
635
+ pass
636
 
637
+ if not full_transcript:
638
+ return "Error: Transcript was empty."
 
 
 
 
639
 
640
+ print(f"βœ“ Transcript extracted: {len(full_transcript)} chars")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
641
  return f"Transcript:\n{truncate_if_needed(full_transcript)}"
642
 
643
+ except subprocess.TimeoutExpired:
644
+ return "Error: yt-dlp timed out after 45 seconds."
645
+ except FileNotFoundError:
646
+ return "Error: yt-dlp not installed. Add 'yt-dlp' to requirements.txt"
647
  except Exception as e:
648
+ print(f"❌ Error: {str(e)}")
649
+ print(traceback.format_exc())
650
  return f"Transcript error: {str(e)}"
651
 
652