gabejavitt committed on
Commit
d63a5de
·
verified ·
1 Parent(s): ed23d35

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +50 -8
app.py CHANGED
@@ -23,6 +23,8 @@ from bs4 import BeautifulSoup
23
  import requests
24
  from PIL import Image
25
  import base64
 
 
26
 
27
  # LangChain & LangGraph
28
  from langgraph.graph.message import add_messages
@@ -555,13 +557,14 @@ class YoutubeInput(BaseModel):
555
 
556
  @tool(args_schema=YoutubeInput)
557
  def get_youtube_transcript(video_url: str) -> str:
558
- """Fetches YouTube video transcript."""
559
  if not video_url:
560
  return "Error: Invalid URL."
561
 
562
  print(f"📺 YouTube transcript: {video_url}")
563
 
564
  try:
 
565
  video_id = None
566
  if "watch?v=" in video_url:
567
  video_id = video_url.split("v=")[1].split("&")[0]
@@ -570,15 +573,54 @@ def get_youtube_transcript(video_url: str) -> str:
570
 
571
  if not video_id:
572
  return f"Error: Could not extract video ID."
573
-
574
- # FIXED: Use get_transcript instead of list_transcripts
575
- transcript_list = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
576
 
577
- if not transcript_list:
578
- return "Error: No transcript found."
579
-
580
- full_transcript = " ".join([item["text"] for item in transcript_list])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
581
  return f"Transcript:\n{truncate_if_needed(full_transcript)}"
 
 
 
582
  except Exception as e:
583
  return f"Transcript error: {str(e)}"
584
 
 
23
  import requests
24
  from PIL import Image
25
  import base64
26
+ from googleapiclient.discovery import build
27
+ from googleapiclient.errors import HttpError
28
 
29
  # LangChain & LangGraph
30
  from langgraph.graph.message import add_messages
 
557
 
558
  @tool(args_schema=YoutubeInput)
559
  def get_youtube_transcript(video_url: str) -> str:
560
+ """Fetches YouTube video transcript using official API."""
561
  if not video_url:
562
  return "Error: Invalid URL."
563
 
564
  print(f"📺 YouTube transcript: {video_url}")
565
 
566
  try:
567
+ # Extract video ID
568
  video_id = None
569
  if "watch?v=" in video_url:
570
  video_id = video_url.split("v=")[1].split("&")[0]
 
573
 
574
  if not video_id:
575
  return f"Error: Could not extract video ID."
 
 
 
576
 
577
+ # Get API key
578
+ YOUTUBE_API_KEY = os.getenv("YOUTUBE_API_KEY")
579
+ if not YOUTUBE_API_KEY:
580
+ return "Error: YOUTUBE_API_KEY not set in environment."
581
+
582
+ # Build YouTube API client
583
+ youtube = build('youtube', 'v3', developerKey=YOUTUBE_API_KEY)
584
+
585
+ # Get captions list
586
+ captions_response = youtube.captions().list(
587
+ part='snippet',
588
+ videoId=video_id
589
+ ).execute()
590
+
591
+ if not captions_response.get('items'):
592
+ return "Error: No captions available for this video."
593
+
594
+ # Find English caption track
595
+ caption_id = None
596
+ for caption in captions_response['items']:
597
+ if caption['snippet']['language'] == 'en':
598
+ caption_id = caption['id']
599
+ break
600
+
601
+ if not caption_id:
602
+ # Try first available caption
603
+ caption_id = captions_response['items'][0]['id']
604
+
605
+ # Download caption
606
+ caption_download = youtube.captions().download(
607
+ id=caption_id,
608
+ tfmt='srt' # or 'vtt'
609
+ ).execute()
610
+
611
+ # Parse SRT format to plain text
612
+ import re
613
+ text_lines = []
614
+ for line in caption_download.decode('utf-8').split('\n'):
615
+ # Skip timestamp lines and sequence numbers
616
+ if not re.match(r'^\d+$', line) and not re.match(r'\d{2}:\d{2}:\d{2}', line) and line.strip():
617
+ text_lines.append(line.strip())
618
+
619
+ full_transcript = " ".join(text_lines)
620
  return f"Transcript:\n{truncate_if_needed(full_transcript)}"
621
+
622
+ except HttpError as e:
623
+ return f"YouTube API error: {e}"
624
  except Exception as e:
625
  return f"Transcript error: {str(e)}"
626