Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
import io
|
|
|
|
| 3 |
import json
|
| 4 |
import re
|
| 5 |
import traceback
|
|
@@ -557,7 +558,7 @@ class YoutubeInput(BaseModel):
|
|
| 557 |
|
| 558 |
@tool(args_schema=YoutubeInput)
|
| 559 |
def get_youtube_transcript(video_url: str) -> str:
|
| 560 |
-
"""Fetches YouTube video transcript using
|
| 561 |
if not video_url:
|
| 562 |
return "Error: Invalid URL."
|
| 563 |
|
|
@@ -574,54 +575,78 @@ def get_youtube_transcript(video_url: str) -> str:
|
|
| 574 |
if not video_id:
|
| 575 |
return f"Error: Could not extract video ID."
|
| 576 |
|
| 577 |
-
#
|
| 578 |
-
|
| 579 |
-
|
| 580 |
-
|
| 581 |
-
|
| 582 |
-
|
| 583 |
-
|
| 584 |
-
|
| 585 |
-
|
| 586 |
-
|
| 587 |
-
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 593 |
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
for caption in captions_response['items']:
|
| 597 |
-
if caption['snippet']['language'] == 'en':
|
| 598 |
-
caption_id = caption['id']
|
| 599 |
-
break
|
| 600 |
|
| 601 |
-
|
| 602 |
-
# Try first available caption
|
| 603 |
-
caption_id = captions_response['items'][0]['id']
|
| 604 |
-
|
| 605 |
-
# Download caption
|
| 606 |
-
caption_download = youtube.captions().download(
|
| 607 |
-
id=caption_id,
|
| 608 |
-
tfmt='srt' # or 'vtt'
|
| 609 |
-
).execute()
|
| 610 |
-
|
| 611 |
-
# Parse SRT format to plain text
|
| 612 |
-
import re
|
| 613 |
-
text_lines = []
|
| 614 |
-
for line in caption_download.decode('utf-8').split('\n'):
|
| 615 |
-
# Skip timestamp lines and sequence numbers
|
| 616 |
-
if not re.match(r'^\d+$', line) and not re.match(r'\d{2}:\d{2}:\d{2}', line) and line.strip():
|
| 617 |
-
text_lines.append(line.strip())
|
| 618 |
-
|
| 619 |
-
full_transcript = " ".join(text_lines)
|
| 620 |
return f"Transcript:\n{truncate_if_needed(full_transcript)}"
|
| 621 |
|
| 622 |
-
except
|
| 623 |
-
return
|
|
|
|
|
|
|
| 624 |
except Exception as e:
|
|
|
|
|
|
|
| 625 |
return f"Transcript error: {str(e)}"
|
| 626 |
|
| 627 |
|
|
|
|
| 1 |
import os
|
| 2 |
import io
|
| 3 |
+
import subprocess
|
| 4 |
import json
|
| 5 |
import re
|
| 6 |
import traceback
|
|
|
|
| 558 |
|
| 559 |
@tool(args_schema=YoutubeInput)
|
| 560 |
def get_youtube_transcript(video_url: str) -> str:
|
| 561 |
+
"""Fetches YouTube video transcript using yt-dlp."""
|
| 562 |
if not video_url:
|
| 563 |
return "Error: Invalid URL."
|
| 564 |
|
|
|
|
| 575 |
if not video_id:
|
| 576 |
return f"Error: Could not extract video ID."
|
| 577 |
|
| 578 |
+
# Use yt-dlp to get subtitles
|
| 579 |
+
subtitle_file = f'{video_id}.en.vtt'
|
| 580 |
+
|
| 581 |
+
cmd = [
|
| 582 |
+
'yt-dlp',
|
| 583 |
+
'--skip-download',
|
| 584 |
+
'--write-auto-subs',
|
| 585 |
+
'--write-subs',
|
| 586 |
+
'--sub-lang', 'en',
|
| 587 |
+
'--sub-format', 'vtt',
|
| 588 |
+
'--output', video_id,
|
| 589 |
+
video_url
|
| 590 |
+
]
|
| 591 |
+
|
| 592 |
+
print(f"🔧 Running: {' '.join(cmd)}")
|
| 593 |
+
result = subprocess.run(cmd, capture_output=True, text=True, timeout=45)
|
| 594 |
+
|
| 595 |
+
if result.returncode != 0:
|
| 596 |
+
print(f"⚠️ yt-dlp stderr: {result.stderr}")
|
| 597 |
+
return f"Error: Could not fetch subtitles - {result.stderr[:200]}"
|
| 598 |
+
|
| 599 |
+
# Try to find the subtitle file (might have different naming)
|
| 600 |
+
import glob
|
| 601 |
+
vtt_files = glob.glob(f"{video_id}*.vtt")
|
| 602 |
+
|
| 603 |
+
if not vtt_files:
|
| 604 |
+
return "Error: No English subtitles found for this video."
|
| 605 |
+
|
| 606 |
+
subtitle_file = vtt_files[0]
|
| 607 |
+
print(f"✅ Found subtitle file: {subtitle_file}")
|
| 608 |
+
|
| 609 |
+
# Read and parse VTT file
|
| 610 |
+
with open(subtitle_file, 'r', encoding='utf-8') as f:
|
| 611 |
+
content = f.read()
|
| 612 |
+
|
| 613 |
+
# Remove VTT headers and timestamps
|
| 614 |
+
lines = content.split('\n')
|
| 615 |
+
transcript_parts = []
|
| 616 |
+
|
| 617 |
+
for line in lines:
|
| 618 |
+
line = line.strip()
|
| 619 |
+
# Skip WEBVTT header, timestamps, and empty lines
|
| 620 |
+
if (line and
|
| 621 |
+
not line.startswith('WEBVTT') and
|
| 622 |
+
not '-->' in line and
|
| 623 |
+
not line.isdigit() and
|
| 624 |
+
not line.startswith('Kind:') and
|
| 625 |
+
not line.startswith('Language:')):
|
| 626 |
+
transcript_parts.append(line)
|
| 627 |
+
|
| 628 |
+
full_transcript = " ".join(transcript_parts)
|
| 629 |
+
|
| 630 |
+
# Cleanup subtitle files
|
| 631 |
+
for vtt_file in vtt_files:
|
| 632 |
+
try:
|
| 633 |
+
os.remove(vtt_file)
|
| 634 |
+
except:
|
| 635 |
+
pass
|
| 636 |
|
| 637 |
+
if not full_transcript:
|
| 638 |
+
return "Error: Transcript was empty."
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
|
| 640 |
+
print(f"✅ Transcript extracted: {len(full_transcript)} chars")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
return f"Transcript:\n{truncate_if_needed(full_transcript)}"
|
| 642 |
|
| 643 |
+
except subprocess.TimeoutExpired:
|
| 644 |
+
return "Error: yt-dlp timed out after 45 seconds."
|
| 645 |
+
except FileNotFoundError:
|
| 646 |
+
return "Error: yt-dlp not installed. Add 'yt-dlp' to requirements.txt"
|
| 647 |
except Exception as e:
|
| 648 |
+
print(f"❌ Error: {str(e)}")
|
| 649 |
+
print(traceback.format_exc())
|
| 650 |
return f"Transcript error: {str(e)}"
|
| 651 |
|
| 652 |
|