Spaces:
Sleeping
Sleeping
Update youtube_tool.py
Browse files- youtube_tool.py +8 -45
youtube_tool.py
CHANGED
|
@@ -14,46 +14,6 @@ def extract_video_id(url: str) -> str:
|
|
| 14 |
Returns:
|
| 15 |
str: The extracted video ID or raises ValueError.
|
| 16 |
"""
|
| 17 |
-
patterns = [
|
| 18 |
-
r"youtube\.com/watch\?v=([a-zA-Z0-9_-]{11})",
|
| 19 |
-
r"youtu\.be/([a-zA-Z0-9_-]{11})"
|
| 20 |
-
]
|
| 21 |
-
for pattern in patterns:
|
| 22 |
-
match = re.search(pattern, url)
|
| 23 |
-
if match:
|
| 24 |
-
return match.group(1)
|
| 25 |
-
raise ValueError("Invalid YouTube URL or unable to extract video ID.")
|
| 26 |
-
|
| 27 |
-
def get_youtube_transcript(url: str) -> str:
|
| 28 |
-
"""
|
| 29 |
-
Fetches the transcript text for a given YouTube video.
|
| 30 |
-
Args:
|
| 31 |
-
url (str): The YouTube video URL.
|
| 32 |
-
Returns:
|
| 33 |
-
str: Combined transcript text or an error message.
|
| 34 |
-
"""
|
| 35 |
-
try:
|
| 36 |
-
video_id = extract_video_id(url)
|
| 37 |
-
transcript_list = YouTubeTranscriptApi.get_transcript(video_id)
|
| 38 |
-
full_text = " ".join([entry["text"] for entry in transcript_list])
|
| 39 |
-
return full_text.strip()[:2000] # Truncate to 2000 chars to prevent token overflow
|
| 40 |
-
except TranscriptsDisabled:
|
| 41 |
-
return "This video has transcripts disabled."
|
| 42 |
-
except NoTranscriptFound:
|
| 43 |
-
return "No transcript was found for this video."
|
| 44 |
-
except Exception as e:
|
| 45 |
-
return f"Transcript error: {str(e)}"
|
| 46 |
-
|
| 47 |
-
youtube_tool = FunctionTool.from_defaults(get_youtube_transcript)
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
def extract_video_id(url: str) -> str:
|
| 51 |
-
"""
|
| 52 |
-
Handles typical YouTube URLs:
|
| 53 |
-
- https://www.youtube.com/watch?v=VIDEO_ID
|
| 54 |
-
- https://youtu.be/VIDEO_ID
|
| 55 |
-
- with extra query params
|
| 56 |
-
"""
|
| 57 |
parsed = urlparse(url)
|
| 58 |
if parsed.hostname in {"www.youtube.com", "youtube.com"}:
|
| 59 |
qs = parse_qs(parsed.query)
|
|
@@ -62,10 +22,14 @@ def extract_video_id(url: str) -> str:
|
|
| 62 |
# fallback for youtu.be or raw IDs
|
| 63 |
return parsed.path.lstrip("/")
|
| 64 |
|
|
|
|
| 65 |
def fetch_youtube_transcript(video_url: str) -> str:
|
| 66 |
"""
|
| 67 |
-
|
| 68 |
-
|
|
|
|
|
|
|
|
|
|
| 69 |
"""
|
| 70 |
video_id = extract_video_id(video_url)
|
| 71 |
|
|
@@ -77,9 +41,8 @@ def fetch_youtube_transcript(video_url: str) -> str:
|
|
| 77 |
)
|
| 78 |
|
| 79 |
#FROM TRANSCRIPT DATA, YOU CAN CREATE AN OBJECT OF TRANSCRIPT SNIPPET AND TIME
|
| 80 |
-
arr = [snippet.text for snippet in transcript_data]
|
| 81 |
-
return " ".join(arr)
|
| 82 |
-
#return " ".join(entry["text"] for entry in arr)
|
| 83 |
except Exception as e:
|
| 84 |
return f"Error fetching video details: {str(e)}"
|
| 85 |
|
|
|
|
| 14 |
Returns:
|
| 15 |
str: The extracted video ID or raises ValueError.
|
| 16 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
parsed = urlparse(url)
|
| 18 |
if parsed.hostname in {"www.youtube.com", "youtube.com"}:
|
| 19 |
qs = parse_qs(parsed.query)
|
|
|
|
| 22 |
# fallback for youtu.be or raw IDs
|
| 23 |
return parsed.path.lstrip("/")
|
| 24 |
|
| 25 |
+
|
| 26 |
def fetch_youtube_transcript(video_url: str) -> str:
|
| 27 |
"""
|
| 28 |
+
Fetches the transcript text for a given YouTube video.
|
| 29 |
+
Args:
|
| 30 |
+
video_url (str): The YouTube video URL.
|
| 31 |
+
Returns:
|
| 32 |
+
str: Combined transcript text or an error message.
|
| 33 |
"""
|
| 34 |
video_id = extract_video_id(video_url)
|
| 35 |
|
|
|
|
| 41 |
)
|
| 42 |
|
| 43 |
#FROM TRANSCRIPT DATA, YOU CAN CREATE AN OBJECT OF TRANSCRIPT SNIPPET AND TIME
|
| 44 |
+
arr = [ {"text": snippet.text} for snippet in transcript_data]
|
| 45 |
+
return " ".join(f"{entry['text']}" for entry in arr)
|
|
|
|
| 46 |
except Exception as e:
|
| 47 |
return f"Error fetching video details: {str(e)}"
|
| 48 |
|