Spaces:
Sleeping
Sleeping
File size: 2,667 Bytes
aea337a a44d34c aea337a 4793736 aea337a f8a91e9 aea337a f8a91e9 aea337a f8a91e9 aea337a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 |
from urllib.parse import parse_qs, urlparse
from llama_index.core.tools import FunctionTool
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound, VideoUnavailable
# Module-level YouTubeTranscriptApi instance; the transcript helpers in this
# file call yt_ap.fetch(...) on it.
# The YoutubeTranscriptReader loader below is commented out and unused.
#loader = YoutubeTranscriptReader()
yt_ap = YouTubeTranscriptApi()
def extract_video_id(url: str) -> str:
    """
    Extract the video ID from a YouTube URL.

    Recognised inputs:
      * ``https://www.youtube.com/watch?v=<id>`` on ``youtube.com`` or any of
        its subdomains (``www.``, ``m.``, ``music.``, ...)
      * path-style links such as ``/embed/<id>``, ``/shorts/<id>``,
        ``/live/<id>`` and ``/v/<id>`` (also on ``youtube-nocookie.com``)
      * short links: ``https://youtu.be/<id>``
      * the same URLs written without a scheme (``youtu.be/<id>``)
      * a raw video ID

    Args:
        url (str): The full YouTube video URL, or a raw video ID.

    Returns:
        str: The extracted video ID. The value is not validated; when no ID
        can be located the path component is returned as-is, which may be an
        empty string.
    """
    candidate = url.strip()
    parsed = urlparse(candidate)

    # urlparse() only recognises a host after "scheme://" (or a leading "//").
    # When the text before the first "/" looks like a domain name, re-parse
    # with a scheme so "youtu.be/<id>" and "www.youtube.com/watch?v=<id>" work.
    if not parsed.scheme and not parsed.netloc and "/" in candidate:
        head = candidate.split("/", 1)[0]
        if "." in head:
            parsed = urlparse("https://" + candidate)

    host = parsed.hostname or ""
    youtube_domains = ("youtube.com", "youtube-nocookie.com")
    is_youtube_host = any(
        host == domain or host.endswith("." + domain)
        for domain in youtube_domains
    )

    if is_youtube_host:
        query = parse_qs(parsed.query)
        if "v" in query:
            return query["v"][0]

        # Path-style links keep the ID in the segment after a known prefix.
        segments = [segment for segment in parsed.path.split("/") if segment]
        if len(segments) >= 2 and segments[0] in {"embed", "shorts", "live", "v"}:
            return segments[1]

    # Fallback for youtu.be links and raw IDs: the ID is the path itself.
    return parsed.path.strip("/")
def fetch_youtube_transcript(video_url: str) -> str:
    """
    Fetch the transcript text for a given YouTube video.

    Args:
        video_url (str): The YouTube video URL.

    Returns:
        str: The text of every English transcript snippet joined with single
        spaces, or a string starting with "Error fetching video details:"
        when anything goes wrong.
    """
    try:
        # Extract the ID inside the try so malformed input is also reported
        # as an error string instead of escaping as an exception.
        video_id = extract_video_id(video_url)
        # fetch() is called on the shared module-level yt_ap instance.
        transcript_data = yt_ap.fetch(
            video_id=video_id,
            # More languages can be listed here; yt_ap.list(video_id) shows
            # which ones are available for a video.
            languages=["en"],
        )
        # Each snippet also carries start/duration; only the text is used.
        return " ".join(snippet.text for snippet in transcript_data)
    except Exception as e:
        return f"Error fetching video details: {str(e)}"
def fetch_youtube_transcript_snippets(video_url: str) -> str:
    """
    Fetch YouTube transcript snippets, with timing, for the given URL.

    Args:
        video_url (str): The YouTube video URL.

    Returns:
        str: One entry per English transcript snippet, formatted as
        "Text: <text> Duration: <duration> StartTime: <start> <End>" and
        joined with single spaces. No end time is emitted; only the start
        and duration of each snippet are included. On failure, a string
        starting with "Error fetching video details:" is returned instead.
    """
    try:
        # Extract the ID inside the try so malformed input is also reported
        # as an error string instead of escaping as an exception.
        video_id = extract_video_id(video_url)
        # fetch() is called on the shared module-level yt_ap instance.
        transcript_data = yt_ap.fetch(
            video_id=video_id,
            # More languages can be listed here; yt_ap.list(video_id) shows
            # which ones are available for a video.
            languages=["en"],
        )
        return " ".join(
            f"Text: {snippet.text} Duration: {snippet.duration} StartTime: {snippet.start} <End>"
            for snippet in transcript_data
        )
    except Exception as e:
        return f"Error fetching video details: {str(e)}"
# Wrap both transcript helpers with FunctionTool.from_defaults.
youtube_transcript_tool = FunctionTool.from_defaults(fetch_youtube_transcript)
youtube_transcript_snippet_tool = FunctionTool.from_defaults(fetch_youtube_transcript_snippets)