File size: 4,832 Bytes
d303e2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
from langchain_core.tools import tool
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.document_loaders import ArxivLoader
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound # Added
import os

@tool
def wiki_search(query: str) -> dict:
    """Search Wikipedia for a query and return maximum 2 results.

    Args:
        query: The search query.

    Returns:
        A dict with key "wiki_results" mapping to the retrieved documents,
        each wrapped in <Document ...>...</Document> tags and separated by
        "---" dividers (empty string when nothing was found).
    """
    search_docs = WikipediaLoader(query=query, load_max_docs=2).load()
    formatted_search_docs = "\n\n---\n\n".join(
        # .get() avoids a KeyError when a loader result lacks "source"/"page"
        # metadata; the opening tag must not be self-closing ("/>") since a
        # matching </Document> follows.
        f'<Document source="{doc.metadata.get("source", "")}" page="{doc.metadata.get("page", "")}">\n{doc.page_content}\n</Document>'
        for doc in search_docs
    )
    return {"wiki_results": formatted_search_docs}


@tool
def web_search(query: str) -> dict:
    """Search Tavily for a query and return maximum 3 results.

    Args:
        query: The search query.

    Returns:
        A dict with key "web_results" mapping to the retrieved results,
        each wrapped in <Document ...>...</Document> tags and separated by
        "---" dividers (empty string when nothing was found).
    """
    search_docs = TavilySearchResults(max_results=3).invoke({"query": query})
    formatted_search_docs = "\n\n---\n\n".join(
        # Tavily results are plain dicts; fall back from "content" to
        # "snippet" since older result schemas used the latter.
        f'<Document source="{doc.get("url", "")}">\n{doc.get("content", doc.get("snippet", ""))}\n</Document>'
        for doc in search_docs
    )
    return {"web_results": formatted_search_docs}


@tool
def arxiv_search(query: str) -> dict:
    """Search Arxiv for a query and return maximum 3 results.

    Args:
        query: The search query.

    Returns:
        A dict with key "arxiv_results" mapping to the retrieved papers,
        each truncated to its first 1000 characters and wrapped in
        <Document ...>...</Document> tags separated by "---" dividers.
    """
    search_docs = ArxivLoader(query=query, load_max_docs=3).load()
    formatted_search_docs = "\n\n---\n\n".join(
        # Truncate each abstract/body to 1000 chars to keep tool output
        # small; the opening tag must not be self-closing ("/>") since a
        # matching </Document> follows.
        f'<Document source="{doc.metadata.get("source", "N/A")}" page="{doc.metadata.get("page", "")}">\n{doc.page_content[:1000]}\n</Document>'
        for doc in search_docs
    )
    return {"arxiv_results": formatted_search_docs}


def _extract_youtube_video_id(youtube_url: str) -> str | None:
    """Pull the video id out of a 'watch?v=...' or 'youtu.be/...' URL, or None."""
    if "watch?v=" in youtube_url:
        return youtube_url.split("watch?v=")[1].split("&")[0]
    if "youtu.be/" in youtube_url:
        return youtube_url.split("youtu.be/")[1].split("?")[0]
    return None


@tool
def get_youtube_transcript(youtube_url: str) -> dict | str:
    """Fetches the transcript for a given YouTube video URL using youtube-transcript-api directly.
       If the video has no transcript, it will return an error message. Then use web_search to find the transcript.

    Args:
        youtube_url: The URL of the YouTube video.

    Returns:
        {"youtube_transcript": <full transcript text>} on success, or a plain
        error string (usually prefixed with "Error") when the transcript
        cannot be fetched.
    """
    try:
        video_id = _extract_youtube_video_id(youtube_url)
        if not video_id:
            return "Error: Could not parse YouTube video ID from URL."

        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # Preference order: manual English, auto-generated English, any manual,
        # any auto-generated. Lambdas keep each lookup lazy so the broader
        # fallbacks are only evaluated when the earlier attempts fail.
        # NOTE(review): assumes transcript_list.languages yields the available
        # language codes — confirm against the installed
        # youtube-transcript-api version.
        attempts = (
            lambda: transcript_list.find_manually_created_transcript(['en']),
            lambda: transcript_list.find_generated_transcript(['en']),
            lambda: transcript_list.find_manually_created_transcript(transcript_list.languages),
            lambda: transcript_list.find_generated_transcript(transcript_list.languages),
        )
        transcript = None
        for attempt in attempts:
            try:
                transcript = attempt()
                break
            except NoTranscriptFound:
                continue
        if transcript is None:
            return "Error: No manual or auto-generated transcripts found for this video in any language."

        fetched_transcript = transcript.fetch()

        if not fetched_transcript:
            return "Could not retrieve transcript for the video. The video might not have transcripts available."

        # Transcript items are snippet objects (attribute access, not dicts).
        full_transcript = " ".join(item.text for item in fetched_transcript)

        return {"youtube_transcript": full_transcript}

    except TranscriptsDisabled:
        return "Error: Transcripts are disabled for this video."
    except NoTranscriptFound:
        return "Error: No transcripts found for this video (this should have been caught earlier, but good fallback)."
    except Exception as e:
        # Surface rate-limit / access problems distinctly from other failures.
        if "HTTP Error 403" in str(e) or "Too Many Requests" in str(e):
            return f"Error: YouTube API request failed, possibly due to rate limiting or access restrictions: {str(e)}"
        return f"Error fetching YouTube transcript using youtube-transcript-api: {str(e)}"