Spaces:
Sleeping
Sleeping
| from smolagents import DuckDuckGoSearchTool, Tool, VisitWebpageTool | |
| from langchain_community.agent_toolkits.load_tools import load_tools | |
| import os | |
| from typing import List, Optional | |
| from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound | |
| from smolagents import tool | |
def youtube_transcript_search(
    video_id: str,
    query: str,
    top_k: Optional[int] = 5
) -> List[dict]:
    """
    Search a YouTube transcript for occurrences of a query string.

    The English manually created transcript is preferred; if the video has
    none, the English auto-generated transcript is used instead.

    Args:
        video_id (str): The YouTube video ID (after 'v=' in the URL).
        query (str): The search term to look for in the transcript (case-insensitive).
        top_k (Optional[int]): Maximum number of matches to return (default: 5).
            None means no limit; a value <= 0 yields no matches.

    Returns:
        List[dict]: A list of up to top_k matches, each a dict with:
            - 'time': float start time in seconds
            - 'text_snippet': snippet of transcript containing the query
        When no transcript is available, or nothing matches, the list holds a
        single placeholder dict with 'time' 0.0 and an explanatory
        'text_snippet'.
    """
    try:
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)
        # The find_* lookups signal "not available" by raising
        # NoTranscriptFound rather than returning a falsy value, so the
        # fallback must be an except clause -- an `or` chain never reaches it.
        try:
            transcript = transcript_list.find_manually_created_transcript(['en'])
        except NoTranscriptFound:
            transcript = transcript_list.find_generated_transcript(['en'])
        segments = transcript.fetch()
    except (TranscriptsDisabled, NoTranscriptFound) as e:
        return [{"time": 0.0, "text_snippet": f"No transcript found: {str(e)}"}]

    def _field(seg, name):
        # Segments are plain dicts in older youtube_transcript_api releases
        # and snippet objects with attributes in newer ones; accept both.
        return seg[name] if isinstance(seg, dict) else getattr(seg, name)

    # Search query in segments
    query_lower = query.lower()
    hits = []
    # top_k=None disables the cap; comparing len(hits) against None would
    # raise TypeError, so the limit is only checked when one was given.
    if top_k is None or top_k > 0:
        for seg in segments:
            text = _field(seg, "text")
            if query_lower in text.lower():
                hits.append({
                    "time": _field(seg, "start"),
                    "text_snippet": text
                })
                if top_k is not None and len(hits) >= top_k:
                    break

    if not hits:
        return [{"time": 0.0, "text_snippet": "No matches found."}]
    return hits
# Module-level tool instances, constructed once at import time.
duck_search_tool = DuckDuckGoSearchTool()
visit_web_page_tool = VisitWebpageTool()

# The first entry of the loaded LangChain "serpapi" tools is wrapped as a
# smolagents Tool.
# NOTE(review): the serpapi tool presumably needs an API key in the
# environment at import time -- confirm against deployment config.
_serpapi_langchain_tools = load_tools(["serpapi"])
google_search_tool = Tool.from_langchain(_serpapi_langchain_tools[0])