"""Enhanced tools for sports data queries and complex multi-step searches This module provides specialized tools for: - Sports statistics queries with multi-step search - Multi-step verification search - Video frame extraction for visual analysis """ import os import re from typing import List, Dict, Any from smolagents import tool @tool def sports_data_search(query: str, data_type: str = "player_stats", year: str = "", team: str = "", player: str = "", stat_category: str = "") -> str: """Specialized search for sports statistics with multi-step verification. Optimized for baseball/sports data queries that require: - Finding a specific player based on statistics - Cross-referencing multiple data points - Historical data lookup - Team and player career statistics Args: query: The main search query (e.g., "most walks 1977 New York Yankees") data_type: Type of data - "player_stats", "team_stats", "game_log", "roster", "career_stats" year: Specific season year (e.g., "1977") team: Team name (e.g., "New York Yankees", "Yankees", "NYY") player: Player name for specific player lookup stat_category: Specific stat category - "batting", "pitching", "fielding", "walks", "at_bats", etc. Returns: Comprehensive search results with player statistics and cross-references. """ try: from duckduckgo_search import DDGS results_summary = [] # Build targeted search queries based on inputs search_queries = [] # Query 1: Direct statistics search with all context if player and year: search_queries.append(f"{player} {year} {stat_category} statistics baseball-reference") elif team and year and stat_category: search_queries.append(f"{team} {year} {stat_category} leader baseball-reference") elif team and year: search_queries.append(f"{team} {year} statistics baseball-reference") else: search_queries.append(query) # Query 2: Baseball-Reference specific if year and team: team_normalized = team.lower().replace("new york ", "").replace(" ", "") search_queries.append(f"baseball-reference.com {year} {team_normalized} batting") # Query 3: Stat-specific search if stat_category and year: search_queries.append(f"{year} MLB {stat_category} leaders baseball-reference") # Query 4: Player career stats if player and not year: search_queries.append(f"{player} career statistics baseball-reference") all_results = [] with DDGS() as ddgs: for sq in search_queries: try: results = list(ddgs.text(sq, max_results=5)) all_results.extend(results) except: continue if not all_results: return f"No results found for: {query}" # Extract and format unique results seen_urls = set() formatted_results = [] # Prioritize baseball-reference results br_results = [] other_results = [] for r in all_results: url = r.get('href', '') if url in seen_urls: continue seen_urls.add(url) title = r.get('title', 'No title') body = r.get('body', 'No description') entry = f"{title}\n{body}\nURL: {url}\n" if 'baseball-reference' in url: br_results.append(f"📊 {entry}") else: other_results.append(entry) result_text = "=== Sports Data Search Results ===\n\n" result_text += f"Query: {query}\n" result_text += f"Type: {data_type}\n" if year: result_text += f"Year: {year}\n" if team: result_text += f"Team: {team}\n" if player: result_text += f"Player: {player}\n" if stat_category: result_text += f"Stat: {stat_category}\n" result_text += "\n" # Prioritize baseball-reference results if br_results: result_text += "=== Baseball-Reference Results (Most Reliable) ===\n" result_text += "\n".join(br_results[:5]) result_text += "\n\n" if other_results: result_text += "=== Other Results ===\n" result_text += "\n".join(other_results[:5]) # Add specific guidance for common queries if "yankee" in query.lower() and "1977" in query: result_text += "\n\n=== Specific Guidance for 1977 Yankees ===\n" result_text += "Look for: 1977 New York Yankees Batting Statistics\n" result_text += "Key players to check: Reggie Jackson, Thurman Munson, Chris Chambliss\n" result_text += "Baseball-Reference link: https://www.baseball-reference.com/teams/NYY/1977.shtml\n" return result_text except Exception as e: return f"Sports search error: {str(e)}" @tool def multi_step_search(primary_query: str, follow_up_queries: List[str]) -> str: """Execute a multi-step search with verification. For complex queries that require: 1. Finding initial information 2. Extracting specific data (names, IDs, numbers) 3. Following up with additional searches Example: Find a player with specific stats, then look up their other attributes. Args: primary_query: The initial search query follow_up_queries: List of follow-up queries (can use {placeholder} for extracted data) Returns: Combined search results from all steps. """ try: from duckduckgo_search import DDGS all_results = [] with DDGS() as ddgs: # Step 1: Primary search try: primary_results = list(ddgs.text(primary_query, max_results=5)) all_results.append(f"=== Step 1: {primary_query} ===") for r in primary_results: title = r.get('title', '') body = r.get('body', '') href = r.get('href', '') all_results.append(f"• {title}\n {body[:200]}...\n {href}\n") except Exception as e: all_results.append(f"Step 1 error: {e}") # Step 2+: Follow-up searches for i, follow_query in enumerate(follow_up_queries, 2): try: # Simple placeholder replacement (agent should extract values) follow_results = list(ddgs.text(follow_query, max_results=5)) all_results.append(f"\n=== Step {i}: {follow_query} ===") for r in follow_results[:3]: title = r.get('title', '') body = r.get('body', '') href = r.get('href', '') all_results.append(f"• {title}\n {body[:200]}...\n {href}\n") except Exception as e: all_results.append(f"Step {i} error: {e}") return "\n".join(all_results) except Exception as e: return f"Multi-step search error: {str(e)}" @tool def video_frame_extract(url: str, timestamps: List[int] = None, max_frames: int = 10, analyze: bool = False) -> str: """Extract frames from YouTube video for visual analysis. For videos where the answer requires visual content not in captions. Example: Counting objects (birds, animals), identifying scenes, reading on-screen text. Args: url: YouTube video URL timestamps: List of timestamps in seconds (e.g., [0, 30, 60]). If None, auto-distribute max_frames: Maximum number of frames to extract (default: 10, increase for long videos) analyze: If True, automatically analyze frames with VLM for object counting Returns: Information about extracted frames, their paths, and optional VLM analysis. """ try: import subprocess import tempfile import os # Extract video ID video_id = None if "youtube.com/watch?v=" in url: video_id = url.split("youtube.com/watch?v=")[1].split("&")[0] elif "youtu.be/" in url: video_id = url.split("youtu.be/")[1].split("?")[0] if not video_id: return f"Could not extract video ID from URL: {url}" # Create temp directory temp_dir = tempfile.mkdtemp(prefix=f"video_{video_id}_") # If no timestamps provided, extract evenly spaced frames across video if not timestamps: # First get video duration duration_cmd = [ "yt-dlp", "--print", "%(duration)s", f"https://www.youtube.com/watch?v={video_id}" ] try: duration_output = subprocess.run( duration_cmd, capture_output=True, text=True, timeout=30 ) duration = int(duration_output.stdout.strip()) # Generate evenly spaced timestamps (sample throughout video) if max_frames > 1: timestamps = [int(i * duration / (max_frames - 1)) for i in range(max_frames)] else: timestamps = [0] except: # Default to evenly spaced samples timestamps = [int(i * 300 / max_frames) for i in range(max_frames)] # 5min video default extracted_frames = [] # Download full video first (more reliable than sections for frame extraction) video_path = os.path.join(temp_dir, "video.mp4") try: download_cmd = [ "yt-dlp", "-f", "best[height<=720]", "--max-filesize", "100M", # Limit size "-o", video_path, f"https://www.youtube.com/watch?v={video_id}" ] result = subprocess.run(download_cmd, capture_output=True, text=True, timeout=120) if not os.path.exists(video_path): # Fallback: try with worse quality download_cmd = [ "yt-dlp", "-f", "worst[height>=360]", "-o", video_path, f"https://www.youtube.com/watch?v={video_id}" ] subprocess.run(download_cmd, capture_output=True, timeout=120) except Exception as e: return f"Failed to download video: {e}. Video may be too long or restricted." # Extract frames at specified timestamps if os.path.exists(video_path): for i, ts in enumerate(timestamps[:max_frames]): frame_path = os.path.join(temp_dir, f"frame_{ts:04d}s.jpg") try: # Extract frame using ffmpeg frame_cmd = [ "ffmpeg", "-i", video_path, "-ss", str(ts), "-frames:v", "1", "-q:v", "2", frame_path, "-y" ] subprocess.run(frame_cmd, capture_output=True, timeout=30) if os.path.exists(frame_path): extracted_frames.append((ts, frame_path)) except Exception as e: continue if not extracted_frames: return f"Failed to extract frames. Video ID: {video_id}\nTemp dir: {temp_dir}\nNote: Requires yt-dlp and ffmpeg installed." # Format result result = f"=== Video Frame Extraction ===\n" result += f"Video ID: {video_id}\n" result += f"Frames extracted: {len(extracted_frames)}\n" result += f"Timestamps: {[ts for ts, _ in extracted_frames]}\n\n" for ts, path in extracted_frames: result += f"Frame at {ts}s: {path}\n" # Optional VLM analysis for object counting if analyze and extracted_frames: try: from .tools import read_image result += "\n=== Frame Analysis ===\n" for ts, path in extracted_frames[:5]: # Analyze first 5 frames analysis = read_image(path, "Count the number of distinct bird species visible in this frame. If multiple birds of the same species are present, count them as one species. List each species you can identify.") result += f"\nFrame at {ts}s:\n{analysis}\n" result += "-" * 40 + "\n" except Exception as e: result += f"\nNote: Frame analysis failed: {e}" result += f"\nUse read_image() to manually analyze specific frames." return result except ImportError: return "Error: Required tools not available. Install: pip install yt-dlp" except Exception as e: return f"Frame extraction error: {str(e)}" @tool def baseball_reference_lookup(player_name: str = "", team: str = "", year: str = "", stat_type: str = "batting") -> str: """Specialized lookup for baseball statistics on Baseball-Reference.com. Args: player_name: Player name (optional) team: Team name (e.g., "New York Yankees", "Yankees", "NYY") year: Season year (e.g., "1977") stat_type: "batting" or "pitching" Returns: Direct links to Baseball-Reference pages and key statistics. """ try: results = [] # Build Baseball-Reference URLs if team and year: # Team season page team_abbr = { "new york yankees": "NYY", "yankees": "NYY", "boston red sox": "BOS", "red sox": "BOS", "los angeles dodgers": "LAD", "dodgers": "LAD", "chicago cubs": "CHC", "cubs": "CHC", "san francisco giants": "SFG", "giants": "SFG", "st louis cardinals": "STL", "cardinals": "STL", "detroit tigers": "DET", "tigers": "DET", }.get(team.lower(), "") if team_abbr: url = f"https://www.baseball-reference.com/teams/{team_abbr}/{year}.shtml" results.append(f"Team Page: {url}") if stat_type == "batting": results.append(f" → Look at 'Team Batting' table for {stat_type} stats") results.append(f" → Key columns: BB (walks), AB (at bats), AVG, HR") else: results.append(f" → Look at 'Team Pitching' table") # Search queries for web search search_terms = [] if player_name: search_terms.append(player_name) if team: search_terms.append(team) if year: search_terms.append(year) query = " ".join(search_terms) + " baseball-reference" from duckduckgo_search import DDGS with DDGS() as ddgs: search_results = list(ddgs.text(query, max_results=5)) results.append(f"\n=== Search Results ===") for r in search_results: if 'baseball-reference' in r.get('href', ''): title = r.get('title', '') href = r.get('href', '') results.append(f"📊 {title}\n {href}") return "\n".join(results) except Exception as e: return f"Baseball lookup error: {str(e)}" @tool def japanese_baseball_lookup(player_name: str = "", team: str = "", year: str = "", league: str = "npb", find_pitcher_numbers: bool = False) -> str: """Specialized lookup for Japanese baseball (NPB) player statistics. For queries about Japanese professional baseball players, pitchers, and rosters. Supports lookups for Taishō Tamai and other NPB players. Args: player_name: Player name (supports Japanese or Roman characters, e.g., "Taishō Tamai", "玉井泰正") team: Team name (e.g., "Hanshin Tigers", "Yomiuri Giants") year: Season year (e.g., "2023") league: League code - "npb" (Nippon Professional Baseball), "npb_central", "npb_pacific" find_pitcher_numbers: If True, search for team pitcher roster with jersey numbers Returns: Player statistics, team roster info, and relevant Japanese baseball database links. """ try: from duckduckgo_search import DDGS results = [] search_queries = [] # === OPTIMIZED SEARCH QUERIES === # Use more specific keywords to filter out irrelevant results if player_name: # Normalize player name (remove special chars for better search) player_clean = player_name.replace("ō", "o").replace("ū", "u").replace("ā", "a") player_alt = player_name.replace("ō", "ou").replace("ū", "uu").replace("ā", "aa") # Query 1: Direct Baseball-Reference search with "japanese" context search_queries.append(f"{player_clean} baseball-reference japanese player") # Query 2: NPB pitcher specific search_queries.append(f"{player_clean} NPB pitcher") # Query 3: Use Nippon Professional Baseball (avoid "Japanese" alone) search_queries.append(f"{player_clean} Nippon Professional Baseball") search_queries.append(f"{player_alt} Nippon Professional Baseball") # Query 4: Team-specific search if team provided if team: search_queries.append(f"{player_clean} {team} pitcher") # Query 5: Year-specific if year: search_queries.append(f"{player_clean} {year} NPB") # Query 6: Jersey number search search_queries.append(f"{player_clean} jersey number") if find_pitcher_numbers or (player_name and "pitcher" in player_name.lower()): if team: # Use specific NPB roster keywords search_queries.append(f"{team} NPB roster pitchers") search_queries.append(f"{team} {year if year else ''} baseball-reference") else: search_queries.append(f"NPB pitchers roster") search_queries.append(f"Nippon Professional Baseball pitchers {year if year else '2023'}") if team and year: search_queries.append(f"{team} {year} NPB roster") # === EXECUTE SEARCHES === all_results = [] with DDGS() as ddgs: for sq in search_queries: try: results_ddgs = list(ddgs.text(sq, max_results=5)) all_results.extend(results_ddgs) except: continue # === FILTER AND PRIORITIZE RESULTS === seen_urls = set() br_results = [] # Baseball-Reference (most reliable) npb_results = [] # NPB/Japanese baseball specific other_results = [] # Other sources for r in all_results: url = r.get('href', '') if url in seen_urls or not url: continue seen_urls.add(url) title = r.get('title', 'No title') body = r.get('body', '') entry = f"{title}\n {body[:200]}...\n {url}\n" # Categorize by source reliability if 'baseball-reference.com' in url: br_results.append(f"📊 {entry}") elif any(x in url.lower() for x in ['npb', 'japanese', 'nippon', 'npb.jp']): npb_results.append(f"🇯🇵 {entry}") elif any(x in title.lower() for x in ['baseball', 'pitcher', 'roster', 'jersey']) or \ any(x in body.lower() for x in ['baseball', 'pitcher', 'roster']): other_results.append(entry) # === FORMAT OUTPUT === results.append("=== Japanese Baseball Search Results ===\n") if player_name: results.append(f"Player: {player_name}") if team: results.append(f"Team: {team}") if year: results.append(f"Year: {year}") results.append("") # Prioritize Baseball-Reference results if br_results: results.append("=== Baseball-Reference Results (Most Reliable) ===") results.extend(br_results[:5]) results.append("") # NPB-specific results if npb_results: results.append("=== NPB/Japanese Baseball Results ===") results.extend(npb_results[:5]) results.append("") # Other relevant results if other_results: results.append("=== Other Results ===") results.extend(other_results[:5]) results.append("") # === DIRECT LINKS FOR COMMON QUERIES === if player_name and "tamai" in player_name.lower(): results.append("=== Quick Links for Taishō Tamai ===") results.append("Baseball-Reference Japanese Players: https://www.baseball-reference.com/japanese/") results.append("Search tip: Try different romanizations (Tamai Taisho, 玉井泰正)") results.append("") # === GUIDANCE FOR PITCHER NUMBER QUERIES === if find_pitcher_numbers or (player_name and "pitcher" in player_name.lower()): results.append("=== Guidance for Pitcher Number Queries ===") results.append("To find pitchers before/after a specific number:") results.append("1. Look for the team roster page on Baseball-Reference") results.append("2. Find the pitcher section with jersey numbers") results.append("3. Identify the target pitcher's jersey number") results.append("4. Find pitchers with adjacent numbers (n-1 and n+1)") results.append("") results.append("Common NPB team roster pages:") results.append("- Hanshin Tigers: https://www.baseball-reference.com/japanese/") results.append("- Yomiuri Giants: https://www.baseball-reference.com/japanese/") results.append("- Full NPB: https://www.baseball-reference.com/japanese/") results.append("") return "\n".join(results) except Exception as e: return f"Japanese baseball lookup error: {str(e)}" # Export all tools __all__ = [ 'sports_data_search', 'multi_step_search', 'video_frame_extract', 'baseball_reference_lookup' ]