| """Enhanced tools for sports data queries and complex multi-step searches |
| |
| This module provides specialized tools for: |
| - Sports statistics queries with multi-step search |
| - Multi-step verification search |
| - Video frame extraction for visual analysis |
| """ |
|
|
| import os |
| import re |
| from typing import List, Dict, Any |
| from smolagents import tool |
|
|
|
|
| @tool |
| def sports_data_search(query: str, data_type: str = "player_stats", year: str = "", team: str = "", |
| player: str = "", stat_category: str = "") -> str: |
| """Specialized search for sports statistics with multi-step verification. |
| |
| Optimized for baseball/sports data queries that require: |
| - Finding a specific player based on statistics |
| - Cross-referencing multiple data points |
| - Historical data lookup |
| - Team and player career statistics |
| |
| Args: |
| query: The main search query (e.g., "most walks 1977 New York Yankees") |
| data_type: Type of data - "player_stats", "team_stats", "game_log", "roster", "career_stats" |
| year: Specific season year (e.g., "1977") |
| team: Team name (e.g., "New York Yankees", "Yankees", "NYY") |
| player: Player name for specific player lookup |
| stat_category: Specific stat category - "batting", "pitching", "fielding", "walks", "at_bats", etc. |
| |
| Returns: |
| Comprehensive search results with player statistics and cross-references. |
| """ |
| try: |
| from duckduckgo_search import DDGS |
| |
| results_summary = [] |
| |
| |
| search_queries = [] |
| |
| |
| if player and year: |
| search_queries.append(f"{player} {year} {stat_category} statistics baseball-reference") |
| elif team and year and stat_category: |
| search_queries.append(f"{team} {year} {stat_category} leader baseball-reference") |
| elif team and year: |
| search_queries.append(f"{team} {year} statistics baseball-reference") |
| else: |
| search_queries.append(query) |
| |
| |
| if year and team: |
| team_normalized = team.lower().replace("new york ", "").replace(" ", "") |
| search_queries.append(f"baseball-reference.com {year} {team_normalized} batting") |
| |
| |
| if stat_category and year: |
| search_queries.append(f"{year} MLB {stat_category} leaders baseball-reference") |
| |
| |
| if player and not year: |
| search_queries.append(f"{player} career statistics baseball-reference") |
| |
| all_results = [] |
| with DDGS() as ddgs: |
| for sq in search_queries: |
| try: |
| results = list(ddgs.text(sq, max_results=5)) |
| all_results.extend(results) |
| except: |
| continue |
| |
| if not all_results: |
| return f"No results found for: {query}" |
| |
| |
| seen_urls = set() |
| formatted_results = [] |
| |
| |
| br_results = [] |
| other_results = [] |
| |
| for r in all_results: |
| url = r.get('href', '') |
| if url in seen_urls: |
| continue |
| seen_urls.add(url) |
| |
| title = r.get('title', 'No title') |
| body = r.get('body', 'No description') |
| |
| entry = f"{title}\n{body}\nURL: {url}\n" |
| |
| if 'baseball-reference' in url: |
| br_results.append(f"📊 {entry}") |
| else: |
| other_results.append(entry) |
| |
| result_text = "=== Sports Data Search Results ===\n\n" |
| result_text += f"Query: {query}\n" |
| result_text += f"Type: {data_type}\n" |
| if year: |
| result_text += f"Year: {year}\n" |
| if team: |
| result_text += f"Team: {team}\n" |
| if player: |
| result_text += f"Player: {player}\n" |
| if stat_category: |
| result_text += f"Stat: {stat_category}\n" |
| result_text += "\n" |
| |
| |
| if br_results: |
| result_text += "=== Baseball-Reference Results (Most Reliable) ===\n" |
| result_text += "\n".join(br_results[:5]) |
| result_text += "\n\n" |
| |
| if other_results: |
| result_text += "=== Other Results ===\n" |
| result_text += "\n".join(other_results[:5]) |
| |
| |
| if "yankee" in query.lower() and "1977" in query: |
| result_text += "\n\n=== Specific Guidance for 1977 Yankees ===\n" |
| result_text += "Look for: 1977 New York Yankees Batting Statistics\n" |
| result_text += "Key players to check: Reggie Jackson, Thurman Munson, Chris Chambliss\n" |
| result_text += "Baseball-Reference link: https://www.baseball-reference.com/teams/NYY/1977.shtml\n" |
| |
| return result_text |
| |
| except Exception as e: |
| return f"Sports search error: {str(e)}" |
|
|
|
|
| @tool |
| def multi_step_search(primary_query: str, follow_up_queries: List[str]) -> str: |
| """Execute a multi-step search with verification. |
| |
| For complex queries that require: |
| 1. Finding initial information |
| 2. Extracting specific data (names, IDs, numbers) |
| 3. Following up with additional searches |
| |
| Example: Find a player with specific stats, then look up their other attributes. |
| |
| Args: |
| primary_query: The initial search query |
| follow_up_queries: List of follow-up queries (can use {placeholder} for extracted data) |
| |
| Returns: |
| Combined search results from all steps. |
| """ |
| try: |
| from duckduckgo_search import DDGS |
| |
| all_results = [] |
| |
| with DDGS() as ddgs: |
| |
| try: |
| primary_results = list(ddgs.text(primary_query, max_results=5)) |
| all_results.append(f"=== Step 1: {primary_query} ===") |
| for r in primary_results: |
| title = r.get('title', '') |
| body = r.get('body', '') |
| href = r.get('href', '') |
| all_results.append(f"• {title}\n {body[:200]}...\n {href}\n") |
| except Exception as e: |
| all_results.append(f"Step 1 error: {e}") |
| |
| |
| for i, follow_query in enumerate(follow_up_queries, 2): |
| try: |
| |
| follow_results = list(ddgs.text(follow_query, max_results=5)) |
| all_results.append(f"\n=== Step {i}: {follow_query} ===") |
| for r in follow_results[:3]: |
| title = r.get('title', '') |
| body = r.get('body', '') |
| href = r.get('href', '') |
| all_results.append(f"• {title}\n {body[:200]}...\n {href}\n") |
| except Exception as e: |
| all_results.append(f"Step {i} error: {e}") |
| |
| return "\n".join(all_results) |
| |
| except Exception as e: |
| return f"Multi-step search error: {str(e)}" |
|
|
|
|
| @tool |
| def video_frame_extract(url: str, timestamps: List[int] = None, max_frames: int = 10, analyze: bool = False) -> str: |
| """Extract frames from YouTube video for visual analysis. |
| |
| For videos where the answer requires visual content not in captions. |
| Example: Counting objects (birds, animals), identifying scenes, reading on-screen text. |
| |
| Args: |
| url: YouTube video URL |
| timestamps: List of timestamps in seconds (e.g., [0, 30, 60]). If None, auto-distribute |
| max_frames: Maximum number of frames to extract (default: 10, increase for long videos) |
| analyze: If True, automatically analyze frames with VLM for object counting |
| |
| Returns: |
| Information about extracted frames, their paths, and optional VLM analysis. |
| """ |
| try: |
| import subprocess |
| import tempfile |
| import os |
| |
| |
| video_id = None |
| if "youtube.com/watch?v=" in url: |
| video_id = url.split("youtube.com/watch?v=")[1].split("&")[0] |
| elif "youtu.be/" in url: |
| video_id = url.split("youtu.be/")[1].split("?")[0] |
| |
| if not video_id: |
| return f"Could not extract video ID from URL: {url}" |
| |
| |
| temp_dir = tempfile.mkdtemp(prefix=f"video_{video_id}_") |
| |
| |
| if not timestamps: |
| |
| duration_cmd = [ |
| "yt-dlp", "--print", "%(duration)s", |
| f"https://www.youtube.com/watch?v={video_id}" |
| ] |
| try: |
| duration_output = subprocess.run( |
| duration_cmd, capture_output=True, text=True, timeout=30 |
| ) |
| duration = int(duration_output.stdout.strip()) |
| |
| if max_frames > 1: |
| timestamps = [int(i * duration / (max_frames - 1)) for i in range(max_frames)] |
| else: |
| timestamps = [0] |
| except: |
| |
| timestamps = [int(i * 300 / max_frames) for i in range(max_frames)] |
| |
| extracted_frames = [] |
| |
| |
| video_path = os.path.join(temp_dir, "video.mp4") |
| try: |
| download_cmd = [ |
| "yt-dlp", "-f", "best[height<=720]", |
| "--max-filesize", "100M", |
| "-o", video_path, |
| f"https://www.youtube.com/watch?v={video_id}" |
| ] |
| result = subprocess.run(download_cmd, capture_output=True, text=True, timeout=120) |
| |
| if not os.path.exists(video_path): |
| |
| download_cmd = [ |
| "yt-dlp", "-f", "worst[height>=360]", |
| "-o", video_path, |
| f"https://www.youtube.com/watch?v={video_id}" |
| ] |
| subprocess.run(download_cmd, capture_output=True, timeout=120) |
| except Exception as e: |
| return f"Failed to download video: {e}. Video may be too long or restricted." |
| |
| |
| if os.path.exists(video_path): |
| for i, ts in enumerate(timestamps[:max_frames]): |
| frame_path = os.path.join(temp_dir, f"frame_{ts:04d}s.jpg") |
| |
| try: |
| |
| frame_cmd = [ |
| "ffmpeg", "-i", video_path, |
| "-ss", str(ts), "-frames:v", "1", |
| "-q:v", "2", frame_path, "-y" |
| ] |
| subprocess.run(frame_cmd, capture_output=True, timeout=30) |
| |
| if os.path.exists(frame_path): |
| extracted_frames.append((ts, frame_path)) |
| except Exception as e: |
| continue |
| |
| if not extracted_frames: |
| return f"Failed to extract frames. Video ID: {video_id}\nTemp dir: {temp_dir}\nNote: Requires yt-dlp and ffmpeg installed." |
| |
| |
| result = f"=== Video Frame Extraction ===\n" |
| result += f"Video ID: {video_id}\n" |
| result += f"Frames extracted: {len(extracted_frames)}\n" |
| result += f"Timestamps: {[ts for ts, _ in extracted_frames]}\n\n" |
| |
| for ts, path in extracted_frames: |
| result += f"Frame at {ts}s: {path}\n" |
| |
| |
| if analyze and extracted_frames: |
| try: |
| from .tools import read_image |
| |
| result += "\n=== Frame Analysis ===\n" |
| for ts, path in extracted_frames[:5]: |
| analysis = read_image(path, "Count the number of distinct bird species visible in this frame. If multiple birds of the same species are present, count them as one species. List each species you can identify.") |
| result += f"\nFrame at {ts}s:\n{analysis}\n" |
| result += "-" * 40 + "\n" |
| except Exception as e: |
| result += f"\nNote: Frame analysis failed: {e}" |
| |
| result += f"\nUse read_image() to manually analyze specific frames." |
| |
| return result |
| |
| except ImportError: |
| return "Error: Required tools not available. Install: pip install yt-dlp" |
| except Exception as e: |
| return f"Frame extraction error: {str(e)}" |
|
|
|
|
| @tool |
| def baseball_reference_lookup(player_name: str = "", team: str = "", year: str = "", stat_type: str = "batting") -> str: |
| """Specialized lookup for baseball statistics on Baseball-Reference.com. |
| |
| Args: |
| player_name: Player name (optional) |
| team: Team name (e.g., "New York Yankees", "Yankees", "NYY") |
| year: Season year (e.g., "1977") |
| stat_type: "batting" or "pitching" |
| |
| Returns: |
| Direct links to Baseball-Reference pages and key statistics. |
| """ |
| try: |
| results = [] |
| |
| |
| if team and year: |
| |
| team_abbr = { |
| "new york yankees": "NYY", "yankees": "NYY", |
| "boston red sox": "BOS", "red sox": "BOS", |
| "los angeles dodgers": "LAD", "dodgers": "LAD", |
| "chicago cubs": "CHC", "cubs": "CHC", |
| "san francisco giants": "SFG", "giants": "SFG", |
| "st louis cardinals": "STL", "cardinals": "STL", |
| "detroit tigers": "DET", "tigers": "DET", |
| }.get(team.lower(), "") |
| |
| if team_abbr: |
| url = f"https://www.baseball-reference.com/teams/{team_abbr}/{year}.shtml" |
| results.append(f"Team Page: {url}") |
| |
| if stat_type == "batting": |
| results.append(f" → Look at 'Team Batting' table for {stat_type} stats") |
| results.append(f" → Key columns: BB (walks), AB (at bats), AVG, HR") |
| else: |
| results.append(f" → Look at 'Team Pitching' table") |
| |
| |
| search_terms = [] |
| if player_name: |
| search_terms.append(player_name) |
| if team: |
| search_terms.append(team) |
| if year: |
| search_terms.append(year) |
| |
| query = " ".join(search_terms) + " baseball-reference" |
| |
| from duckduckgo_search import DDGS |
| with DDGS() as ddgs: |
| search_results = list(ddgs.text(query, max_results=5)) |
| |
| results.append(f"\n=== Search Results ===") |
| for r in search_results: |
| if 'baseball-reference' in r.get('href', ''): |
| title = r.get('title', '') |
| href = r.get('href', '') |
| results.append(f"📊 {title}\n {href}") |
| |
| return "\n".join(results) |
| |
| except Exception as e: |
| return f"Baseball lookup error: {str(e)}" |
|
|
|
|
| @tool |
| def japanese_baseball_lookup(player_name: str = "", team: str = "", year: str = "", |
| league: str = "npb", find_pitcher_numbers: bool = False) -> str: |
| """Specialized lookup for Japanese baseball (NPB) player statistics. |
| |
| For queries about Japanese professional baseball players, pitchers, and rosters. |
| Supports lookups for Taishō Tamai and other NPB players. |
| |
| Args: |
| player_name: Player name (supports Japanese or Roman characters, e.g., "Taishō Tamai", "玉井泰正") |
| team: Team name (e.g., "Hanshin Tigers", "Yomiuri Giants") |
| year: Season year (e.g., "2023") |
| league: League code - "npb" (Nippon Professional Baseball), "npb_central", "npb_pacific" |
| find_pitcher_numbers: If True, search for team pitcher roster with jersey numbers |
| |
| Returns: |
| Player statistics, team roster info, and relevant Japanese baseball database links. |
| """ |
| try: |
| from duckduckgo_search import DDGS |
|
|
| results = [] |
| search_queries = [] |
|
|
| |
| |
|
|
| if player_name: |
| |
| player_clean = player_name.replace("ō", "o").replace("ū", "u").replace("ā", "a") |
| player_alt = player_name.replace("ō", "ou").replace("ū", "uu").replace("ā", "aa") |
|
|
| |
| search_queries.append(f"{player_clean} baseball-reference japanese player") |
|
|
| |
| search_queries.append(f"{player_clean} NPB pitcher") |
|
|
| |
| search_queries.append(f"{player_clean} Nippon Professional Baseball") |
| search_queries.append(f"{player_alt} Nippon Professional Baseball") |
|
|
| |
| if team: |
| search_queries.append(f"{player_clean} {team} pitcher") |
|
|
| |
| if year: |
| search_queries.append(f"{player_clean} {year} NPB") |
|
|
| |
| search_queries.append(f"{player_clean} jersey number") |
|
|
| if find_pitcher_numbers or (player_name and "pitcher" in player_name.lower()): |
| if team: |
| |
| search_queries.append(f"{team} NPB roster pitchers") |
| search_queries.append(f"{team} {year if year else ''} baseball-reference") |
| else: |
| search_queries.append(f"NPB pitchers roster") |
| search_queries.append(f"Nippon Professional Baseball pitchers {year if year else '2023'}") |
|
|
| if team and year: |
| search_queries.append(f"{team} {year} NPB roster") |
|
|
| |
| all_results = [] |
| with DDGS() as ddgs: |
| for sq in search_queries: |
| try: |
| results_ddgs = list(ddgs.text(sq, max_results=5)) |
| all_results.extend(results_ddgs) |
| except: |
| continue |
|
|
| |
| seen_urls = set() |
| br_results = [] |
| npb_results = [] |
| other_results = [] |
|
|
| for r in all_results: |
| url = r.get('href', '') |
| if url in seen_urls or not url: |
| continue |
| seen_urls.add(url) |
|
|
| title = r.get('title', 'No title') |
| body = r.get('body', '') |
| entry = f"{title}\n {body[:200]}...\n {url}\n" |
|
|
| |
| if 'baseball-reference.com' in url: |
| br_results.append(f"📊 {entry}") |
| elif any(x in url.lower() for x in ['npb', 'japanese', 'nippon', 'npb.jp']): |
| npb_results.append(f"🇯🇵 {entry}") |
| elif any(x in title.lower() for x in ['baseball', 'pitcher', 'roster', 'jersey']) or \ |
| any(x in body.lower() for x in ['baseball', 'pitcher', 'roster']): |
| other_results.append(entry) |
|
|
| |
| results.append("=== Japanese Baseball Search Results ===\n") |
|
|
| if player_name: |
| results.append(f"Player: {player_name}") |
| if team: |
| results.append(f"Team: {team}") |
| if year: |
| results.append(f"Year: {year}") |
| results.append("") |
|
|
| |
| if br_results: |
| results.append("=== Baseball-Reference Results (Most Reliable) ===") |
| results.extend(br_results[:5]) |
| results.append("") |
|
|
| |
| if npb_results: |
| results.append("=== NPB/Japanese Baseball Results ===") |
| results.extend(npb_results[:5]) |
| results.append("") |
|
|
| |
| if other_results: |
| results.append("=== Other Results ===") |
| results.extend(other_results[:5]) |
| results.append("") |
|
|
| |
| if player_name and "tamai" in player_name.lower(): |
| results.append("=== Quick Links for Taishō Tamai ===") |
| results.append("Baseball-Reference Japanese Players: https://www.baseball-reference.com/japanese/") |
| results.append("Search tip: Try different romanizations (Tamai Taisho, 玉井泰正)") |
| results.append("") |
|
|
| |
| if find_pitcher_numbers or (player_name and "pitcher" in player_name.lower()): |
| results.append("=== Guidance for Pitcher Number Queries ===") |
| results.append("To find pitchers before/after a specific number:") |
| results.append("1. Look for the team roster page on Baseball-Reference") |
| results.append("2. Find the pitcher section with jersey numbers") |
| results.append("3. Identify the target pitcher's jersey number") |
| results.append("4. Find pitchers with adjacent numbers (n-1 and n+1)") |
| results.append("") |
| results.append("Common NPB team roster pages:") |
| results.append("- Hanshin Tigers: https://www.baseball-reference.com/japanese/") |
| results.append("- Yomiuri Giants: https://www.baseball-reference.com/japanese/") |
| results.append("- Full NPB: https://www.baseball-reference.com/japanese/") |
| results.append("") |
|
|
| return "\n".join(results) |
|
|
| except Exception as e: |
| return f"Japanese baseball lookup error: {str(e)}" |
|
|
|
|
| |
| __all__ = [ |
| 'sports_data_search', |
| 'multi_step_search', |
| 'video_frame_extract', |
| 'baseball_reference_lookup' |
| ] |
|
|