Spaces:
Running
Running
| """ | |
| GitHub Find Examples Tool - Discover examples, tutorials, and guides for any library | |
| Lists all files in a repository and performs deterministic keyword search. | |
| """ | |
| import os | |
| from typing import Any, Dict, List | |
| import requests | |
| from thefuzz import fuzz | |
| from agent.tools.types import ToolResult | |
| # In order of priority (lower index = higher priority for sorting) | |
| EXAMPLE_PATTERNS = [ | |
| "scripts", | |
| # General example patterns (catch-all, lower priority) | |
| "examples", | |
| "example", | |
| # Notebook patterns | |
| "notebooks", | |
| "notebook", | |
| # Tutorial/learning patterns | |
| "tutorials", | |
| "tutorial", | |
| "quickstart", | |
| "walkthroughs", | |
| "walkthrough", | |
| # Cookbook/recipe patterns | |
| "cookbook", | |
| "cookbooks", | |
| "recipes", | |
| "recipe", | |
| # Demo/sample patterns | |
| "demos", | |
| "demo", | |
| "samples", | |
| "sample", | |
| # Other patterns | |
| "guides", | |
| "guide", | |
| "getting-started", | |
| "getting_started", | |
| "playground", | |
| "howto", | |
| "how-to", | |
| "use-cases", | |
| "usecases", | |
| "use_cases", | |
| "sandbox", | |
| "showcase", | |
| ] | |
| def _get_repo_tree(org: str, repo: str, token: str) -> tuple[List[Dict[str, Any]], str]: | |
| """Get all files in a repository recursively. Returns (files, error_message)""" | |
| headers = { | |
| "Accept": "application/vnd.github+json", | |
| "X-GitHub-Api-Version": "2022-11-28", | |
| "Authorization": f"Bearer {token}", | |
| } | |
| full_repo = f"{org}/{repo}" | |
| # Get default branch | |
| try: | |
| response = requests.get( | |
| f"https://api.github.com/repos/{full_repo}", headers=headers, timeout=10 | |
| ) | |
| if response.status_code == 404: | |
| return [], "not_found" | |
| if response.status_code != 200: | |
| return [], f"API error: {response.status_code}" | |
| repo_data = response.json() | |
| default_branch = repo_data.get("default_branch", "main") | |
| except Exception as e: | |
| return [], f"Error fetching repo: {str(e)}" | |
| # Get repository tree recursively | |
| try: | |
| response = requests.get( | |
| f"https://api.github.com/repos/{full_repo}/git/trees/{default_branch}", | |
| headers=headers, | |
| params={"recursive": "1"}, | |
| timeout=30, | |
| ) | |
| if response.status_code != 200: | |
| return [], f"Error fetching tree: {response.status_code}" | |
| data = response.json() | |
| tree = data.get("tree", []) | |
| # Filter to only include files (not directories) | |
| files = [ | |
| { | |
| "path": item["path"], | |
| "ref": item["sha"], | |
| "size": item.get("size", 0), | |
| "url": f"https://github.com/{full_repo}/blob/{default_branch}/{item['path']}", | |
| } | |
| for item in tree | |
| if item["type"] == "blob" | |
| ] | |
| return files, "" | |
| except Exception as e: | |
| return [], f"Error processing tree: {str(e)}" | |
| def _search_similar_repos(org: str, repo: str, token: str) -> List[Dict[str, Any]]: | |
| """Search for similar repository names in the organization""" | |
| headers = { | |
| "Accept": "application/vnd.github+json", | |
| "X-GitHub-Api-Version": "2022-11-28", | |
| "Authorization": f"Bearer {token}", | |
| } | |
| # Search for repos in the org with similar name | |
| query = f"org:{org} {repo}" | |
| try: | |
| response = requests.get( | |
| "https://api.github.com/search/repositories", | |
| headers=headers, | |
| params={"q": query, "sort": "stars", "order": "desc", "per_page": 10}, | |
| timeout=30, | |
| ) | |
| if response.status_code != 200: | |
| return [] | |
| data = response.json() | |
| items = data.get("items", []) | |
| return [ | |
| { | |
| "name": item.get("name"), | |
| "full_name": item.get("full_name"), | |
| "description": item.get("description"), | |
| "stars": item.get("stargazers_count", 0), | |
| "url": item.get("html_url"), | |
| } | |
| for item in items | |
| ] | |
| except Exception: | |
| return [] | |
| def _score_against_example_patterns(file_path: str) -> int: | |
| """Score file against example patterns using token_set_ratio""" | |
| scores = [] | |
| for pattern in EXAMPLE_PATTERNS: | |
| score = fuzz.token_set_ratio(pattern.lower(), file_path.lower()) | |
| scores.append(score) | |
| return max(scores) if scores else 0 | |
| def _score_against_keyword(file_path: str, keyword: str) -> int: | |
| """Calculate fuzzy match score for a file path against a keyword""" | |
| # Use partial_ratio for substring matching (good for paths) | |
| # Also check token_set_ratio for word-level matching | |
| partial_score = fuzz.partial_ratio(keyword.lower(), file_path.lower()) | |
| token_score = fuzz.token_set_ratio(keyword.lower(), file_path.lower()) | |
| # Return the higher of the two | |
| return max(partial_score, token_score) | |
| def _get_pattern_priority(file_path: str) -> tuple[int, int, int]: | |
| """ | |
| Get priority of a file path based on which example pattern directory it's in. | |
| Returns: (in_examples_dir, pattern_priority, path_depth) | |
| - in_examples_dir: 0 if in examples/ directory, 1 otherwise (lower is better) | |
| - pattern_priority: Index in EXAMPLE_PATTERNS (lower is better), or 999 if no match | |
| - path_depth: Number of path segments (lower is better) | |
| Note: Prioritizes files in "examples/" directory first, then by most specific pattern match. | |
| E.g., "examples/scripts/train.py" is better than "scripts/util.py" | |
| """ | |
| path_lower = file_path.lower() | |
| path_parts = path_lower.split("/") | |
| # Check if file is in examples/ directory (highest priority) | |
| in_examples_dir = 0 if (path_parts[0] in ["examples", "example"]) else 1 | |
| # Find ALL matching patterns and use the best (lowest index) one | |
| # But prefer deeper matches (more specific) over shallow ones | |
| best_priority = 999 | |
| best_depth_at_match = -1 | |
| for i, pattern in enumerate(EXAMPLE_PATTERNS): | |
| # Check if pattern appears as a directory component in the path | |
| if pattern in path_parts: | |
| # Find the depth where this pattern appears (rightmost occurrence) | |
| depth = len(path_parts) - 1 - path_parts[::-1].index(pattern) | |
| # Prefer deeper matches, or better priority if at same depth | |
| if depth > best_depth_at_match or ( | |
| depth == best_depth_at_match and i < best_priority | |
| ): | |
| best_priority = i | |
| best_depth_at_match = depth | |
| return (in_examples_dir, best_priority, len(path_parts)) | |
| def _handle_repo_tree_errors( | |
| all_files: List[Dict[str, Any]], | |
| error: str, | |
| org: str, | |
| repo: str, | |
| token: str, | |
| ) -> ToolResult | None: | |
| """Handle errors from repo tree fetch. Returns ToolResult if error, None if OK.""" | |
| if error == "not_found": | |
| similar_repos = _search_similar_repos(org, repo, token) | |
| if not similar_repos: | |
| return { | |
| "formatted": f"Repository '{org}/{repo}' not found and no similar repositories found.", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| "isError": True, | |
| } | |
| # Format similar repos | |
| lines = [f"**Repository '{org}/{repo}' not found. Similar repositories:**\n"] | |
| for i, r in enumerate(similar_repos, 1): | |
| lines.append(f"{i}. **{r['full_name']}** (⭐ {r['stars']:,} stars)") | |
| if r["description"]: | |
| desc = ( | |
| r["description"][:100] + "..." | |
| if len(r["description"]) > 100 | |
| else r["description"] | |
| ) | |
| lines.append(f" {desc}") | |
| lines.append(f" {r['url']}\n") | |
| return { | |
| "formatted": "\n".join(lines), | |
| "totalResults": len(similar_repos), | |
| "resultsShared": len(similar_repos), | |
| "isError": True, | |
| } | |
| if error: | |
| return { | |
| "formatted": f"Error accessing repository '{org}/{repo}': {error}", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| "isError": True, | |
| } | |
| if not all_files: | |
| return { | |
| "formatted": f"No files found in repository '{org}/{repo}'", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| } | |
| return None | |
| def find_examples( | |
| keyword: str = "", | |
| repo: str = "", | |
| org: str = "huggingface", | |
| max_results: int = 10, | |
| min_score: int = 80, | |
| ) -> ToolResult: | |
| """ | |
| Find example files in a repository using fuzzy matching. | |
| Args: | |
| keyword: Keyword to fuzzy match against file paths (e.g., "grpo") | |
| repo: Repository name (e.g., "trl") | |
| org: GitHub organization (default: "huggingface") | |
| max_results: Maximum number of results (default 50) | |
| min_score: Minimum fuzzy match score (0-100, default 60) | |
| Returns: | |
| ToolResult with matching files, or similar repos if repo not found | |
| """ | |
| token = os.environ.get("GITHUB_TOKEN") | |
| if not token: | |
| return { | |
| "formatted": "Error: GITHUB_TOKEN environment variable is required", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| "isError": True, | |
| } | |
| if not repo: | |
| return { | |
| "formatted": "Error: repo parameter is required", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| "isError": True, | |
| } | |
| # Get all files in the repository | |
| all_files, error = _get_repo_tree(org, repo, token) | |
| # Handle errors (not found, API errors, empty repo) | |
| if error_result := _handle_repo_tree_errors(all_files, error, org, repo, token): | |
| return error_result | |
| # Step 1: Filter files by example patterns (score >= 60) | |
| example_threshold = 60 | |
| example_files = [] | |
| for file in all_files: | |
| example_score = _score_against_example_patterns(file["path"]) | |
| if example_score >= example_threshold: | |
| example_files.append({**file, "example_score": example_score}) | |
| if not example_files: | |
| return { | |
| "formatted": f"No example files found in {org}/{repo} (no files match example patterns with score >= {example_threshold}).", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| } | |
| # Step 2: If keyword provided, score and filter by keyword | |
| if keyword: | |
| scored_files = [] | |
| for file in example_files: | |
| keyword_score = _score_against_keyword(file["path"], keyword) | |
| if keyword_score >= min_score: | |
| scored_files.append({**file, "score": keyword_score}) | |
| if not scored_files: | |
| return { | |
| "formatted": f"No files found in {org}/{repo} matching keyword '{keyword}' (min score: {min_score}) among {len(example_files)} example files.", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| } | |
| # Sort by keyword score (descending) for best matches first | |
| scored_files.sort(key=lambda x: x["score"], reverse=True) | |
| else: | |
| # No keyword: prioritize by pattern directory, then path depth | |
| scored_files = [] | |
| for file in example_files: | |
| in_examples_dir, pattern_priority, path_depth = _get_pattern_priority( | |
| file["path"] | |
| ) | |
| scored_files.append( | |
| { | |
| **file, | |
| "score": file["example_score"], | |
| "in_examples_dir": in_examples_dir, | |
| "pattern_priority": pattern_priority, | |
| "path_depth": path_depth, | |
| } | |
| ) | |
| if not scored_files: | |
| return { | |
| "formatted": f"No example files found in {org}/{repo}.", | |
| "totalResults": 0, | |
| "resultsShared": 0, | |
| } | |
| # Sort by: 1) files in examples/ dir first, 2) pattern priority (scripts > datasets > etc), 3) path depth, 4) path name | |
| scored_files.sort( | |
| key=lambda x: ( | |
| x["in_examples_dir"], | |
| x["pattern_priority"], | |
| x["path_depth"], | |
| x["path"], | |
| ) | |
| ) | |
| # Limit results | |
| results = scored_files[:max_results] | |
| # Format output | |
| keyword_desc = f" matching '{keyword}'" if keyword else "" | |
| lines = [f"**Found {len(results)} example files in {org}/{repo}{keyword_desc}:**"] | |
| if len(scored_files) > max_results: | |
| lines[0] += f" (showing {max_results} of {len(scored_files)})" | |
| lines.append("") | |
| for i, file in enumerate(results, 1): | |
| lines.append(f"{i}. **{file['path']}**") | |
| lines.append(f" Size: {file['size']:,} bytes | Ref: {file['ref'][:7]}") | |
| lines.append(f" URL: {file['url']}") | |
| # Copyable parameters for read_file tool | |
| read_params = f"{{'repo': '{org}/{repo}', 'path': '{file['path']}'}}" | |
| lines.append(f" To read, use: {read_params}") | |
| lines.append("") | |
| return { | |
| "formatted": "\n".join(lines), | |
| "totalResults": len(results), | |
| "resultsShared": len(results), | |
| } | |
| # Tool specification | |
| GITHUB_FIND_EXAMPLES_TOOL_SPEC = { | |
| "name": "github_find_examples", | |
| "description": ( | |
| "Discover working code examples, tutorials, scripts, and demos in GitHub repositories. " | |
| "⚠️ CRITICAL: ALWAYS use this BEFORE implementing ML tasks - find working reference code first. " | |
| "Your training data may be outdated; real repository examples show current best practices. " | |
| "**Use when:** (1) Starting any ML implementation (training, inference, evaluation), " | |
| "(2) User asks 'how to' questions about libraries, (3) Need reference implementations, " | |
| "(4) Exploring library capabilities, (5) Before writing training/processing scripts. " | |
| "**Pattern:** github_find_examples (discover) → github_read_file (study code) → implement with researched approach. " | |
| "Returns: List of example files (scripts/notebooks/tutorials) with paths and URLs, sorted by relevance. " | |
| "**Then:** Use github_read_file to read the actual implementation code. " | |
| "**Critical for reliability:** Real examples prevent outdated API usage and show proven patterns. " | |
| "## How it works\n\n" | |
| "1. Fetches all example files (examples/, scripts/, tutorials/, demos/, notebooks/, etc.) from repository\n" | |
| "2. If keyword provided, scores files against keyword using fuzzy matching\n" | |
| "3. Returns best matches sorted by relevance and pattern priority\n" | |
| "4. Provides copyable parameters for github_read_file tool\n\n" | |
| "## Examples\n\n" | |
| "<example>\n" | |
| "// ML Workflow Step: Find GRPO training examples before implementation\n" | |
| "// Task: Starting GRPO fine-tuning project, need reference implementation\n" | |
| "{\n" | |
| " keyword: 'grpo',\n" | |
| " repo: 'trl',\n" | |
| " org: 'huggingface'\n" | |
| "}\n" | |
| "// Returns: examples/scripts/grpo_agent.py, examples/scripts/grpo_vlm.py\n" | |
| "// Next step: github_read_file to study working implementation\n" | |
| "</example>\n\n" | |
| "<example>\n" | |
| "// ML Workflow Step: Discover all available training methods\n" | |
| "// Task: Exploring TRL training options before choosing approach\n" | |
| "{\n" | |
| " repo: 'trl',\n" | |
| " org: 'huggingface',\n" | |
| " max_results: 20\n" | |
| "}\n" | |
| "// Lists: SFT, DPO, GRPO, PPO, reward modeling examples\n" | |
| "// Helps user choose appropriate method\n" | |
| "</example>\n\n" | |
| "<example>\n" | |
| "// ML Workflow Step: Find LoRA fine-tuning examples\n" | |
| "// Task: Learning parameter-efficient fine-tuning patterns\n" | |
| "{\n" | |
| " keyword: 'lora',\n" | |
| " repo: 'peft',\n" | |
| " org: 'huggingface'\n" | |
| "}\n" | |
| "// Discovers LoRA configuration and training examples\n" | |
| "// Shows current PEFT API usage patterns\n" | |
| "</example>" | |
| ), | |
| "parameters": { | |
| "type": "object", | |
| "properties": { | |
| "keyword": { | |
| "type": "string", | |
| "description": "Keyword to fuzzy match against file paths (e.g., 'grpo', 'sft').", | |
| }, | |
| "repo": { | |
| "type": "string", | |
| "description": "Repository name (e.g., 'trl', 'transformers'). Required.", | |
| }, | |
| "org": { | |
| "type": "string", | |
| "description": "GitHub organization or username. Default: 'huggingface'.", | |
| }, | |
| "max_results": { | |
| "type": "integer", | |
| "description": "Maximum number of results to return. Default: 50.", | |
| }, | |
| "min_score": { | |
| "type": "integer", | |
| "description": "Minimum fuzzy match score (0-100). Default: 60.", | |
| }, | |
| }, | |
| "required": ["repo"], | |
| }, | |
| } | |
| async def github_find_examples_handler(arguments: Dict[str, Any]) -> tuple[str, bool]: | |
| """Handler for agent tool router""" | |
| try: | |
| result = find_examples( | |
| keyword=arguments.get("keyword", ""), | |
| repo=arguments["repo"], | |
| org=arguments.get("org", "huggingface"), | |
| max_results=arguments.get("max_results", 50), | |
| min_score=arguments.get("min_score", 60), | |
| ) | |
| return result["formatted"], not result.get("isError", False) | |
| except Exception as e: | |
| return f"Error finding examples: {str(e)}", False | |