| """ |
| GitHub Code Search Tool - Search code across GitHub with intelligent filtering |
| |
| Maps user-friendly patterns to GitHub's Code Search API capabilities. |
| """ |
|
|
| import fnmatch |
| import os |
| import re |
| from typing import Any, Dict, Optional |
|
|
| import requests |
|
|
| from agent.tools.types import ToolResult |
|
|
|
|
| def _glob_match(text: str, pattern: str) -> bool: |
| """Check if text matches glob pattern, supporting ** for multi-level paths""" |
| if "**" in pattern: |
| regex_pattern = pattern.replace("**", "<<<DOUBLESTAR>>>") |
| regex_pattern = fnmatch.translate(regex_pattern) |
| regex_pattern = regex_pattern.replace("<<<DOUBLESTAR>>>", ".*") |
| return re.match(regex_pattern, text) is not None |
| return fnmatch.fnmatch(text, pattern) |
|
|
|
|
| def _parse_repo_filter(repo_pattern: str) -> tuple[Optional[str], Optional[str]]: |
| """ |
| Parse repository pattern into GitHub API filter and client-side glob pattern. |
| |
| Returns: (api_filter, client_glob) |
| - api_filter: GitHub API filter string (e.g., "org:huggingface") |
| - client_glob: Pattern for client-side filtering (e.g., "huggingface/trl*") |
| |
| Examples: |
| "huggingface/trl" β ("repo:huggingface/trl", None) |
| "huggingface/*" β ("org:huggingface", "huggingface/*") |
| "huggingface/trl*" β ("org:huggingface", "huggingface/trl*") |
| "huggingface" β ("org:huggingface", None) |
| "*/*" β (None, "*/*") |
| """ |
| if not repo_pattern: |
| return None, None |
|
|
| |
| if "/" in repo_pattern and "*" not in repo_pattern and "?" not in repo_pattern: |
| return f"repo:{repo_pattern}", None |
|
|
| |
| if "/" in repo_pattern and ("*" in repo_pattern or "?" in repo_pattern): |
| org_name = repo_pattern.split("/")[0] |
| if "*" not in org_name and "?" not in org_name: |
| return f"org:{org_name}", repo_pattern |
| |
| return None, repo_pattern |
|
|
| |
| if "*" not in repo_pattern and "?" not in repo_pattern: |
| return f"org:{repo_pattern}", None |
|
|
| |
| return None, repo_pattern |
|
|
|
|
| def _parse_path_filter(path_pattern: str) -> tuple[Optional[str], Optional[str]]: |
| """ |
| Parse path pattern into GitHub API filter and client-side glob pattern. |
| |
| Returns: (api_filter, client_glob) |
| |
| Examples: |
| "*.py" β ("extension:py", None) |
| "**/*.py" β ("extension:py", None) |
| "src/**/*.py" β ("extension:py", "src/**/*.py") |
| "test_*.py" β ("extension:py", "test_*.py") |
| "src/main.py" β ("path:src/main.py", None) |
| """ |
| if not path_pattern: |
| return None, None |
|
|
| |
| if "*" not in path_pattern and "?" not in path_pattern: |
| return f"path:{path_pattern}", None |
|
|
| |
| ext_match = re.search(r"\*\.(\w+)$", path_pattern) |
| if ext_match: |
| extension = ext_match.group(1) |
| api_filter = f"extension:{extension}" |
|
|
| |
| |
| if path_pattern in [f"*.{extension}", f"**/*.{extension}"]: |
| |
| return api_filter, None |
| else: |
| |
| return api_filter, path_pattern |
|
|
| |
| |
| if "/" not in path_pattern: |
| |
| if "." in path_pattern: |
| parts = path_pattern.rsplit(".", 1) |
| if "*" not in parts[-1] and "?" not in parts[-1]: |
| |
| return f"extension:{parts[-1]}", path_pattern |
| |
| return None, path_pattern |
|
|
| |
| return None, path_pattern |
|
|
|
|
| def search_code( |
| query: str, |
| repo_pattern: Optional[str] = None, |
| path_pattern: Optional[str] = None, |
| regex: bool = False, |
| max_results: int = 20, |
| ) -> ToolResult: |
| """ |
| Search for code across GitHub with intelligent pattern matching. |
| |
| This tool intelligently maps user patterns to GitHub's Code Search API capabilities: |
| |
| Repository Patterns: |
| - "owner/repo" β Searches exact repository |
| - "owner/*" or "owner" β Searches all repos in organization |
| - "*/*" β Searches all GitHub (no repo filter) |
| - Wildcards trigger client-side filtering when needed |
| |
| Path Patterns: |
| - "*.py" β Searches all Python files |
| - "**/*.js" β Searches all JavaScript files (any directory) |
| - "src/**/*.py" β Python files in src/ (uses client-side filtering) |
| - "test_*.py" β Files matching pattern (client-side filtering) |
| - "path/to/file.py" β Exact file path |
| |
| Args: |
| query: Search term or pattern to find in code |
| repo_pattern: Repository pattern (e.g., "huggingface/trl", "huggingface/*", "huggingface") |
| path_pattern: File path pattern (e.g., "*.py", "src/**/*.js") |
| regex: If True, treat query as regular expression |
| max_results: Maximum number of results to return (default 20) |
| |
| Returns: |
| ToolResult with code matches and snippets |
| """ |
| token = os.environ.get("GITHUB_TOKEN") |
| if not token: |
| return { |
| "formatted": "Error: GITHUB_TOKEN environment variable is required", |
| "totalResults": 0, |
| "resultsShared": 0, |
| "isError": True, |
| } |
|
|
| |
| query_parts = [] |
|
|
| |
| if regex: |
| query_parts.append(f"/{query}/") |
| else: |
| query_parts.append(f'"{query}"' if " " in query else query) |
|
|
| |
| repo_api_filter, repo_client_glob = _parse_repo_filter(repo_pattern) |
| if repo_api_filter: |
| query_parts.append(repo_api_filter) |
|
|
| |
| path_api_filter, path_client_glob = _parse_path_filter(path_pattern) |
| if path_api_filter: |
| query_parts.append(path_api_filter) |
|
|
| github_query = " ".join(query_parts) |
|
|
| headers = { |
| "Accept": "application/vnd.github.text-match+json", |
| "X-GitHub-Api-Version": "2022-11-28", |
| "Authorization": f"Bearer {token}", |
| } |
|
|
| all_matches = [] |
| page = 1 |
| per_page = min(100, max_results) |
|
|
| try: |
| while len(all_matches) < max_results: |
| params = { |
| "q": github_query, |
| "page": page, |
| "per_page": per_page, |
| } |
|
|
| response = requests.get( |
| "https://api.github.com/search/code", |
| headers=headers, |
| params=params, |
| timeout=30, |
| ) |
|
|
| if response.status_code == 403: |
| error_data = response.json() |
| return { |
| "formatted": f"GitHub API rate limit or permission error: {error_data.get('message', 'Unknown error')}", |
| "totalResults": 0, |
| "resultsShared": 0, |
| "isError": True, |
| } |
|
|
| if response.status_code != 200: |
| error_msg = f"GitHub API error (status {response.status_code})" |
| try: |
| error_data = response.json() |
| if "message" in error_data: |
| error_msg += f": {error_data['message']}" |
| except Exception: |
| pass |
| return { |
| "formatted": error_msg, |
| "totalResults": 0, |
| "resultsShared": 0, |
| "isError": True, |
| } |
|
|
| data = response.json() |
| items = data.get("items", []) |
|
|
| if not items: |
| break |
|
|
| for item in items: |
| repo_name = item.get("repository", {}).get("full_name", "unknown") |
| file_path = item.get("path", "") |
| sha = item.get("sha", "") |
|
|
| |
| if repo_client_glob and not _glob_match(repo_name, repo_client_glob): |
| continue |
| if path_client_glob and not _glob_match(file_path, path_client_glob): |
| continue |
|
|
| |
| text_matches = item.get("text_matches", []) |
| if text_matches: |
| for text_match in text_matches: |
| fragment = text_match.get("fragment", "") |
| lines = fragment.split("\n") |
| line_count = len([line for line in lines if line.strip()]) |
|
|
| all_matches.append( |
| { |
| "repo": repo_name, |
| "path": file_path, |
| "ref": sha, |
| "line_start": 1, |
| "line_end": line_count, |
| "snippet": fragment.strip(), |
| "url": item.get("html_url", ""), |
| } |
| ) |
| else: |
| all_matches.append( |
| { |
| "repo": repo_name, |
| "path": file_path, |
| "ref": sha, |
| "line_start": 1, |
| "line_end": 1, |
| "snippet": "(snippet not available)", |
| "url": item.get("html_url", ""), |
| } |
| ) |
|
|
| if len(all_matches) >= data.get("total_count", 0): |
| break |
|
|
| page += 1 |
|
|
| except requests.exceptions.RequestException as e: |
| return { |
| "formatted": f"Failed to connect to GitHub API: {str(e)}", |
| "totalResults": 0, |
| "resultsShared": 0, |
| "isError": True, |
| } |
|
|
| results = all_matches[:max_results] |
|
|
| if not results: |
| return { |
| "formatted": f"No code matches found for query: {query}", |
| "totalResults": 0, |
| "resultsShared": 0, |
| } |
|
|
| |
| lines_output = [f"**Found {len(results)} code matches:**\n"] |
|
|
| for i, match in enumerate(results, 1): |
| lines_output.append(f"{i}. **{match['repo']}:{match['path']}**") |
| lines_output.append( |
| f" Lines: {match['line_start']}-{match['line_end']} | Ref: {match['ref'][:7]}" |
| ) |
| lines_output.append(f" URL: {match['url']}") |
|
|
| |
| read_params = f"{{'repo': '{match['repo']}', 'path': '{match['path']}', 'ref': '{match['ref'][:7]}'}}" |
| lines_output.append(f" To read, use: {read_params}") |
|
|
| |
| snippet_lines = match["snippet"].split("\n")[:5] |
| if snippet_lines: |
| lines_output.append(" ```") |
| for line in snippet_lines: |
| lines_output.append(f" {line}") |
| if len(match["snippet"].split("\n")) > 5: |
| lines_output.append(" ...") |
| lines_output.append(" ```") |
| lines_output.append("") |
|
|
| return { |
| "formatted": "\n".join(lines_output), |
| "totalResults": len(results), |
| "resultsShared": len(results), |
| } |
|
|
|
|
| |
| GITHUB_SEARCH_CODE_TOOL_SPEC = { |
| "name": "github_search_code", |
| "description": ( |
| "Search for specific code patterns, functions, or classes across GitHub repositories. " |
| "**Use when:** (1) Need to find specific function/class implementations, " |
| "(2) Looking for how specific APIs are used across repos, (3) Searching for specific patterns or methods, " |
| "(4) Investigating feature implementations across different projects, (5) Finding usage examples of specific imports or calls. " |
| "**Pattern:** github_search_code (find usage) β github_read_file (read full context) β understand implementation. " |
| "Returns: Code snippets with line numbers, file paths, and repo URLs. Intelligently maps patterns to GitHub API. " |
| "**Then:** Use github_read_file to read full file context. " |
| "**vs github_find_examples:** Use search_code for specific code patterns (e.g., 'AutoModelForCausalLM.from_pretrained'); " |
| "use find_examples for discovering tutorial/example files. " |
| "Supports regex searches for advanced patterns.\n\n" |
| "## When to use this tool\n\n" |
| "- When searching for specific code patterns, functions, or classes across repositories\n" |
| "- When looking for implementation examples of specific methods or APIs\n" |
| "- When you need to find where specific code exists across multiple files or repos\n" |
| "- When investigating how a feature is implemented in different repositories\n" |
| "- When searching for TODO comments, specific patterns, or code structures\n" |
| "- Use this for searching actual implementation code (not example files - use github_find_examples for those)\n\n" |
| "## When NOT to use this tool\n\n" |
| "- When looking for example/tutorial files (use github_find_examples instead)\n" |
| "- When you already know the exact file path (use github_read_file directly)\n" |
| "- When you need to list repositories (use github_list_repos instead)\n\n" |
| "## Repository Patterns\n\n" |
| "- **Exact repo**: `'huggingface/trl'` β Searches only that repository\n" |
| "- **Organization**: `'huggingface'` or `'huggingface/*'` β All repos in organization\n" |
| "- **All GitHub**: `'*/*'` or omit repo_pattern β Searches across all GitHub\n" |
| "- **Wildcards**: `'huggingface/trl*'` β Automatic client-side filtering for complex patterns\n\n" |
| "## Path Patterns\n\n" |
| "- **Extension**: `'*.py'` or `'**/*.py'` β All Python files\n" |
| "- **Directory**: `'src/**/*.js'` β JavaScript files in src/ directory (client-filtered)\n" |
| "- **Pattern**: `'test_*.py'` β Files matching pattern (client-filtered)\n" |
| "- **Exact path**: `'README.md'` β Specific file\n\n" |
| "## How it works\n\n" |
| "1. Parses repository and path patterns\n" |
| "2. Converts to GitHub API filters when possible (server-side, fast)\n" |
| "3. Falls back to client-side filtering for complex patterns\n" |
| "4. Returns code snippets with line numbers, URLs, and file refs\n" |
| "5. Results can be used directly with github_read_file tool\n\n" |
| "## Examples\n\n" |
| "<example>\n" |
| "// ML Workflow Step: Find how AutoModelForCausalLM is used\n" |
| "// Use case: Learning best practices for loading LLMs in TRL\n" |
| "{\n" |
| " query: 'AutoModelForCausalLM.from_pretrained',\n" |
| " repo_pattern: 'huggingface/trl',\n" |
| " path_pattern: '*.py'\n" |
| "}\n" |
| "// Finds all model loading patterns with quantization, device_map, etc.\n" |
| "</example>\n\n" |
| "<example>\n" |
| "// ML Workflow Step: Discover TrainingArguments configurations\n" |
| "// Use case: Setting up training hyperparameters correctly\n" |
| "{\n" |
| " query: 'TrainingArguments',\n" |
| " repo_pattern: 'huggingface/transformers',\n" |
| " path_pattern: 'examples/**/*.py',\n" |
| " max_results: 10\n" |
| "}\n" |
| "// Shows various TrainingArguments setups across different tasks\n" |
| "</example>\n\n" |
| "<example>\n" |
| "// ML Workflow Step: Find dataset preprocessing patterns\n" |
| "// Use case: Learning how to prepare data for instruction tuning\n" |
| "{\n" |
| " query: 'map(tokenize',\n" |
| " repo_pattern: 'huggingface',\n" |
| " path_pattern: '*.py'\n" |
| "}\n" |
| "// Discovers tokenization and dataset mapping patterns\n" |
| "</example>\n\n" |
| "<example>\n" |
| "// ML Workflow Step: Find all Trainer class implementations\n" |
| "// Use case: Understanding available trainer variants for different tasks\n" |
| "{\n" |
| " query: 'class \\\\w+Trainer\\\\(',\n" |
| " repo_pattern: 'huggingface/trl',\n" |
| " path_pattern: 'trl/trainer/**/*.py',\n" |
| " regex: true\n" |
| "}\n" |
| "// Lists: GRPOTrainer, DPOTrainer, PPOTrainer, RewardTrainer, etc.\n" |
| "</example>" |
| ), |
| "parameters": { |
| "type": "object", |
| "properties": { |
| "query": { |
| "type": "string", |
| "description": "Search term or pattern to find in code. Required.", |
| }, |
| "repo_pattern": { |
| "type": "string", |
| "description": "Repository pattern: 'owner/repo' (exact), 'owner' (org), 'owner/*' (org with filter), '*/*' (all). Optional.", |
| }, |
| "path_pattern": { |
| "type": "string", |
| "description": "File path pattern: '*.ext' (extension), 'dir/**/*.ext' (directory), 'pattern*.ext' (name pattern). Optional.", |
| }, |
| "regex": { |
| "type": "boolean", |
| "description": "If true, treat query as regular expression. Default: false.", |
| }, |
| "max_results": { |
| "type": "integer", |
| "description": "Maximum number of results to return. Default: 20.", |
| }, |
| }, |
| "required": ["query"], |
| }, |
| } |
|
|
|
|
| async def github_search_code_handler(arguments: Dict[str, Any]) -> tuple[str, bool]: |
| """Handler for agent tool router""" |
| try: |
| result = search_code( |
| query=arguments["query"], |
| repo_pattern=arguments.get("repo_pattern"), |
| path_pattern=arguments.get("path_pattern"), |
| regex=arguments.get("regex", False), |
| max_results=arguments.get("max_results", 20), |
| ) |
| return result["formatted"], not result.get("isError", False) |
| except Exception as e: |
| return f"Error searching code: {str(e)}", False |
|
|