akseljoonas HF Staff committed on
Commit
9d53405
·
1 Parent(s): 6a4fd73

deleted search code

Browse files
agent/core/tools.py CHANGED
@@ -301,13 +301,6 @@ def create_builtin_tools() -> list[ToolSpec]:
301
  parameters=HF_REPO_GIT_TOOL_SPEC["parameters"],
302
  handler=hf_repo_git_handler,
303
  ),
304
- # NOTE: Github search code tool disabled - a bit buggy
305
- # ToolSpec(
306
- # name=GITHUB_SEARCH_CODE_TOOL_SPEC["name"],
307
- # description=GITHUB_SEARCH_CODE_TOOL_SPEC["description"],
308
- # parameters=GITHUB_SEARCH_CODE_TOOL_SPEC["parameters"],
309
- # handler=github_search_code_handler,
310
- # ),
311
  ToolSpec(
312
  name=GITHUB_FIND_EXAMPLES_TOOL_SPEC["name"],
313
  description=GITHUB_FIND_EXAMPLES_TOOL_SPEC["description"],
 
301
  parameters=HF_REPO_GIT_TOOL_SPEC["parameters"],
302
  handler=hf_repo_git_handler,
303
  ),
 
 
 
 
 
 
 
304
  ToolSpec(
305
  name=GITHUB_FIND_EXAMPLES_TOOL_SPEC["name"],
306
  description=GITHUB_FIND_EXAMPLES_TOOL_SPEC["description"],
agent/tools/github_search_code.py DELETED
@@ -1,459 +0,0 @@
1
- """
2
- GitHub Code Search Tool - Search code across GitHub with intelligent filtering
3
-
4
- Maps user-friendly patterns to GitHub's Code Search API capabilities.
5
- """
6
-
7
- import fnmatch
8
- import os
9
- import re
10
- from typing import Any, Dict, Optional
11
-
12
- import requests
13
-
14
- from agent.tools.types import ToolResult
15
-
16
-
17
- def _glob_match(text: str, pattern: str) -> bool:
18
- """Check if text matches glob pattern, supporting ** for multi-level paths"""
19
- if "**" in pattern:
20
- regex_pattern = pattern.replace("**", "<<<DOUBLESTAR>>>")
21
- regex_pattern = fnmatch.translate(regex_pattern)
22
- regex_pattern = regex_pattern.replace("<<<DOUBLESTAR>>>", ".*")
23
- return re.match(regex_pattern, text) is not None
24
- return fnmatch.fnmatch(text, pattern)
25
-
26
-
27
- def _parse_repo_filter(repo_pattern: str) -> tuple[Optional[str], Optional[str]]:
28
- """
29
- Parse repository pattern into GitHub API filter and client-side glob pattern.
30
-
31
- Returns: (api_filter, client_glob)
32
- - api_filter: GitHub API filter string (e.g., "org:huggingface")
33
- - client_glob: Pattern for client-side filtering (e.g., "huggingface/trl*")
34
-
35
- Examples:
36
- "huggingface/trl" → ("repo:huggingface/trl", None)
37
- "huggingface/*" → ("org:huggingface", "huggingface/*")
38
- "huggingface/trl*" → ("org:huggingface", "huggingface/trl*")
39
- "huggingface" → ("org:huggingface", None)
40
- "*/*" → (None, "*/*")
41
- """
42
- if not repo_pattern:
43
- return None, None
44
-
45
- # Pattern: owner/repo (exact match)
46
- if "/" in repo_pattern and "*" not in repo_pattern and "?" not in repo_pattern:
47
- return f"repo:{repo_pattern}", None
48
-
49
- # Pattern: owner/* or owner/prefix* (org + client filter)
50
- if "/" in repo_pattern and ("*" in repo_pattern or "?" in repo_pattern):
51
- org_name = repo_pattern.split("/")[0]
52
- if "*" not in org_name and "?" not in org_name:
53
- return f"org:{org_name}", repo_pattern
54
- # Org name has wildcards - can't filter server-side
55
- return None, repo_pattern
56
-
57
- # Pattern: owner (just org name, no wildcards)
58
- if "*" not in repo_pattern and "?" not in repo_pattern:
59
- return f"org:{repo_pattern}", None
60
-
61
- # Pattern: */* or other complex patterns (client-side only)
62
- return None, repo_pattern
63
-
64
-
65
- def _parse_path_filter(path_pattern: str) -> tuple[Optional[str], Optional[str]]:
66
- """
67
- Parse path pattern into GitHub API filter and client-side glob pattern.
68
-
69
- Returns: (api_filter, client_glob)
70
-
71
- Examples:
72
- "*.py" → ("extension:py", None)
73
- "**/*.py" → ("extension:py", None)
74
- "src/**/*.py" → ("extension:py", "src/**/*.py")
75
- "test_*.py" → ("extension:py", "test_*.py")
76
- "src/main.py" → ("path:src/main.py", None)
77
- """
78
- if not path_pattern:
79
- return None, None
80
-
81
- # Exact path (no wildcards)
82
- if "*" not in path_pattern and "?" not in path_pattern:
83
- return f"path:{path_pattern}", None
84
-
85
- # Extract extension if present
86
- ext_match = re.search(r"\*\.(\w+)$", path_pattern)
87
- if ext_match:
88
- extension = ext_match.group(1)
89
- api_filter = f"extension:{extension}"
90
-
91
- # Check if there's a directory prefix that needs client-side filtering
92
- # e.g., "src/**/*.py" needs client filter, "**/*.py" doesn't
93
- if path_pattern in [f"*.{extension}", f"**/*.{extension}"]:
94
- # Simple patterns - API filter is enough
95
- return api_filter, None
96
- else:
97
- # Complex pattern - need client-side filter too
98
- return api_filter, path_pattern
99
-
100
- # Pattern like "test_*.py" or "README*" - use filename with client filter
101
- # GitHub's filename: doesn't support wildcards, so we rely on client-side
102
- if "/" not in path_pattern:
103
- # Try to extract extension for API filtering
104
- if "." in path_pattern:
105
- parts = path_pattern.rsplit(".", 1)
106
- if "*" not in parts[-1] and "?" not in parts[-1]:
107
- # Extension is clean
108
- return f"extension:{parts[-1]}", path_pattern
109
- # No extension or complex - client-side only
110
- return None, path_pattern
111
-
112
- # Complex path pattern - client-side only
113
- return None, path_pattern
114
-
115
-
116
def _search_error(message: str) -> dict:
    """Build the error-shaped result returned by every failure path of search_code."""
    return {
        "formatted": message,
        "totalResults": 0,
        "resultsShared": 0,
        "isError": True,
    }


def _github_error_message(response) -> Optional[str]:
    """Return the ``message`` field of a GitHub error body, or None if it cannot be read."""
    try:
        payload = response.json()
    except ValueError:
        # Body was not JSON (e.g. an HTML error page from a proxy).
        return None
    if isinstance(payload, dict):
        return payload.get("message")
    return None


def _format_code_matches(results: list) -> str:
    """Render collected match dicts as the markdown-style text shown to the agent."""
    lines_output = [f"**Found {len(results)} code matches:**\n"]

    for i, match in enumerate(results, 1):
        lines_output.append(f"{i}. **{match['repo']}:{match['path']}**")
        lines_output.append(
            f" Lines: {match['line_start']}-{match['line_end']} | Ref: {match['ref'][:7]}"
        )
        lines_output.append(f" URL: {match['url']}")

        # Copyable parameters for read_file tool
        read_params = f"{{'repo': '{match['repo']}', 'path': '{match['path']}', 'ref': '{match['ref'][:7]}'}}"
        lines_output.append(f" To read, use: {read_params}")

        # Show snippet (first 5 lines); str.split always yields at least one
        # element, so the fence is emitted for every match.
        all_snippet_lines = match["snippet"].split("\n")
        lines_output.append(" ```")
        for line in all_snippet_lines[:5]:
            lines_output.append(f" {line}")
        if len(all_snippet_lines) > 5:
            lines_output.append(" ...")
        lines_output.append(" ```")
        lines_output.append("")

    return "\n".join(lines_output)


def search_code(
    query: str,
    repo_pattern: Optional[str] = None,
    path_pattern: Optional[str] = None,
    regex: bool = False,
    max_results: int = 20,
) -> ToolResult:
    """
    Search for code across GitHub via the REST code-search endpoint.

    The repository and path patterns are each split into a server-side
    qualifier (added to the query string) and an optional client-side glob
    (applied to every returned item with ``_glob_match``).

    Repository Patterns:
    - "owner/repo" → Searches exact repository
    - "owner/*" or "owner" → Searches all repos in organization
    - "*/*" → Searches all GitHub (no repo filter)
    - Wildcards trigger client-side filtering when needed

    Path Patterns:
    - "*.py" → Searches all Python files
    - "**/*.js" → Searches all JavaScript files (any directory)
    - "src/**/*.py" → Python files in src/ (uses client-side filtering)
    - "test_*.py" → Files matching pattern (client-side filtering)
    - "path/to/file.py" → Exact file path

    Args:
        query: Search term or pattern to find in code
        repo_pattern: Repository pattern (e.g., "huggingface/trl", "huggingface/*", "huggingface")
        path_pattern: File path pattern (e.g., "*.py", "src/**/*.js")
        regex: If True, the query is sent wrapped in slashes ("/query/")
        max_results: Maximum number of results to return (default 20)

    Returns:
        A dict with "formatted" (text for the agent), "totalResults" and
        "resultsShared" (both the number of matches returned), plus
        "isError": True on failure. Requires the GITHUB_TOKEN environment
        variable; without it an error result is returned and no request is made.
    """
    token = os.environ.get("GITHUB_TOKEN")
    if not token:
        return _search_error("Error: GITHUB_TOKEN environment variable is required")

    # Build GitHub API query
    query_parts = []

    # Add search term
    # NOTE(review): the "/.../" form is what this code sends for regex=True;
    # whether the REST search endpoint interprets it as a regex should be confirmed.
    if regex:
        query_parts.append(f"/{query}/")
    else:
        query_parts.append(f'"{query}"' if " " in query else query)

    # Parse repository filter
    repo_api_filter, repo_client_glob = _parse_repo_filter(repo_pattern)
    if repo_api_filter:
        query_parts.append(repo_api_filter)

    # Parse path filter
    path_api_filter, path_client_glob = _parse_path_filter(path_pattern)
    if path_api_filter:
        query_parts.append(path_api_filter)

    github_query = " ".join(query_parts)

    headers = {
        "Accept": "application/vnd.github.text-match+json",
        "X-GitHub-Api-Version": "2022-11-28",
        "Authorization": f"Bearer {token}",
    }

    # GitHub search only exposes the first 1000 results of any query; asking
    # for a page beyond that window is answered with an error.
    search_result_cap = 1000

    # When items may be discarded client-side, fetch full pages so that fewer
    # requests are needed to collect max_results surviving matches.
    if repo_client_glob or path_client_glob:
        per_page = 100
    else:
        per_page = min(100, max_results)

    all_matches = []
    items_seen = 0  # raw API items received, independent of client-side filtering
    page = 1

    try:
        while len(all_matches) < max_results:
            response = requests.get(
                "https://api.github.com/search/code",
                headers=headers,
                params={
                    "q": github_query,
                    "page": page,
                    "per_page": per_page,
                },
                timeout=30,
            )

            if response.status_code == 403:
                message = _github_error_message(response)
                if message is None:
                    message = "Unknown error"
                return _search_error(
                    f"GitHub API rate limit or permission error: {message}"
                )

            if response.status_code != 200:
                error_msg = f"GitHub API error (status {response.status_code})"
                message = _github_error_message(response)
                if message is not None:
                    error_msg += f": {message}"
                return _search_error(error_msg)

            data = response.json()
            items = data.get("items", [])

            if not items:
                break

            items_seen += len(items)

            for item in items:
                repo_name = item.get("repository", {}).get("full_name", "unknown")
                file_path = item.get("path", "")
                # NOTE(review): "sha" is emitted as the 'ref' for follow-up reads;
                # confirm that the read tool accepts this value as a ref.
                sha = item.get("sha", "")

                # Apply client-side filtering
                if repo_client_glob and not _glob_match(repo_name, repo_client_glob):
                    continue
                if path_client_glob and not _glob_match(file_path, path_client_glob):
                    continue

                # One match entry per returned text fragment. line_start/line_end
                # are relative to the fragment (1..number of non-blank lines),
                # not positions in the file.
                text_matches = item.get("text_matches", [])
                if text_matches:
                    for text_match in text_matches:
                        fragment = text_match.get("fragment", "")
                        line_count = len(
                            [line for line in fragment.split("\n") if line.strip()]
                        )
                        all_matches.append(
                            {
                                "repo": repo_name,
                                "path": file_path,
                                "ref": sha,
                                "line_start": 1,
                                "line_end": line_count,
                                "snippet": fragment.strip(),
                                "url": item.get("html_url", ""),
                            }
                        )
                else:
                    all_matches.append(
                        {
                            "repo": repo_name,
                            "path": file_path,
                            "ref": sha,
                            "line_start": 1,
                            "line_end": 1,
                            "snippet": "(snippet not available)",
                            "url": item.get("html_url", ""),
                        }
                    )

            # Stop once every reachable item has been fetched. This must count
            # API items rather than collected matches: matches are per-fragment
            # and post-filter, so their number says nothing about how many
            # result pages remain.
            reachable = min(data.get("total_count", 0), search_result_cap)
            if items_seen >= reachable:
                break

            page += 1

    except requests.exceptions.RequestException as e:
        return _search_error(f"Failed to connect to GitHub API: {str(e)}")

    results = all_matches[:max_results]

    if not results:
        return {
            "formatted": f"No code matches found for query: {query}",
            "totalResults": 0,
            "resultsShared": 0,
        }

    return {
        "formatted": _format_code_matches(results),
        "totalResults": len(results),
        "resultsShared": len(results),
    }
331
-
332
-
333
# Tool specification
#
# Declarative description of the "github_search_code" tool: its name, the
# long natural-language description, and a JSON-Schema-style "parameters"
# object. Only "query" is required; the defaults quoted in the parameter
# descriptions (regex false, max_results 20) match the defaults in
# search_code's signature.
#
# NOTE(review): the description promises "code snippets with line numbers",
# while search_code reports fragment-relative ranges (line_start is always 1).
# Confirm whether the wording or the implementation should change.
# NOTE(review): the description advertises regex support; search_code only
# wraps the query in slashes, so confirm the API honours that form.
GITHUB_SEARCH_CODE_TOOL_SPEC = {
    "name": "github_search_code",
    # Adjacent string literals are concatenated into one description string.
    "description": (
        "Search for specific code patterns, functions, or classes across GitHub repositories. "
        "**Use when:** (1) Need to find specific function/class implementations, "
        "(2) Looking for how specific APIs are used across repos, (3) Searching for specific patterns or methods, "
        "(4) Investigating feature implementations across different projects, (5) Finding usage examples of specific imports or calls. "
        "**Pattern:** github_search_code (find usage) → github_read_file (read full context) → understand implementation. "
        "Returns: Code snippets with line numbers, file paths, and repo URLs. Intelligently maps patterns to GitHub API. "
        "**Then:** Use github_read_file to read full file context. "
        "**vs github_find_examples:** Use search_code for specific code patterns (e.g., 'AutoModelForCausalLM.from_pretrained'); "
        "use find_examples for discovering tutorial/example files. "
        "Supports regex searches for advanced patterns.\n\n"
        "## When to use this tool\n\n"
        "- When searching for specific code patterns, functions, or classes across repositories\n"
        "- When looking for implementation examples of specific methods or APIs\n"
        "- When you need to find where specific code exists across multiple files or repos\n"
        "- When investigating how a feature is implemented in different repositories\n"
        "- When searching for TODO comments, specific patterns, or code structures\n"
        "- Use this for searching actual implementation code (not example files - use github_find_examples for those)\n\n"
        "## When NOT to use this tool\n\n"
        "- When looking for example/tutorial files (use github_find_examples instead)\n"
        "- When you already know the exact file path (use github_read_file directly)\n"
        "- When you need to list repositories (use github_list_repos instead)\n\n"
        "## Repository Patterns\n\n"
        "- **Exact repo**: `'huggingface/trl'` → Searches only that repository\n"
        "- **Organization**: `'huggingface'` or `'huggingface/*'` → All repos in organization\n"
        "- **All GitHub**: `'*/*'` or omit repo_pattern → Searches across all GitHub\n"
        "- **Wildcards**: `'huggingface/trl*'` → Automatic client-side filtering for complex patterns\n\n"
        "## Path Patterns\n\n"
        "- **Extension**: `'*.py'` or `'**/*.py'` → All Python files\n"
        "- **Directory**: `'src/**/*.js'` → JavaScript files in src/ directory (client-filtered)\n"
        "- **Pattern**: `'test_*.py'` → Files matching pattern (client-filtered)\n"
        "- **Exact path**: `'README.md'` → Specific file\n\n"
        "## How it works\n\n"
        "1. Parses repository and path patterns\n"
        "2. Converts to GitHub API filters when possible (server-side, fast)\n"
        "3. Falls back to client-side filtering for complex patterns\n"
        "4. Returns code snippets with line numbers, URLs, and file refs\n"
        "5. Results can be used directly with github_read_file tool\n\n"
        "## Examples\n\n"
        "<example>\n"
        "// ML Workflow Step: Find how AutoModelForCausalLM is used\n"
        "// Use case: Learning best practices for loading LLMs in TRL\n"
        "{\n"
        " query: 'AutoModelForCausalLM.from_pretrained',\n"
        " repo_pattern: 'huggingface/trl',\n"
        " path_pattern: '*.py'\n"
        "}\n"
        "// Finds all model loading patterns with quantization, device_map, etc.\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Discover TrainingArguments configurations\n"
        "// Use case: Setting up training hyperparameters correctly\n"
        "{\n"
        " query: 'TrainingArguments',\n"
        " repo_pattern: 'huggingface/transformers',\n"
        " path_pattern: 'examples/**/*.py',\n"
        " max_results: 10\n"
        "}\n"
        "// Shows various TrainingArguments setups across different tasks\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Find dataset preprocessing patterns\n"
        "// Use case: Learning how to prepare data for instruction tuning\n"
        "{\n"
        " query: 'map(tokenize',\n"
        " repo_pattern: 'huggingface',\n"
        " path_pattern: '*.py'\n"
        "}\n"
        "// Discovers tokenization and dataset mapping patterns\n"
        "</example>\n\n"
        "<example>\n"
        "// ML Workflow Step: Find all Trainer class implementations\n"
        "// Use case: Understanding available trainer variants for different tasks\n"
        "{\n"
        " query: 'class \\\\w+Trainer\\\\(',\n"
        " repo_pattern: 'huggingface/trl',\n"
        " path_pattern: 'trl/trainer/**/*.py',\n"
        " regex: true\n"
        "}\n"
        "// Lists: GRPOTrainer, DPOTrainer, PPOTrainer, RewardTrainer, etc.\n"
        "</example>"
    ),
    # Schema of the arguments dict that github_search_code_handler receives.
    "parameters": {
        "type": "object",
        "properties": {
            "query": {
                "type": "string",
                "description": "Search term or pattern to find in code. Required.",
            },
            "repo_pattern": {
                "type": "string",
                "description": "Repository pattern: 'owner/repo' (exact), 'owner' (org), 'owner/*' (org with filter), '*/*' (all). Optional.",
            },
            "path_pattern": {
                "type": "string",
                "description": "File path pattern: '*.ext' (extension), 'dir/**/*.ext' (directory), 'pattern*.ext' (name pattern). Optional.",
            },
            "regex": {
                "type": "boolean",
                "description": "If true, treat query as regular expression. Default: false.",
            },
            "max_results": {
                "type": "integer",
                "description": "Maximum number of results to return. Default: 20.",
            },
        },
        "required": ["query"],
    },
}
445
-
446
-
447
async def github_search_code_handler(arguments: Dict[str, Any]) -> tuple[str, bool]:
    """Handler for agent tool router.

    Validates the arguments, forwards them to ``search_code`` and converts
    its result dict into the ``(text, success)`` pair returned to the router.

    Args:
        arguments: Tool-call arguments. "query" is required; "repo_pattern",
            "path_pattern", "regex" and "max_results" are optional.

    Returns:
        ``(formatted_text, True)`` on success, or ``(error_message, False)``
        when validation fails, the search reports an error, or an exception
        is raised. Exceptions are never propagated to the caller.
    """
    try:
        # Reject a missing/blank query up front: otherwise the caller only sees
        # the bare KeyError text, or an empty query is sent to the API.
        query = arguments.get("query")
        if not isinstance(query, str) or not query.strip():
            return (
                "Error: 'query' parameter is required and must be a non-empty string",
                False,
            )

        result = search_code(
            query=query,
            repo_pattern=arguments.get("repo_pattern"),
            path_pattern=arguments.get("path_pattern"),
            regex=arguments.get("regex", False),
            max_results=arguments.get("max_results", 20),
        )
        return result["formatted"], not result.get("isError", False)
    except Exception as e:
        # Top-level tool boundary: report the failure as text instead of raising.
        return f"Error searching code: {str(e)}", False