"""
Simplified Gradio MCP Server for Knowledge Graphs loaded from HuggingFace datasets.
"""
import functools
import os
import sys
import argparse
import difflib
import fnmatch
import re
from typing import Optional, List
import gradio as gr
from RepoKnowledgeGraphLib.utils.chunk_utils import (
    organize_chunks_by_file_name, join_organized_chunks
    )

# Optional Langfuse integration: if the package is missing or auth fails we
# degrade to a no-op `observe` decorator so the tools still work untraced.
try:
    from langfuse import get_client, observe
    langfuse = get_client()
    LANGFUSE_ENABLED = langfuse.auth_check()
    if LANGFUSE_ENABLED:
        print("✓ Langfuse client is authenticated and ready!")
    else:
        print("⚠️ Langfuse authentication failed. Tracing disabled.")
except Exception as e:
    print(f"⚠️ Langfuse not available: {e}. Tracing disabled.")
    LANGFUSE_ENABLED = False

    def observe(*args, **kwargs):
        """No-op stand-in for ``langfuse.observe`` when Langfuse is unavailable."""
        def decorator(func):
            return func
        return decorator


def _sanitize_value(v):
    """Recursively strip leading/trailing whitespace from strings inside ``v``.

    Dicts, lists and tuples are rebuilt with sanitized members (preserving the
    container's concrete type); any other value is returned unchanged.
    """
    if isinstance(v, str):
        return v.strip()
    if isinstance(v, dict):
        return {k: _sanitize_value(val) for k, val in v.items()}
    if isinstance(v, (list, tuple)):
        t = type(v)
        # namedtuple subclasses take positional args, not a single iterable;
        # calling t(generator) on one raises TypeError.
        if isinstance(v, tuple) and hasattr(v, '_fields'):
            return t(*(_sanitize_value(x) for x in v))
        return t(_sanitize_value(x) for x in v)
    return v


def sanitize_inputs(func):
    """Decorator that trims whitespace from all string args/kwargs before calling func."""
    # functools.wraps preserves __name__, __doc__, __qualname__, __module__ and
    # signature introspection — more complete than hand-copying attributes.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        new_args = tuple(_sanitize_value(a) for a in args)
        new_kwargs = {k: _sanitize_value(v) for k, v in kwargs.items()}
        return func(*new_args, **new_kwargs)
    return wrapper


# Wrap the existing `observe` decorator (from langfuse or fallback) so that
# all observed tools receive sanitized inputs automatically. This avoids
# having to manually add `@sanitize_inputs` above every `@observe`.
try:
    _original_observe = observe

    def _observe_with_sanitize(*o_args, **o_kwargs):
        def decorator(f):
            return _original_observe(*o_args, **o_kwargs)(sanitize_inputs(f))
        return decorator

    observe = _observe_with_sanitize
except Exception:
    # If anything goes wrong, keep the existing observe as-is.
    pass

# Add parent directory to path
sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'RepoKnowledgeGraphLib'))

from RepoKnowledgeGraphLib.RepoKnowledgeGraph import RepoKnowledgeGraph

# Global knowledge graph instance
knowledge_graph = None


def initialize_knowledge_graph(
    hf_dataset: str,
    hf_token: Optional[str] = None,
    index_nodes: bool = True,
    code_index_kwargs: Optional[dict] = None
):
    """Initialize the knowledge graph from a HuggingFace dataset."""
    global knowledge_graph

    model_service_kwargs = {
        "embedder_type": "sentence-transformers",
        "embed_model_name": "Salesforce/SFR-Embedding-Code-400M_R",
    }

    print(f"Loading knowledge graph from HuggingFace dataset: {hf_dataset}")
    knowledge_graph = RepoKnowledgeGraph.from_hf_dataset(
        repo_id=hf_dataset,
        index_nodes=index_nodes,
        model_service_kwargs=model_service_kwargs,
        code_index_kwargs=code_index_kwargs,
        token=hf_token
    )


# ==================== Tool Functions ====================
@observe(as_type="tool")
def get_node_info(node_id: str) -> str:
    """
    Retrieve comprehensive details about any node in the Transformers library knowledge graph.

    PURPOSE:
    Use this tool to inspect the full metadata and content of a specific node when you need
    to understand what a particular code element contains, what entities it declares or calls,
    and how it fits into the codebase structure.

    WHEN TO USE:
    - After finding a node ID from search_nodes, list_nodes_by_type, or get_neighbors
    - To see the actual code content of a chunk node
    - To understand what entities (classes, functions, variables) are declared in a file or chunk
    - To examine entity metadata including aliases, declaration locations, and usage locations
    - To get file metadata like language and path information

    NODE TYPES SUPPORTED:
    - 'chunk': Code segments with content, declared/called entities, and file position
    - 'file': Source files with path, language, and entity summaries
    - 'directory': Folder nodes with path information
    - 'entity': Programming constructs (classes, functions, methods, variables) with declaration/usage tracking
    - 'repo': Repository root node

    TYPICAL WORKFLOW:
    1. search_nodes("attention mechanism") -> get node IDs
    2. get_node_info(node_id) -> see full content and metadata
    3. get_neighbors(node_id) or find_usages(entity_name) -> explore relationships

    Args:
        node_id: The unique identifier of the node (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_3' for chunks, or 'BertModel' for entities)

    Returns:
        str: Formatted details including node type, name, description, content (for chunks), declared entities, called entities, and type-specific metadata

    Example node_ids:
    - Chunk: 'src/transformers/models/bert/modeling_bert.py::chunk_5'
    - File: 'src/transformers/models/bert/modeling_bert.py'
    - Entity: 'BertModel', 'forward', 'attention_mask'
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        node = knowledge_graph.graph.nodes[node_id]['data']
        node_type = getattr(node, 'node_type', 'Unknown')
        node_class = node.__class__.__name__
        node_name = getattr(node, 'name', 'Unknown')
        description = getattr(node, 'description', None)

        def _capped_list(items, cap):
            # Render at most `cap` items, then an "... and N more" marker.
            text = ""
            for item in items[:cap]:
                text += f" - {item}\n"
            if len(items) > cap:
                text += f" ... and {len(items) - cap} more\n"
            return text

        result = "Node Information:\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        result += f"Node ID: {node_id}\nClass: {node_class}\nName: {node_name}\nType: {node_type}\n"
        result += f"Description: {description or 'N/A'}\n"

        if node_class == 'EntityNode' or node_type == 'entity':
            entity_type = getattr(node, 'entity_type', 'Unknown')
            declaring_chunk_ids = getattr(node, 'declaring_chunk_ids', [])
            calling_chunk_ids = getattr(node, 'calling_chunk_ids', [])
            aliases = getattr(node, 'aliases', [])

            result += f"\nEntity Type: {entity_type}\n"
            result += f"Aliases: {', '.join(aliases) if aliases else 'None'}\n"
            result += f"Declared in {len(declaring_chunk_ids)} chunk(s):\n"
            result += _capped_list(declaring_chunk_ids, 5)
            result += f"Called in {len(calling_chunk_ids)} chunk(s):\n"
            result += _capped_list(calling_chunk_ids, 5)
            result += (
                f"\nSummary: Entity {node_id} ({node_name}) — {entity_type} declared in "
                f"{len(declaring_chunk_ids)} chunk(s) and called in {len(calling_chunk_ids)} chunk(s).\n"
            )
        else:
            declared_entities = getattr(node, 'declared_entities', [])
            called_entities = getattr(node, 'called_entities', [])

            result += f"\nDeclared Entities ({len(declared_entities)}):\n"
            result += _capped_list(declared_entities, 10)
            result += f"\nCalled Entities ({len(called_entities)}):\n"
            result += _capped_list(called_entities, 10)

            # Add content preview for file/chunk nodes
            if node_type in ['file', 'chunk']:
                content = getattr(node, 'content', None)
                result += f"\nContent:\n{content or 'N/A'}\n"
                if hasattr(node, 'path'):
                    result += f"Path: {node.path}\n"
                if hasattr(node, 'language'):
                    result += f"Language: {node.language}\n"
                if node_type == 'chunk' and hasattr(node, 'order_in_file'):
                    result += f"Order in File: {node.order_in_file}\n"
            elif node_type == 'directory':
                if hasattr(node, 'path'):
                    result += f"Path: {node.path}\n"

            result += (
                f"\nSummary: Node {node_id} ({node_name}) — {node_type} with "
                f"{len(declared_entities)} declared and {len(called_entities)} called entities.\n"
            )

        return result
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_node_edges(node_id: str) -> str:
    """
    List all graph edges (relationships) connected to a specific node in the knowledge graph.

    PURPOSE:
    Use this tool to understand how a node is connected to other parts of the codebase.
    Reveals the dependency structure and relationships that link code elements together.

    WHEN TO USE:
    - To discover what code calls or depends on a specific function/class
    - To find parent-child relationships (e.g., which file contains a chunk)
    - To trace declaration and usage patterns through the codebase
    - To understand the connectivity of an entity in the dependency graph
    - When you need a raw view of all relationships without filtering

    EDGE TYPES YOU'LL SEE:
    - 'contains': Parent-child (file→chunk, directory→file, repo→directory)
    - 'calls': Entity usage relationships (chunk→entity it calls)
    - 'declares': Entity declaration relationships (chunk→entity it defines)

    DIRECTION MEANINGS:
    - Incoming edges (←): Other nodes pointing TO this node (e.g., "who calls me?")
    - Outgoing edges (→): This node pointing TO others (e.g., "what do I call?")

    COMPARISON WITH get_neighbors:
    - get_node_edges: Shows edge metadata and direction, raw relationship view
    - get_neighbors: Shows neighboring node details, easier for exploration

    Args:
        node_id: The unique identifier of the node to inspect edges for

    Returns:
        str: List of incoming and outgoing edges with source/target node IDs and relationship types
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        graph = knowledge_graph.graph

        # Collect both directions up front; 'relation' falls back to '?' when absent.
        incoming = [
            {"source": src, "target": tgt, "relation": data.get("relation", "?")}
            for src, tgt, data in graph.in_edges(node_id, data=True)
        ]
        outgoing = [
            {"source": src, "target": tgt, "relation": data.get("relation", "?")}
            for src, tgt, data in graph.out_edges(node_id, data=True)
        ]

        result = f"Node Edges for '{node_id}':\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
        result += f"Incoming Edges ({len(incoming)}):\n"
        for edge in incoming[:20]:
            result += f" ← {edge['source']} [{edge['relation']}]\n"
        if len(incoming) > 20:
            result += f" ... and {len(incoming) - 20} more\n"

        result += f"\nOutgoing Edges ({len(outgoing)}):\n"
        for edge in outgoing[:20]:
            result += f" → {edge['target']} [{edge['relation']}]\n"
        if len(outgoing) > 20:
            result += f" ... and {len(outgoing) - 20} more\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def search_nodes(query: str, limit: int = 10, page: int = 1) -> str:
    """
    Search the Transformers codebase using keyword matching against code content and metadata.

    PURPOSE:
    This is your PRIMARY SEARCH TOOL for exploring the codebase. Use it to find relevant
    code chunks based on natural language queries, function names, class names, comments,
    or any text that might appear in the source code.

    WHEN TO USE:
    - FIRST STEP when investigating any topic in the Transformers library
    - To find implementations of specific features (e.g., "rotary embeddings", "flash attention")
    - To locate code by function/class name when you don't have the exact node ID
    - To discover code related to a concept (e.g., "gradient checkpointing", "tokenization")
    - When you don't know where something is implemented

    SEARCH TIPS:
    - Use specific technical terms: "rope embedding" rather than just "embedding"
    - Include class/function names if known: "BertSelfAttention forward"
    - Try multiple related queries if first results aren't satisfactory
    - Results are ranked by relevance to your query

    TYPICAL WORKFLOW:
    1. search_nodes("attention mask handling") -> find relevant chunks
    2. get_node_info(chunk_id) -> examine the code content
    3. get_chunk_context(chunk_id) -> see surrounding code for fuller picture
    4. go_to_definition(entity_name) -> find where an entity is defined

    Args:
        query: Search terms to match against code content. Can be natural language, function names, class names, or code snippets. More specific queries yield better results.
        limit: Results per page (default: 10, max recommended: 50). Use smaller limits for faster responses.
        page: Page number starting from 1. Use pagination to browse through many results.

    Returns:
        str: Ranked list of matching code chunks with IDs and content previews. Use the returned IDs with get_node_info or get_chunk_context for full details.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Convert limit to int if it's a string (MCP may pass strings)
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"

        # Convert page to int if it's a string
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        # BUG FIX: previously we fetched exactly limit*page results, so the
        # computed total could never exceed the current page and the
        # "next page" hint was unreachable. Fetch one sentinel result beyond
        # the current page to detect whether a further page exists.
        max_fetch = limit * page + 1
        results = knowledge_graph.code_index.query(query, n_results=max_fetch)
        metadatas = results.get("metadatas", [[]])[0]

        if not metadatas:
            return f"No results found for '{query}'."

        has_next = len(metadatas) > limit * page
        # Exclude the sentinel from the totals shown to the caller.
        metadatas = metadatas[:limit * page]
        total = len(metadatas)

        # Pagination
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} results at {limit} per page)"

        start_idx = (page - 1) * limit
        end_idx = start_idx + limit
        page_slice = metadatas[start_idx:end_idx]

        result = f"Search Results for '{query}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        for i, res in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. ID: {res.get('id', 'N/A')}\n"
            content = res.get('content', '')
            if content:
                result += f" Content: {content}\n"
            result += "\n"

        # Pagination hint (has_next covers the sentinel-detected extra page)
        if has_next or page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_graph_stats() -> str:
    """
    Get a comprehensive statistical overview of the Transformers library knowledge graph.

    PURPOSE:
    Use this tool to understand the scope and structure of the knowledge graph.
    Provides counts and breakdowns of all node types, entity types, and relationship types.

    WHEN TO USE:
    - At the START of an exploration session to understand the codebase scope
    - To learn what types of entities and relationships are available for querying
    - To understand the terminology used in this knowledge graph (chunks, entities, edges)
    - When you need to report on the overall structure of the Transformers library

    WHAT YOU'LL LEARN:
    - Total number of nodes and edges in the graph
    - Breakdown of node types (chunks, files, directories, entities)
    - Entity type distribution (classes, functions, methods, variables, etc.)
    - Edge relationship types (contains, calls, declares)
    - Definitions of key concepts used throughout the tools

    GRAPH TERMINOLOGY:
    - Chunks: Logical code segments (a function body, a class definition, etc.)
    - Entities: Named programming constructs tracked across the codebase
    - Edges: Relationships connecting nodes (contains, calls, declares)

    Returns:
        str: Detailed statistics including node counts by type, entity breakdown, edge relation counts, and concept definitions to help you use other tools effectively.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        graph = knowledge_graph.graph
        num_nodes = graph.number_of_nodes()
        num_edges = graph.number_of_edges()

        # Tally node types; entity nodes additionally get an entity_type tally.
        node_types = {}
        entity_breakdown = {}

        for _, node_attrs in graph.nodes(data=True):
            payload = node_attrs['data']
            node_type = getattr(payload, 'node_type', 'Unknown')
            node_types[node_type] = node_types.get(node_type, 0) + 1

            if node_type == 'entity':
                entity_type = getattr(payload, 'entity_type', 'Unknown')

                # Fallback: if entity_type is empty, consult the entities dictionary
                if not entity_type:
                    node_id = payload.id if hasattr(payload, 'id') else None
                    if node_id and node_id in knowledge_graph.entities:
                        entity_types = knowledge_graph.entities[node_id].get('type', [])
                        entity_type = entity_types[0] if entity_types else 'Unknown'

                entity_breakdown[entity_type] = entity_breakdown.get(entity_type, 0) + 1

        # Tally edge relation labels
        edge_relations = {}
        for _, _, attrs in graph.edges(data=True):
            relation = attrs.get('relation', 'Unknown')
            edge_relations[relation] = edge_relations.get(relation, 0) + 1

        result = f"""Knowledge Graph Statistics:
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📊 Overview:
 Total Nodes: {num_nodes:,}
 Total Edges: {num_edges:,}

━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

📦 Node Types:
"""

        # Most frequent node types first
        for ntype, count in sorted(node_types.items(), key=lambda x: x[1], reverse=True):
            result += f" • {ntype}: {count:,}\n"

            # Nested entity-type breakdown under the 'entity' row
            if ntype == 'entity' and entity_breakdown:
                result += " └─ Entity Breakdown:\n"
                for etype, ecount in sorted(entity_breakdown.items(), key=lambda x: x[1], reverse=True):
                    percentage = (ecount / count * 100) if count > 0 else 0
                    result += f" ├─ {etype}: {ecount:,} ({percentage:.1f}%)\n"

        result += """
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

🔗 Edge Relations:
"""
        for relation, count in sorted(edge_relations.items(), key=lambda x: x[1], reverse=True):
            result += f" • {relation}: {count:,}\n"

        # Append the concept glossary shown to every caller
        result += """
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

ℹ️ Definitions:

Chunks: Code segments representing logical portions of files. Each chunk
 contains a section of code (like a function, class, or code block)
 along with metadata about what entities it declares and calls.

Entities: Programming constructs identified in the code including:
 - Classes: Class definitions
 - Functions: Function definitions
 - Methods: Class method definitions
 - Variables: Variable declarations
 - Parameters: Function/method parameters
 - Function_call/Method_call: Usage references

Files: Source code files in the repository
Directories: Folder structure containing files
Repo: Root repository node

Edge Relations:
 - contains: Parent-child relationships (file contains chunks)
 - declares: Entity declaration relationships
 - calls: Entity usage/invocation relationships
"""

        return result
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def list_nodes_by_type(node_type: str, limit: int = 20, page: int = 1) -> str:
    """
    List all nodes of a specific type in the Transformers knowledge graph with pagination.

    PURPOSE:
    Use this tool to browse and discover nodes by their type. Helpful when you want to
    see what classes, functions, files, or other constructs exist in the codebase.

    WHEN TO USE:
    - To get a list of all classes in the Transformers library: node_type='class'
    - To see all Python files: node_type='file'
    - To list all functions: node_type='function'
    - To browse all methods: node_type='method'
    - When you need to find node IDs for further exploration

    VALID node_type VALUES:
    For entities (programming constructs):
    - 'class': Class definitions (e.g., BertModel, GPT2LMHeadModel)
    - 'function': Standalone function definitions
    - 'method': Class method definitions
    - 'variable': Variable declarations
    - 'parameter': Function/method parameters

    For structural nodes:
    - 'file': Source code files
    - 'chunk': Code segments within files
    - 'directory': Folder structure nodes
    - 'repo': Repository root (typically one)

    COMPARISON WITH search_by_type_and_name:
    - list_nodes_by_type: Browse ALL nodes of a type (no name filter)
    - search_by_type_and_name: Filter by type AND search by name substring

    Args:
        node_type: The type to filter by. Use lowercase: 'class', 'function', 'method', 'file', 'chunk', 'directory'
        limit: Maximum results per page (default: 20). Increase for broader browsing.
        page: Page number starting from 1 for pagination through large result sets.

    Returns:
        str: Alphabetically sorted list of matching nodes with their IDs and types. Use IDs with get_node_info for details.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # MCP/Gradio may pass numeric parameters as strings — coerce first.
        try:
            limit = int(limit) if isinstance(limit, str) else limit
        except ValueError:
            return f"Error: 'limit' must be an integer, got '{limit}'"

        try:
            page = int(page) if isinstance(page, str) else page
        except ValueError:
            return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        graph = knowledge_graph.graph
        matches = []  # (name, id, display_type) tuples

        wanted = node_type.lower()
        for nid, attrs in graph.nodes(data=True):
            node = attrs['data']
            current_type = getattr(node, 'node_type', None)
            name = getattr(node, 'name', 'Unknown')

            if current_type == 'entity':
                # Entity nodes match on their entity_type, not node_type.
                entity_type = getattr(node, 'entity_type', '')

                # Fallback: consult the entities dictionary when empty
                if not entity_type and nid in knowledge_graph.entities:
                    declared_types = knowledge_graph.entities[nid].get('type', [])
                    entity_type = declared_types[0] if declared_types else ''

                if entity_type and entity_type.lower() == wanted:
                    matches.append((name, nid, f"entity ({entity_type})"))
            elif current_type == node_type:
                matches.append((name, nid, current_type))

        # Case-insensitive alphabetical ordering keeps pages stable
        matches.sort(key=lambda m: m[0].lower())

        total = len(matches)
        if total == 0:
            return f"No nodes found of type '{node_type}'."

        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} nodes at {limit} per page)"

        start_idx = (page - 1) * limit
        page_slice = matches[start_idx:start_idx + limit]

        result = f"Nodes of type '{node_type}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        for i, (name, nid, dtype) in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. {name}\n"
            result += f" ID: {nid}\n"
            result += f" Type: {dtype}\n\n"

        if page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_neighbors(node_id: str, limit: int = 20, page: int = 1) -> str:
    """
    Get all nodes directly connected to a given node with their relationship information.

    PURPOSE:
    Use this tool to explore the local neighborhood of any node in the knowledge graph.
    Shows what's connected to a node and how, making it easy to navigate the codebase structure.

    WHEN TO USE:
    - To explore what a node is connected to (files, chunks, entities)
    - To navigate from one code element to related elements
    - To understand the local structure around a specific node
    - After using get_node_info when you want to explore connected nodes
    - To discover related code without knowing exact names

    WHAT YOU'LL SEE:
    - Neighbor node IDs and names
    - Node types (chunk, file, entity, etc.)
    - Relationship direction (→ outgoing, ← incoming)
    - Relationship type (contains, calls, declares)

    TYPICAL NAVIGATION PATTERNS:
    - From a file: see its chunks and declared entities
    - From a chunk: see entities it declares/calls and its parent file
    - From an entity: see chunks that declare or call it
    - From a directory: see contained files and subdirectories

    COMPARISON WITH get_node_edges:
    - get_neighbors: Shows neighboring NODE details (name, type) - better for exploration
    - get_node_edges: Shows raw EDGE information - better for understanding relationships

    Args:
        node_id: The ID of the node to explore neighbors for
        limit: Maximum neighbors to return per page (default: 20)
        page: Page number for pagination when node has many connections

    Returns:
        str: List of connected nodes with their IDs, names, types, and the relationships connecting them
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        # MCP clients may pass numeric parameters as strings — coerce first.
        try:
            limit = int(limit) if isinstance(limit, str) else limit
        except ValueError:
            return f"Error: 'limit' must be an integer, got '{limit}'"

        try:
            page = int(page) if isinstance(page, str) else page
        except ValueError:
            return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        neighbors = knowledge_graph.get_neighbors(node_id)
        if not neighbors:
            return f"No neighbors found for node '{node_id}'"

        total = len(neighbors)
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} neighbors at {limit} per page)"

        start_idx = (page - 1) * limit
        page_slice = neighbors[start_idx:start_idx + limit]

        result = f"Neighbors of '{node_id}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        graph = knowledge_graph.graph
        for i, nb in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. {nb.id}\n"
            result += f" Name: {getattr(nb, 'name', 'Unknown')}\n"
            result += f" Type: {nb.node_type}\n"

            # Report edge direction: outgoing (→) first, otherwise incoming (←).
            if graph.has_edge(node_id, nb.id):
                edge_data = graph.get_edge_data(node_id, nb.id)
                result += f" → Relation: {edge_data.get('relation', 'Unknown')}\n"
            elif graph.has_edge(nb.id, node_id):
                edge_data = graph.get_edge_data(nb.id, node_id)
                result += f" ← Relation: {edge_data.get('relation', 'Unknown')}\n"
            result += "\n"

        if page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def go_to_definition(entity_name: str) -> str:
    """
    Jump to the source code location(s) where an entity is defined/declared.

    PURPOSE:
    Use this tool to find WHERE in the codebase a class, function, method, or variable
    is defined. Returns the actual code content of the definition along with file location.

    WHEN TO USE:
    - To see the implementation of a class like 'BertModel' or 'GPT2Attention'
    - To find where a function is defined when you know its name
    - To examine the source code of any entity found through search or listing
    - When you need to understand HOW something is implemented (not just WHERE it's used)
    - To get the actual code definition for analysis or explanation

    WHAT YOU'LL GET:
    - Entity type (class, function, method, variable)
    - Data type if available
    - List of all locations where the entity is declared (some entities may be defined in multiple places)
    - For each location: file path, chunk order, and FULL CODE CONTENT

    TYPICAL WORKFLOW:
    1. search_nodes("attention") -> find entity names
    2. go_to_definition("BertSelfAttention") -> see the class implementation
    3. find_usages("BertSelfAttention") -> see where it's used

    COMPARISON WITH find_usages:
    - go_to_definition: Shows WHERE entity is DEFINED (the implementation)
    - find_usages: Shows WHERE entity is USED/CALLED (the consumers)

    Args:
        entity_name: Exact name of the entity (case-sensitive). Examples: 'BertModel', 'forward', 'attention_mask', 'get_extended_attention_mask'

    Returns:
        str: Entity type, file location(s), and complete source code of the definition(s). Returns error if entity not found.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if entity_name not in knowledge_graph.entities:
            return f"Error: Entity '{entity_name}' not found in knowledge graph"

        entity_info = knowledge_graph.entities[entity_name]
        declaring_chunks = entity_info.get('declaring_chunk_ids', [])

        if not declaring_chunks:
            return f"Entity '{entity_name}' found but no declarations identified."

        result = f"Definition(s) for '{entity_name}':\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
        result += f"Type: {', '.join(entity_info.get('type', ['Unknown']))}\n"
        if entity_info.get('dtype'):
            result += f"Data Type: {entity_info['dtype']}\n"
        result += f"\nDeclared in {len(declaring_chunks)} location(s):\n\n"

        # Show at most the first five declaration sites in full.
        for idx, cid in enumerate(declaring_chunks[:5], 1):
            if cid in knowledge_graph.graph:
                chunk = knowledge_graph.graph.nodes[cid]['data']
                result += f"{idx}. Chunk: {cid}\n"
                result += f" File: {chunk.path}\n"
                result += f" Order: {chunk.order_in_file}\n"
                result += f" Content:\n{chunk.content}\n\n"

        if len(declaring_chunks) > 5:
            result += f"... and {len(declaring_chunks) - 5} more locations\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"
+ + result = f"Definition(s) for '{entity_name}':\n" + result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" + result += f"Type: {', '.join(entity_info.get('type', ['Unknown']))}\n" + if entity_info.get('dtype'): + result += f"Data Type: {entity_info['dtype']}\n" + result += f"\nDeclared in {len(declaring_chunks)} location(s):\n\n" + + for i, chunk_id in enumerate(declaring_chunks[:5], 1): + if chunk_id in knowledge_graph.graph: + chunk = knowledge_graph.graph.nodes[chunk_id]['data'] + result += f"{i}. Chunk: {chunk_id}\n" + result += f" File: {chunk.path}\n" + result += f" Order: {chunk.order_in_file}\n" + result += f" Content:\n{chunk.content}\n\n" + + if len(declaring_chunks) > 5: + result += f"... and {len(declaring_chunks) - 5} more locations\n" + + return result + except Exception as e: + return f"Error: {str(e)}" + + +@observe(as_type="tool") +def find_usages(entity_name: str, limit: int = 20, page: int = 1) -> str: + """ + Find all locations in the codebase where an entity is used or called. + + PURPOSE: + Use this tool to understand the impact and usage patterns of any entity. + Shows every place where a class is instantiated, a function is called, + or a variable is referenced throughout the Transformers library. + + WHEN TO USE: + - To understand how widely used a class or function is + - To see usage examples of a particular API or function + - To assess the impact of changing an entity (who depends on it?) + - To learn how to use a class/function by seeing real examples + - To trace data flow through the codebase + + WHAT YOU'LL GET: + - Total count of usage locations + - For each usage: file path, chunk position, and full code context showing the usage + - Paginated results for entities with many usages + + TYPICAL WORKFLOWS: + + Impact Analysis: + 1. go_to_definition("deprecated_function") -> understand what it does + 2. find_usages("deprecated_function") -> see all code that needs updating + + Learning by Example: + 1. 
list_nodes_by_type('class') -> find interesting classes + 2. find_usages("BertModel") -> see how it's instantiated and used + + COMPARISON WITH go_to_definition: + - find_usages: WHERE is this entity CALLED/USED (consumers) + - go_to_definition: WHERE is this entity DEFINED (implementation) + + Args: + entity_name: Exact name of the entity to find usages for (case-sensitive) + limit: Usages per page (default: 20). Many popular classes have 100+ usages. + page: Page number for pagination (starts at 1) + + Returns: + str: List of code chunks that use this entity, with file paths and full code content showing the usage in context + """ + if knowledge_graph is None: + return "Error: Knowledge graph not initialized" + + try: + # Convert limit to int if it's a string (MCP may pass strings) + if isinstance(limit, str): + try: + limit = int(limit) + except ValueError: + return f"Error: 'limit' must be an integer, got '{limit}'" + + # Convert page to int if it's a string + if isinstance(page, str): + try: + page = int(page) + except ValueError: + return f"Error: 'page' must be an integer, got '{page}'" + + if entity_name not in knowledge_graph.entities: + return f"Error: Entity '{entity_name}' not found in knowledge graph" + + if limit <= 0: + return "Error: limit must be a positive integer" + if page < 1: + return "Error: 'page' must be a positive integer (1 or greater)" + + entity_info = knowledge_graph.entities[entity_name] + calling_chunks = entity_info.get('calling_chunk_ids', []) + + if not calling_chunks: + return f"Entity '{entity_name}' found but no usages identified." + + total = len(calling_chunks) + # Pagination + total_pages = (total + limit - 1) // limit + if page > total_pages: + return f"Error: Page {page} does not exist. 
Total pages: {total_pages} (with {total} usages at {limit} per page)" + + start_idx = (page - 1) * limit + end_idx = start_idx + limit + page_slice = calling_chunks[start_idx:end_idx] + + result = f"Usages of '{entity_name}' (Page {page}/{total_pages}, {total} total):\n" + result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" + + for i, chunk_id in enumerate(page_slice, start=start_idx + 1): + if chunk_id in knowledge_graph.graph: + chunk = knowledge_graph.graph.nodes[chunk_id]['data'] + result += f"{i}. {chunk.path} (chunk {chunk.order_in_file})\n" + result += f" Content:\n{chunk.content}\n\n" + + # Pagination hint + if page < total_pages: + result += f"Use page={page + 1} to see the next page\n" + + return result + except Exception as e: + return f"Error: {str(e)}" + + +@observe(as_type="tool") +def get_file_structure(file_path: str) -> str: + """ + Get a structural overview of a source file showing its chunks and declared entities. + + PURPOSE: + Use this tool to understand the organization of a specific file. Shows what classes, + functions, and other entities are defined in the file, plus how the file is divided into chunks. + + WHEN TO USE: + - To get a table of contents for a file before diving into specifics + - To see what classes and functions a file defines + - To understand how code is organized within a file + - To find chunk IDs for further exploration with get_node_info or get_chunk_context + - When you know the file path but need to understand its contents + + WHAT YOU'LL SEE: + - File path and detected programming language + - Total number of code chunks in the file + - List of declared entities (classes, functions, methods, variables) with their types + - Ordered list of chunks with their IDs and descriptions + + HOW TO GET FILE PATHS: + - Use list_files_in_directory() to browse files + - Use search_nodes() and look at file paths in results + - Use list_nodes_by_type('file') to get file node IDs (which are the paths) + + TYPICAL WORKFLOW: + 1. 
list_files_in_directory('src/transformers/models/bert') -> find files + 2. get_file_structure('src/transformers/models/bert/modeling_bert.py') -> see structure + 3. get_node_info(chunk_id) -> examine specific code chunks + + Args: + file_path: The full path to the file (e.g., 'src/transformers/models/bert/modeling_bert.py'). Must match exactly as stored in the knowledge graph. + + Returns: + str: File overview including language, chunk count, declared entities list, and chunk descriptions + """ + if knowledge_graph is None: + return "Error: Knowledge graph not initialized" + + try: + if file_path not in knowledge_graph.graph: + return f"Error: File '{file_path}' not found in knowledge graph" + + file_node = knowledge_graph.graph.nodes[file_path]['data'] + chunks = knowledge_graph.get_chunks_of_file(file_path) + + result = f"File Structure: {file_node.name}\n" + result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" + result += f"Path: {file_path}\n" + result += f"Language: {getattr(file_node, 'language', 'Unknown')}\n" + result += f"Total Chunks: {len(chunks)}\n\n" + + if hasattr(file_node, 'declared_entities') and file_node.declared_entities: + result += f"Declared Entities ({len(file_node.declared_entities)}):\n" + for entity in file_node.declared_entities[:15]: + if isinstance(entity, dict): + result += f" - {entity.get('name', '?')} ({entity.get('type', '?')})\n" + else: + result += f" - {entity}\n" + if len(file_node.declared_entities) > 15: + result += f" ... and {len(file_node.declared_entities) - 15} more\n" + + result += f"\nChunks:\n" + for chunk in chunks[:10]: + result += f" [{chunk.order_in_file}] {chunk.id}\n" + if chunk.description: + desc = chunk.description[:80] + "..." if len(chunk.description) > 80 else chunk.description + result += f" {desc}\n" + + if len(chunks) > 10: + result += f" ... 
and {len(chunks) - 10} more chunks\n" + + return result + except Exception as e: + return f"Error: {str(e)}" + + +@observe(as_type="tool") +def get_related_chunks(chunk_id: str, relation_type: str = "calls", limit: int = 20, page: int = 1) -> str: + """ + Find code chunks connected to a given chunk through a specific relationship type. + + PURPOSE: + Use this tool to trace code dependencies by following relationship edges from a chunk. + Helps understand what code a chunk depends on or what depends on it. + + WHEN TO USE: + - To find what entities/code a chunk calls or uses (relation_type='calls') + - To trace dependencies from a specific piece of code + - To explore the call graph emanating from a chunk + - When you have a chunk ID and want to see connected code + + RELATIONSHIP TYPES: + - 'calls': Entities/chunks that this chunk calls or references (most common) + - 'contains': Child nodes contained by this node (for files/directories) + - 'declares': Entities declared by this chunk + - 'all' or '': Get all outgoing relationships regardless of type + + TYPICAL WORKFLOW: + 1. search_nodes("BertAttention forward") -> find a chunk + 2. get_related_chunks(chunk_id, 'calls') -> see what it calls + 3. 
get_node_info(related_chunk_id) -> examine called code + + COMPARISON WITH OTHER TOOLS: + - get_neighbors: All connected nodes (any direction, any type) + - get_related_chunks: Outgoing edges only, filtered by relationship type + - entity_relationships: Focused on entity nodes and their relationships + + Args: + chunk_id: The ID of the chunk to explore from (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_5') + relation_type: Filter by relationship type: 'calls', 'contains', 'declares', or 'all' for everything (default: 'calls') + limit: Maximum results per page (default: 20) + page: Page number for pagination + + Returns: + str: List of related chunks with their IDs, file paths, and entity names involved in the relationship + """ + if knowledge_graph is None: + return "Error: Knowledge graph not initialized" + + try: + if chunk_id not in knowledge_graph.graph: + return f"Error: Chunk '{chunk_id}' not found in knowledge graph" + + # Convert limit/page to int if they're strings + if isinstance(limit, str): + try: + limit = int(limit) + except ValueError: + return f"Error: 'limit' must be an integer, got '{limit}'" + + if isinstance(page, str): + try: + page = int(page) + except ValueError: + return f"Error: 'page' must be an integer, got '{page}'" + + if limit <= 0: + return "Error: limit must be a positive integer" + if page < 1: + return "Error: 'page' must be a positive integer (1 or greater)" + + related = [] + if relation_type == "" or relation_type == "all": + # Get all outgoing edges regardless of relation type + for _, target, attrs in knowledge_graph.graph.out_edges(chunk_id, data=True): + target_node = knowledge_graph.graph.nodes[target]['data'] + related.append({ + "id": target, + "file_path": getattr(target_node, 'path', 'Unknown'), + "entity_name": attrs.get('entity_name') + }) + else: + for _, target, attrs in knowledge_graph.graph.out_edges(chunk_id, data=True): + if attrs.get('relation') == relation_type: + target_node = 
knowledge_graph.graph.nodes[target]['data'] + related.append({ + "id": target, + "file_path": getattr(target_node, 'path', 'Unknown'), + "entity_name": attrs.get('entity_name') + }) + + if not related: + return f"No chunks found with '{relation_type}' relationship from '{chunk_id}'" + + total = len(related) + # Pagination + total_pages = (total + limit - 1) // limit + if page > total_pages: + return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} results at {limit} per page)" + + start_idx = (page - 1) * limit + end_idx = start_idx + limit + page_slice = related[start_idx:end_idx] + + result = f"Chunks related to '{chunk_id}' via '{relation_type}' (Page {page}/{total_pages}, {total} total):\n" + result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" + + for i, chunk in enumerate(page_slice, start=start_idx + 1): + result += f"{i}. {chunk['id']}\n" + result += f" File: {chunk['file_path']}\n" + if chunk['entity_name']: + result += f" Entity: {chunk['entity_name']}\n" + result += "\n" + + # Pagination hint + if page < total_pages: + result += f"Use page={page + 1} to see the next page\n" + + return result + except Exception as e: + return f"Error: {str(e)}" + + +@observe(as_type="tool") +def list_all_entities( + limit: int = 50, + page: int = 1, + entity_type: Optional[str] = None, + declared_in_repo: Optional[bool] = None, + called_in_repo: Optional[bool] = None +) -> str: + """ + Browse all programming entities (classes, functions, methods, variables) tracked in the knowledge graph. + + PURPOSE: + Use this tool to explore the full inventory of code entities in the Transformers library. + Supports filtering by type and usage patterns, making it powerful for targeted exploration. 
+ + WHEN TO USE: + - To browse all classes, functions, or methods in the codebase + - To find entities that are defined but never used (dead code analysis) + - To find external entities that are called but not defined in the repo + - To get an overview of entity distribution in the codebase + - When you need entity names for use with go_to_definition or find_usages + + FILTERING OPTIONS: + + By entity_type: + - 'class': Class definitions (BertModel, GPT2Config, etc.) + - 'function': Standalone functions + - 'method': Class methods + - 'variable': Variable declarations + - 'parameter': Function/method parameters + - None: All entity types + + By declaration status (declared_in_repo): + - True: Only entities DEFINED in this repo (has source code) + - False: Only external entities (imported from other packages) + - None: All entities + + By usage status (called_in_repo): + - True: Only entities that ARE USED somewhere in the code + - False: Only entities that are NEVER USED (potential dead code) + - None: All entities + + USEFUL FILTER COMBINATIONS: + - All classes: entity_type='class' + - Defined classes: entity_type='class', declared_in_repo=True + - Unused functions: entity_type='function', called_in_repo=False + - External dependencies: declared_in_repo=False, called_in_repo=True + + Args: + limit: Entities per page (default: 50). Use larger values for comprehensive listings. + page: Page number starting from 1 for pagination + entity_type: Filter by type: 'class', 'function', 'method', 'variable', 'parameter', or None for all + declared_in_repo: True=defined in repo, False=external only, None=all + called_in_repo: True=has usages, False=never used, None=all + + Returns: + str: List of entities with their types, declaration count, and usage count. Use entity names with go_to_definition or find_usages. 
+ """ + if knowledge_graph is None: + return "Error: Knowledge graph not initialized" + + try: + # Convert limit to int if it's a string (MCP may pass strings) + if isinstance(limit, str): + try: + limit = int(limit) + except ValueError: + return f"Error: 'limit' must be an integer, got '{limit}'" + + # Convert page to int if it's a string (MCP may pass strings) + if isinstance(page, str): + try: + page = int(page) + except ValueError: + return f"Error: 'page' must be an integer, got '{page}'" + + if page < 1: + return "Error: 'page' must be a positive integer (1 or greater)" + + # Handle entity_type - empty string should be treated as None + if entity_type == "" or entity_type == "null": + entity_type = None + + # Handle declared_in_repo - convert string to bool if needed + if isinstance(declared_in_repo, str): + if declared_in_repo.lower() in ("true", "1", "yes"): + declared_in_repo = True + elif declared_in_repo.lower() in ("false", "0", "no"): + declared_in_repo = False + elif declared_in_repo.lower() in ("none", "null", "all", ""): + declared_in_repo = None + + # Handle called_in_repo - convert string to bool if needed + if isinstance(called_in_repo, str): + if called_in_repo.lower() in ("true", "1", "yes"): + called_in_repo = True + elif called_in_repo.lower() in ("false", "0", "no"): + called_in_repo = False + elif called_in_repo.lower() in ("none", "null", "all", ""): + called_in_repo = None + + if not knowledge_graph.entities: + return "No entities found in the knowledge graph." 
+ + # Filter entities based on criteria + filtered_entities = {} + for entity_name, info in knowledge_graph.entities.items(): + # Filter by entity type if specified + if entity_type is not None: + entity_types = [t.lower() if t else '' for t in info.get('type', [])] + if entity_type.lower() not in entity_types: + continue + + # Filter by declared_in_repo if specified + if declared_in_repo is not None: + has_declaration = len(info.get('declaring_chunk_ids', [])) > 0 + if declared_in_repo and not has_declaration: + continue + if not declared_in_repo and has_declaration: + continue + + # Filter by called_in_repo (usages) if specified + if called_in_repo is not None: + has_calls = len(info.get('calling_chunk_ids', [])) > 0 + if called_in_repo and not has_calls: + continue + if not called_in_repo and has_calls: + continue + + filtered_entities[entity_name] = info + + # Build the response with filtered entities + if not filtered_entities: + filter_desc = [] + if entity_type: + filter_desc.append(f"type={entity_type}") + if declared_in_repo is not None: + filter_desc.append(f"declared_in_repo={declared_in_repo}") + if called_in_repo is not None: + filter_desc.append(f"called_in_repo={called_in_repo}") + filter_text = f" (filtered by {', '.join(filter_desc)})" if filter_desc else "" + return f"No entities found{filter_text}." + + # Calculate pagination + total_entities = len(filtered_entities) + total_pages = (total_entities + limit - 1) // limit # Ceiling division + + if page > total_pages: + return f"Error: Page {page} does not exist. 
Total pages: {total_pages} (with {total_entities} entities at {limit} per page)" + + start_idx = (page - 1) * limit + end_idx = start_idx + limit + + # Get the paginated slice of entities + entity_items = list(filtered_entities.items()) + paginated_items = entity_items[start_idx:end_idx] + + result = f"All Entities (Page {page}/{total_pages}, {total_entities} total):\n" + result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" + + for i, (entity_name, info) in enumerate(paginated_items, start=start_idx + 1): + result += f"{i}. {entity_name}\n" + result += f" Types: {', '.join(info.get('type', ['Unknown']))}\n" + result += f" Declarations: {len(info.get('declaring_chunk_ids', []))}\n" + result += f" Usages: {len(info.get('calling_chunk_ids', []))}\n\n" + + # Add pagination info + result += f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n" + result += f"Showing {start_idx + 1}-{min(end_idx, total_entities)} of {total_entities} entities\n" + result += f"Page {page} of {total_pages}\n" + + if page < total_pages: + result += f"Use page={page + 1} to see the next page\n" + + # Add filter information + if entity_type: + result += f"\n(Filtered by type={entity_type})\n" + if declared_in_repo is not None: + result += f"(Filtered by declared_in_repo={declared_in_repo})\n" + if called_in_repo is not None: + result += f"(Filtered by called_in_repo={called_in_repo})\n" + + return result + except Exception as e: + return f"Error: {str(e)}" + + +@observe(as_type="tool") +def diff_chunks(node_id_1: str, node_id_2: str) -> str: + """ + Compare two code chunks and show their differences in unified diff format. + + PURPOSE: + Use this tool to compare two pieces of code side-by-side. Shows exactly what's + different between them using standard unified diff format (like git diff). 

    WHEN TO USE:
    - To compare similar implementations (e.g., two attention mechanisms)
    - To understand differences between related classes or functions
    - To analyze variations in code patterns across the codebase
    - To compare two versions or implementations of similar functionality
    - When you suspect code duplication and want to see exact differences

    DIFF FORMAT:
    - Lines starting with '-' are only in the first chunk
    - Lines starting with '+' are only in the second chunk
    - Lines without prefix are common to both
    - @@ markers show line number context

    TYPICAL WORKFLOW:
    1. search_nodes("attention") -> find attention implementations
    2. Get chunk IDs from two different attention classes
    3. diff_chunks(chunk_id_1, chunk_id_2) -> compare implementations

    COMPARISON IDEAS:
    - BertAttention vs GPT2Attention
    - Different forward() implementations
    - Similar utility functions in different modules

    Args:
        node_id_1: ID of the first chunk/node to compare
        node_id_2: ID of the second chunk/node to compare

    Returns:
        str: Unified diff output showing line-by-line differences. Returns 'No differences found' if chunks are identical.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Both IDs must resolve to nodes in the graph before content is read.
        if node_id_1 not in knowledge_graph.graph:
            return f"Error: Node '{node_id_1}' not found in knowledge graph"
        if node_id_2 not in knowledge_graph.graph:
            return f"Error: Node '{node_id_2}' not found in knowledge graph"

        g = knowledge_graph.graph
        # Node payloads live under the 'data' attribute; default to None since
        # some node kinds (e.g. directories) may lack a 'content' attribute.
        content1 = getattr(g.nodes[node_id_1]['data'], 'content', None)
        content2 = getattr(g.nodes[node_id_2]['data'], 'content', None)

        # NOTE(review): empty-string content is rejected here as well, not only
        # a missing attribute — confirm that is intentional.
        if not content1 or not content2:
            return "Error: One or both nodes have no content."

        # lineterm="" because splitlines() strips trailing newlines; this stops
        # difflib from appending extra newlines to header/hunk lines.
        diff = list(difflib.unified_diff(
            content1.splitlines(), content2.splitlines(),
            fromfile=node_id_1, tofile=node_id_2, lineterm=""
        ))

        # An empty diff means the two contents are line-for-line identical.
        if not diff:
            return "No differences found between the two chunks."

        return "\n".join(diff)
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def print_tree(root_id: str = "root", max_depth: int = 3) -> str:
    """
    Display a hierarchical tree view of the repository structure starting from any node.

    PURPOSE:
    Use this tool to visualize the structure of the codebase. Shows parent-child relationships
    in a familiar tree format, helping you understand how files and directories are organized.

    WHEN TO USE:
    - To explore the directory structure of the Transformers repository
    - To see what's inside a specific directory (use directory as root_id)
    - To understand the file organization for a component
    - To get an overview of the codebase hierarchy
    - When you need to understand where files are located

    TREE VISUALIZATION:
    - Each level shows node name and type (repo, directory, file, chunk)
    - Indentation represents depth in the hierarchy
    - Children are limited to prevent overwhelming output

    TIPS:
    - Start with max_depth=2 for a high-level overview
    - Increase max_depth to see more detail (but output gets larger)
    - Use a directory path as root_id to focus on a specific area
    - Use list_files_in_directory for more detailed file listings

    TYPICAL USAGE:
    - print_tree('root', max_depth=2) -> see top-level structure
    - print_tree('src/transformers/models', max_depth=2) -> see model organization
    - print_tree('src/transformers/models/bert', max_depth=3) -> see bert module structure

    Args:
        root_id: Starting node ID. Use 'root' for repository root, or a directory/file path to start from a specific location.
        max_depth: How many levels deep to show (default: 3). Higher values show more detail but larger output.

    Returns:
        str: ASCII tree visualization showing the hierarchical structure with node names and types
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Convert max_depth to int if it's a string (MCP may pass strings)
        if isinstance(max_depth, str):
            try:
                max_depth = int(max_depth)
            except ValueError:
                return f"Error: 'max_depth' must be an integer, got '{max_depth}'"

        g = knowledge_graph.graph

        if root_id not in g:
            # Try to find a suitable root. NOTE(review): this picks the first
            # repo/directory/file node in graph iteration order, which may be
            # arbitrary — confirm this fallback is acceptable to callers.
            roots = [n for n, d in g.nodes(data=True)
                     if getattr(d['data'], 'node_type', None) in ('repo', 'directory', 'file')]
            if roots:
                root_id = roots[0]
            else:
                return f"Error: Node '{root_id}' not found and no suitable root found"

        result = f"Tree View (starting from '{root_id}', max depth: {max_depth}):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        # Recursively render a node and its out-edge children, one indent level
        # per depth. Recursion is bounded by the max_depth cut-off below, so no
        # explicit cycle guard is present. The inner 'node_id' shadows the
        # outer variable; each recursive call passes the child's id.
        def format_node(node_id, depth):
            if depth > max_depth:
                return ""

            node = g.nodes[node_id]['data']
            name = getattr(node, 'name', node_id)
            node_type = getattr(node, 'node_type', '?')

            line = " " * depth + f"- {name} ({node_type})\n"

            # Children are the targets of this node's outgoing edges.
            children = [t for s, t in g.out_edges(node_id)]
            for child in children[:20]:  # Limit children to prevent huge output
                line += format_node(child, depth + 1)

            if len(children) > 20:
                line += " " * (depth + 1) + f"... and {len(children) - 20} more\n"

            return line

        result += format_node(root_id, 0)
        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def entity_relationships(node_id: str) -> str:
    """
    Display all incoming and outgoing relationships for any node, with relationship types.

    PURPOSE:
    Use this tool to get a complete picture of how a node connects to the rest of the
    knowledge graph. Shows both what points TO this node and what this node points TO.
+ + WHEN TO USE: + - To understand all dependencies of an entity + - To see what declares or calls a specific entity + - To trace the full relationship network around any node + - When you need more detail than get_neighbors provides about relationship types + - For entity-centric analysis (understanding a class or function's connections) + + WHAT YOU'LL SEE: + - Incoming relationships: Other nodes that have edges pointing TO this node + (e.g., chunks that CALL this function, files that CONTAIN this chunk) + - Outgoing relationships: This node's edges pointing TO other nodes + (e.g., entities this chunk CALLS, chunks this file CONTAINS) + - Relationship types for each edge (calls, declares, contains) + + COMPARISON WITH SIMILAR TOOLS: + - get_node_edges: Same information but different formatting + - get_neighbors: Shows neighbor node details, not edge details + - get_related_chunks: Filtered by relationship type, chunks only + + TYPICAL WORKFLOW: + 1. go_to_definition("BertModel") -> find entity + 2. entity_relationships("BertModel") -> see what calls/uses BertModel + + Args: + node_id: The ID of any node (entity, chunk, file, directory) + + Returns: + str: Complete list of incoming and outgoing relationships with source/target IDs and relationship types + """ + if knowledge_graph is None: + return "Error: Knowledge graph not initialized" + + try: + if node_id not in knowledge_graph.graph: + return f"Error: Node '{node_id}' not found in knowledge graph" + + g = knowledge_graph.graph + + result = f"Relationships for '{node_id}':\n" + result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" + + incoming = list(g.in_edges(node_id, data=True)) + outgoing = list(g.out_edges(node_id, data=True)) + + if incoming: + result += f"Incoming Relationships ({len(incoming)}):\n" + for source, target, data in incoming[:20]: + result += f" ← {source} [{data.get('relation', '?')}]\n" + if len(incoming) > 20: + result += f" ... 
and {len(incoming) - 20} more\n" + result += "\n" + + if outgoing: + result += f"Outgoing Relationships ({len(outgoing)}):\n" + for source, target, data in outgoing[:20]: + result += f" → {target} [{data.get('relation', '?')}]\n" + if len(outgoing) > 20: + result += f" ... and {len(outgoing) - 20} more\n" + + if not incoming and not outgoing: + result += "No relationships found.\n" + + return result + except Exception as e: + return f"Error: {str(e)}" + + +@observe(as_type="tool") +def search_by_type_and_name(node_type: str, name_query: str, limit: int = 10, page: int = 1, partial_allowed: bool = True) -> str: + """ + Search for nodes by combining type filtering with name pattern matching. + + PURPOSE: + Use this tool for precise, targeted searches when you know the type of node you're looking + for and have a partial name. More efficient than list_nodes_by_type when you have name hints. + + WHEN TO USE: + - To find all classes containing 'Attention': search_by_type_and_name('class', 'Attention') + - To find functions with 'forward' in name: search_by_type_and_name('function', 'forward') + - To find files named 'config': search_by_type_and_name('file', 'config') + - When you know the type AND have a partial name to search for + - For pattern-based discovery of related components + + SEARCH BEHAVIOR: + - Case-insensitive matching + - partial_allowed=True (default): Fuzzy matching, finds 'BertEmbeddings' when searching 'Embed' + - partial_allowed=False: Requires exact substring match + - Results sorted by match quality (exact matches first, then substring, then fuzzy) + + VALID node_type VALUES: + For entities: 'class', 'function', 'method', 'variable', 'parameter' + For structural: 'file', 'chunk', 'directory' + + SEARCH EXAMPLES: + - All Attention classes: search_by_type_and_name('class', 'Attention') + - All Embedding classes: search_by_type_and_name('class', 'Embedding') + - Config files: search_by_type_and_name('file', 'config') + - Forward methods: 
search_by_type_and_name('method', 'forward') + - Test files: search_by_type_and_name('file', 'test_') + + COMPARISON WITH SIMILAR TOOLS: + - search_nodes: Full-text search in code content (doesn't filter by type) + - list_nodes_by_type: Lists all of a type (no name filter) + - search_by_type_and_name: Combines type filter + name search (best of both) + + Args: + node_type: Type to filter by: 'class', 'function', 'method', 'file', 'chunk', 'directory', etc. + name_query: Name pattern to search for (case-insensitive). Can be partial. + limit: Results per page (default: 10) + page: Page number for pagination + partial_allowed: Enable fuzzy matching (default: True). Set False for stricter matching. + + Returns: + str: Matching nodes sorted by relevance, with IDs and types. Use IDs with get_node_info for details. + """ + if knowledge_graph is None: + return "Error: Knowledge graph not initialized" + + try: + # Convert limit/page to int if they're strings (MCP/Gradio may pass strings) + if isinstance(limit, str): + try: + limit = int(limit) + except ValueError: + return f"Error: 'limit' must be an integer, got '{limit}'" + + if isinstance(page, str): + try: + page = int(page) + except ValueError: + return f"Error: 'page' must be an integer, got '{page}'" + + # Convert partial_allowed to bool if it's a string + if isinstance(partial_allowed, str): + partial_allowed = partial_allowed.lower() in ('true', '1', 'yes') + + if limit <= 0: + return "Error: limit must be a positive integer" + if page < 1: + return "Error: 'page' must be a positive integer (1 or greater)" + + g = knowledge_graph.graph + matches = [] + query_lower = name_query.lower() + + # Build regex pattern for partial_allowed matching + # This will match names containing all characters of the query in order + if partial_allowed: + # Create pattern that matches query as substring or with characters spread out + # e.g., "Embed" matches "Embedding", "BertEmbeddings", "EmbedLayer" + partial_pattern = 
'.*'.join(re.escape(c) for c in query_lower) + partial_regex = re.compile(partial_pattern, re.IGNORECASE) + + for nid, n in g.nodes(data=True): + node = n['data'] + node_name = getattr(node, 'name', '') + + if not node_name: + continue + + # Check if name matches the query + name_matches = False + if partial_allowed: + # Partial match: substring match OR regex pattern match + if query_lower in node_name.lower() or partial_regex.search(node_name): + name_matches = True + else: + # Exact substring match + if query_lower in node_name.lower(): + name_matches = True + + if not name_matches: + continue + + # Check type based on node_type + current_node_type = getattr(node, 'node_type', None) + + # For entity nodes, check entity_type instead of node_type + if current_node_type == 'entity': + entity_type = getattr(node, 'entity_type', '') + + # Fallback: if entity_type is empty, check the entities dictionary + # This handles cases where EntityNode was created before the fix + if not entity_type and nid in knowledge_graph.entities: + entity_types = knowledge_graph.entities[nid].get('type', []) + entity_type = entity_types[0] if entity_types else '' + + if entity_type and entity_type.lower() == node_type.lower(): + # Calculate match score for sorting (exact matches first) + score = 0 if query_lower == node_name.lower() else (1 if query_lower in node_name.lower() else 2) + matches.append({ + "id": nid, + "name": node_name, + "type": f"entity ({entity_type})", + "score": score + }) + # For other nodes, check node_type directly + elif current_node_type == node_type: + score = 0 if query_lower == node_name.lower() else (1 if query_lower in node_name.lower() else 2) + matches.append({ + "id": nid, + "name": node_name, + "type": current_node_type, + "score": score + }) + + # Sort by match score (best matches first) + matches.sort(key=lambda x: (x['score'], x['name'].lower())) + + total = len(matches) + if total == 0: + return f"No matches for type '{node_type}' and name containing 
'{name_query}'." + + # Pagination + total_pages = (total + limit - 1) // limit + if page > total_pages: + return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} results at {limit} per page)" + + start_idx = (page - 1) * limit + end_idx = start_idx + limit + page_slice = matches[start_idx:end_idx] + + result = f"Matches for type '{node_type}' and name '{name_query}' (Page {page}/{total_pages}, {total} total):\n" + result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" + + for i, match in enumerate(page_slice, start=start_idx + 1): + result += f"{i}. {match['name']}\n" + result += f" ID: {match['id']}\n" + result += f" Type: {match['type']}\n\n" + + if page < total_pages: + result += f"Use page={page + 1} to see the next page\n" + + return result + except Exception as e: + return f"Error: {str(e)}" + + +@observe(as_type="tool") +def get_chunk_context(node_id: str) -> str: + """ + Get expanded code context by retrieving a chunk along with its previous and next chunks. + + PURPOSE: + Use this tool when you need to see MORE CODE CONTEXT around a specific chunk. + Chunks are logical code segments, but sometimes you need to see surrounding code + to fully understand the implementation. + + WHEN TO USE: + - After search_nodes or get_node_info when you need more surrounding context + - When a chunk shows a partial function/class and you need the complete picture + - To understand code flow across chunk boundaries + - To see imports or setup code that precedes a chunk + - To see what code follows after a chunk + + WHAT YOU'LL GET: + - The previous chunk's content (if it exists) + - The target chunk's content + - The next chunk's content (if it exists) + - All organized by file and joined together seamlessly + + CONTEXT EXPANSION: + - Shows up to 3 consecutive chunks (prev + current + next) + - Useful for understanding function bodies that span chunks + - Helps see class context when looking at individual methods + + TYPICAL WORKFLOW: + 1. 
search_nodes("attention forward") -> find relevant chunk
    2. get_node_info(chunk_id) -> see chunk content
    3. get_chunk_context(chunk_id) -> see surrounding code for fuller understanding

    COMPARISON WITH get_node_info:
    - get_node_info: Single chunk content + full metadata
    - get_chunk_context: Expanded code view (prev + current + next chunks), less metadata

    Args:
        node_id: The chunk ID to get context for (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_5')

    Returns:
        str: Combined content of previous, current, and next chunks organized by file. Provides seamless code view.
    """


    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        g = knowledge_graph.graph
        current_chunk = g.nodes[node_id]['data']
        # Neighboring chunks in file order. The truthiness checks below treat a
        # falsy return as "no neighbor" — presumably None at the start/end of a
        # file; TODO confirm against the RepoKnowledgeGraph API.
        previous_chunk = knowledge_graph.get_previous_chunk(node_id)
        next_chunk = knowledge_graph.get_next_chunk(node_id)

        # Collect all chunks (previous, current, next)
        chunks = []
        if previous_chunk:
            chunks.append(previous_chunk)
        chunks.append(current_chunk)
        if next_chunk:
            chunks.append(next_chunk)

        # Organize and join chunks: group by source file, then concatenate into
        # one continuous text (helpers imported from chunk_utils at file top).
        organized = organize_chunks_by_file_name(chunks)
        full_content = join_organized_chunks(organized)

        return full_content
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def get_file_stats(path: str) -> str:
    """
    Get detailed statistics and metrics for a specific file or directory.

    PURPOSE:
    Use this tool to get quantitative metrics about a file including line counts,
    entity counts, and chunk counts. Useful for understanding file complexity.
@observe(as_type="tool")
def get_file_stats(path: str) -> str:
    """
    Report quantitative statistics for every graph node stored under a path.

    For each node whose 'path' attribute equals `path`, lists its line count,
    declared and called entities (first 10 of each, with a "... and N more"
    tail), and the number of chunk nodes attached to it.

    Args:
        path: File path exactly as stored in the knowledge graph.

    Returns:
        str: Human-readable statistics block, or an error message string.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        graph = knowledge_graph.graph
        matching = [nid for nid, attrs in graph.nodes(data=True)
                    if getattr(attrs['data'], 'path', None) == path]

        if not matching:
            return f"No nodes found for path '{path}'."

        parts = [f"Statistics for '{path}':\n",
                 "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"]

        for nid in matching:
            data = graph.nodes[nid]['data']
            text = getattr(data, 'content', '')
            declared = getattr(data, 'declared_entities', [])
            called = getattr(data, 'called_entities', [])
            # Chunk children = outgoing neighbors whose node_type is 'chunk'.
            chunk_ids = [dst for _, dst in graph.out_edges(nid)
                         if getattr(graph.nodes[dst]['data'], 'node_type', None) == 'chunk']

            parts.append(f"Node: {nid} ({getattr(data, 'node_type', '?')})\n")
            parts.append(f" Lines: {len(text.splitlines()) if text else 0}\n")
            parts.append(f" Declared entities: {len(declared)}\n")

            if declared:
                for item in declared[:10]:
                    if isinstance(item, dict):
                        parts.append(f" - {item.get('name', '?')} ({item.get('type', '?')})\n")
                    else:
                        parts.append(f" - {item}\n")
                if len(declared) > 10:
                    parts.append(f" ... and {len(declared) - 10} more\n")

            parts.append(f" Called entities: {len(called)}\n")
            if called:
                for item in called[:10]:
                    parts.append(f" - {item}\n")
                if len(called) > 10:
                    parts.append(f" ... and {len(called) - 10} more\n")

            parts.append(f" Chunks: {len(chunk_ids)}\n\n")

        return "".join(parts)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def find_path(source_id: str, target_id: str, max_depth: int = 5) -> str:
    """
    Find the shortest path between two nodes in the knowledge graph.

    Renders the chain of nodes connecting source to target, if one exists
    within `max_depth` hops. Useful for tracing how two code elements relate.

    Args:
        source_id: Starting node ID (any node type).
        target_id: Destination node ID (any node type).
        max_depth: Maximum path length to search (default: 5). String values
            are coerced to int because MCP clients may pass numbers as strings.

    Returns:
        str: The path rendered node-by-node, or a not-found/error message.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # MCP transports sometimes deliver numeric parameters as strings.
        if isinstance(max_depth, str):
            try:
                max_depth = int(max_depth)
            except ValueError:
                return f"Error: 'max_depth' must be an integer, got '{max_depth}'"

        outcome = knowledge_graph.find_path(source_id, target_id, max_depth)

        if "error" in outcome:
            return f"Error: {outcome['error']}"

        hops = outcome.get("path")
        if not hops:
            return f"No path found from '{source_id}' to '{target_id}' within depth {max_depth}"

        header = (f"Path from '{source_id}' to '{target_id}':\n"
                  "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
                  f"Length: {outcome['length']}\n\n")
        # Each step ends with '\n'; joining with the arrow line reproduces the
        # "node, arrow, node" layout without an arrow after the final node.
        steps = [f"{i}. {nid}\n" for i, nid in enumerate(hops)]
        return header + " ↓\n".join(steps)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_subgraph(node_id: str, depth: int = 2, edge_types: Optional[str] = None) -> str:
    """
    Extract a local subgraph around a node up to a specified depth.

    Returns a bounded view of the graph neighborhood: every node reachable
    within `depth` hops of `node_id`, optionally restricted to certain edge
    types. Use it when get_neighbors (single hop) is not enough.

    Args:
        node_id: Central node to build the subgraph around.
        depth: Radius in hops (default: 2). String values are coerced to int
            because MCP clients may pass numbers as strings.
        edge_types: Optional comma-separated filter such as
            'calls,contains,declares'; whitespace around commas is tolerated
            and empty tokens are ignored. None/empty means all edge types.

    Returns:
        str: Node/edge counts plus up to 30 of the included node IDs, or an
        error message string.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Convert depth to int if it's a string (MCP may pass strings)
        if isinstance(depth, str):
            try:
                depth = int(depth)
            except ValueError:
                return f"Error: 'depth' must be an integer, got '{depth}'"

        # Parse the comma-separated edge-type filter. Strip whitespace around
        # each token (so 'calls, declares' matches 'declares') and drop empty
        # tokens (so a stray trailing comma doesn't filter on '').
        edge_types_list = None
        if edge_types:
            edge_types_list = [t.strip() for t in edge_types.split(",") if t.strip()] or None

        subgraph_result = knowledge_graph.get_subgraph(node_id, depth, edge_types_list)

        if "error" in subgraph_result:
            return f"Error: {subgraph_result['error']}"

        result = f"Subgraph around '{node_id}' (depth: {depth}):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
        result += f"Nodes: {subgraph_result['node_count']}\n"
        result += f"Edges: {subgraph_result['edge_count']}\n"

        if edge_types_list:
            result += f"Filtered by edge types: {', '.join(edge_types_list)}\n"

        result += "\nNodes in subgraph:\n"
        # Cap the listing at 30 IDs to keep tool output readable.
        for node in subgraph_result['nodes'][:30]:
            result += f" - {node}\n"

        if len(subgraph_result['nodes']) > 30:
            result += f" ... and {len(subgraph_result['nodes']) - 30} more\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def list_files_in_directory(directory_path: str = "", pattern: str = "*", recursive: bool = True, limit: int = 50, page: int = 1) -> str:
    """
    List repository files, optionally scoped to a directory and filtered by a
    glob pattern, with pagination.

    Args:
        directory_path: Directory to search in; empty string means the whole
            repository.
        pattern: Glob pattern matched against the full path and the bare
            filename (default '*' keeps everything).
        recursive: When True include files in subdirectories; when False only
            files sitting directly in directory_path. String forms such as
            'true'/'1'/'yes' are accepted.
        limit: Files per page (default 50).
        page: 1-based page number.

    Returns:
        str: Numbered file list with language and entity counts, or an error
        message string.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # MCP clients may deliver numeric/boolean parameters as strings.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"

        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        if isinstance(recursive, str):
            recursive = recursive.lower() in ('true', '1', 'yes')

        graph = knowledge_graph.graph
        found = []

        for nid, attrs in graph.nodes(data=True):
            data = attrs['data']
            # Only file nodes are listed.
            if getattr(data, 'node_type', None) != 'file':
                continue

            fpath = getattr(data, 'path', nid)
            fname = getattr(data, 'name', '')

            if directory_path:
                root = directory_path.rstrip('/')
                if recursive:
                    # Keep files under the directory (or the directory itself).
                    if not fpath.startswith(root + '/') and fpath != directory_path:
                        continue
                else:
                    # Keep only direct children, not files in subdirectories.
                    parent = fpath.rsplit('/', 1)[0] if '/' in fpath else ''
                    if parent != root:
                        continue

            if pattern and pattern != '*':
                # Match the pattern against the full path, the bare name, and
                # the path with a '**/' prefix so bare filename patterns can
                # also hit nested files.
                hit = (fnmatch.fnmatch(fpath, pattern)
                       or fnmatch.fnmatch(fname, pattern)
                       or fnmatch.fnmatch(fpath, f'**/{pattern}'))
                if not hit:
                    continue

            found.append({
                'path': fpath,
                'name': fname,
                'language': getattr(data, 'language', 'Unknown'),
                'entity_count': len(getattr(data, 'declared_entities', []))
            })

        # Deterministic ordering for stable pagination.
        found.sort(key=lambda entry: entry['path'])

        if not found:
            filter_desc = f" in '{directory_path}'" if directory_path else ""
            pattern_desc = f" matching '{pattern}'" if pattern and pattern != '*' else ""
            return f"No files found{filter_desc}{pattern_desc}."

        total = len(found)
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} files at {limit} per page)"

        first = (page - 1) * limit
        visible = found[first:first + limit]

        header = "Files"
        if directory_path:
            header += f" in '{directory_path}'"
        if pattern and pattern != '*':
            header += f" matching '{pattern}'"
        header += f" (Page {page}/{total_pages}, {total} total):\n"
        header += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        body = [header]
        for idx, entry in enumerate(visible, start=first + 1):
            body.append(f"{idx}. {entry['path']}\n")
            body.append(f" Language: {entry['language']}, Entities: {entry['entity_count']}\n\n")

        if page < total_pages:
            body.append(f"Use page={page + 1} to see the next page\n")

        return "".join(body)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def find_files_importing(module_or_entity: str, limit: int = 30, page: int = 1) -> str:
    """
    Find files that import or otherwise reference a module, class, or function.

    Two detection passes per file node: a case-insensitive substring scan of
    its called_entities metadata, then a regex scan of the file's first few
    chunks for import-like statements (import / from-import / require / use).

    Args:
        module_or_entity: Name to look for, matched case-insensitively.
        limit: Results per page (default 30).
        page: 1-based page number.

    Returns:
        str: Paginated list of matching files with the match type and sample
        matched entity names, or an error message string.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # MCP clients may deliver numeric parameters as strings.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"

        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        graph = knowledge_graph.graph
        needle = module_or_entity.lower()
        hits = []

        for nid, attrs in graph.nodes(data=True):
            data = attrs['data']
            if getattr(data, 'node_type', None) != 'file':
                continue

            fpath = getattr(data, 'path', nid)

            # Pass 1: substring match against the called_entities metadata.
            matched_names = []
            for entity in getattr(data, 'called_entities', []):
                text = entity.get('name', '').lower() if isinstance(entity, dict) else str(entity).lower()
                if needle in text:
                    matched_names.append(text)

            if matched_names:
                hits.append({
                    'path': fpath,
                    'name': getattr(data, 'name', ''),
                    'matched_entities': matched_names[:5],
                    'match_type': 'called_entity'
                })
                continue

            # Pass 2: regex scan of the leading chunks (imports usually live
            # at the top of a file).
            chunks = knowledge_graph.get_chunks_of_file(fpath) if hasattr(knowledge_graph, 'get_chunks_of_file') else []
            escaped = re.escape(module_or_entity)
            patterns = (
                rf'import\s+.*{escaped}',
                rf'from\s+.*{escaped}.*\s+import',
                rf'require\s*\(\s*["\'].*{escaped}',
                rf'use\s+.*{escaped}',
            )
            for chunk in chunks[:3]:
                text = getattr(chunk, 'content', '')
                if any(re.search(p, text, re.IGNORECASE) for p in patterns):
                    # Dedup by path in case a later chunk matches again.
                    if not any(entry['path'] == fpath for entry in hits):
                        hits.append({
                            'path': fpath,
                            'name': getattr(data, 'name', ''),
                            'matched_entities': [],
                            'match_type': 'import_statement'
                        })
                    break

        hits.sort(key=lambda entry: entry['path'])

        if not hits:
            return f"No files found importing '{module_or_entity}'.\n\nTip: Try searching for the module name in code content using search_nodes."

        total = len(hits)
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} files at {limit} per page)"

        first = (page - 1) * limit
        visible = hits[first:first + limit]

        out = [f"Files importing '{module_or_entity}' (Page {page}/{total_pages}, {total} total):\n",
               "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"]

        for idx, entry in enumerate(visible, start=first + 1):
            out.append(f"{idx}. {entry['path']}\n")
            out.append(f" Match type: {entry['match_type']}\n")
            if entry['matched_entities']:
                out.append(f" Matched: {', '.join(entry['matched_entities'][:3])}\n")
            out.append("\n")

        if page < total_pages:
            out.append(f"Use page={page + 1} to see the next page\n")

        return "".join(out)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_concept_overview(concept: str, limit: int = 15) -> str:
    """
    Aggregate everything related to a concept into one categorized overview.

    Scans the whole graph for entity names, file paths/names, and chunk
    content or descriptions containing the concept (case-insensitive) and
    reports related classes, functions/methods, files, and code snippets.
    Intended as the first step when exploring an unfamiliar topic.

    Args:
        concept: Term to explore, e.g. 'attention', 'embedding', 'tokenizer'.
        limit: Maximum items collected per category (default: 15); code
            snippets are capped at limit // 2.

    Returns:
        str: Categorized overview, with search suggestions when nothing
        matches, or an error message string.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # MCP clients may deliver numeric parameters as strings.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"

        graph = knowledge_graph.graph
        needle = concept.lower()

        classes_found = []
        funcs_found = []
        files_found = []
        snippets_found = []

        for nid, attrs in graph.nodes(data=True):
            data = attrs['data']
            kind = getattr(data, 'node_type', None)
            label = getattr(data, 'name', '')
            label_hit = needle in label.lower()

            if kind == 'entity':
                # Entities only match on their name.
                if not label_hit:
                    continue
                etype = getattr(data, 'entity_type', '')
                owners = getattr(data, 'declaring_chunk_ids', [])
                home = owners[0] if owners else 'Unknown'
                if etype.lower() == 'class' and len(classes_found) < limit:
                    classes_found.append({'name': label, 'id': nid, 'file': home})
                elif etype.lower() in ('function', 'method') and len(funcs_found) < limit:
                    funcs_found.append({'name': label, 'id': nid, 'type': etype, 'file': home})
            elif kind == 'file' and len(files_found) < limit:
                # Files match on path or name.
                fpath = getattr(data, 'path', '')
                if needle in fpath.lower() or label_hit:
                    files_found.append({
                        'path': fpath,
                        'name': label,
                        'entity_count': len(getattr(data, 'declared_entities', []))
                    })
            elif kind == 'chunk' and len(snippets_found) < limit // 2:
                # Chunks match on content or description.
                body = getattr(data, 'content', '')
                desc = getattr(data, 'description', '')
                if needle in body.lower() or needle in (desc or '').lower():
                    snippets_found.append({'id': nid, 'file': getattr(data, 'path', ''), 'content': body})

        out = [f"Concept Overview: '{concept}'\n",
               "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"]

        total = len(classes_found) + len(funcs_found) + len(files_found) + len(snippets_found)
        out.append(f"Found {total} related items across the codebase.\n\n")

        if classes_found:
            out.append(f"📦 Related Classes ({len(classes_found)}):\n")
            for item in classes_found[:10]:
                out.append(f" • {item['name']}\n")
                out.append(f" File: {item['file']}\n")
            if len(classes_found) > 10:
                out.append(f" ... and {len(classes_found) - 10} more\n")
            out.append("\n")

        if funcs_found:
            out.append(f"⚡ Related Functions/Methods ({len(funcs_found)}):\n")
            for item in funcs_found[:10]:
                out.append(f" • {item['name']} ({item['type']})\n")
                out.append(f" File: {item['file']}\n")
            if len(funcs_found) > 10:
                out.append(f" ... and {len(funcs_found) - 10} more\n")
            out.append("\n")

        if files_found:
            out.append(f"📄 Related Files ({len(files_found)}):\n")
            for item in files_found[:10]:
                out.append(f" • {item['path']}\n")
                out.append(f" Entities: {item['entity_count']}\n")
            if len(files_found) > 10:
                out.append(f" ... and {len(files_found) - 10} more\n")
            out.append("\n")

        if snippets_found:
            out.append(f"📝 Code Snippets ({len(snippets_found)}):\n")
            for item in snippets_found[:5]:
                out.append(f" • {item['id']}\n")
                out.append(f" Content:\n{item['content']}\n\n")
            if len(snippets_found) > 5:
                out.append(f" ... and {len(snippets_found) - 5} more\n")

        if total == 0:
            out.append("No direct matches found.\n\n")
            out.append("Suggestions:\n")
            out.append(f" • Try searching with: search_nodes('{concept}')\n")
            out.append(f" • Try partial name: search_by_type_and_name('class', '{concept[:4]}')\n")
            out.append(f" • Check entity list: list_all_entities(entity_type='class')\n")

        return "".join(out)
    except Exception as e:
        return f"Error: {str(e)}"