""" Simplified Gradio MCP Server for Knowledge Graphs loaded from HuggingFace datasets. """ import os import sys import argparse import difflib import fnmatch import re from typing import Optional, List import gradio as gr from RepoKnowledgeGraphLib.utils.chunk_utils import ( organize_chunks_by_file_name, join_organized_chunks ) # Optional Langfuse integration try: from langfuse import get_client, observe langfuse = get_client() LANGFUSE_ENABLED = langfuse.auth_check() if LANGFUSE_ENABLED: print("✓ Langfuse client is authenticated and ready!") else: print("⚠️ Langfuse authentication failed. Tracing disabled.") except Exception as e: print(f"⚠️ Langfuse not available: {e}. Tracing disabled.") LANGFUSE_ENABLED = False def observe(*args, **kwargs): def decorator(func): return func return decorator def _sanitize_value(v): if isinstance(v, str): return v.strip() if isinstance(v, dict): return {k: _sanitize_value(val) for k, val in v.items()} if isinstance(v, (list, tuple)): t = type(v) return t(_sanitize_value(x) for x in v) return v def sanitize_inputs(func): """Decorator that trims whitespace from all string args/kwargs before calling func.""" def wrapper(*args, **kwargs): new_args = tuple(_sanitize_value(a) for a in args) new_kwargs = {k: _sanitize_value(v) for k, v in kwargs.items()} return func(*new_args, **new_kwargs) # preserve original attributes try: wrapper.__name__ = func.__name__ wrapper.__doc__ = func.__doc__ except Exception: pass return wrapper # Wrap the existing `observe` decorator (from langfuse or fallback) so that # all observed tools receive sanitized inputs automatically. This avoids # having to manually add `@sanitize_inputs` above every `@observe`. try: _original_observe = observe def _observe_with_sanitize(*o_args, **o_kwargs): def decorator(f): return _original_observe(*o_args, **o_kwargs)(sanitize_inputs(f)) return decorator observe = _observe_with_sanitize except Exception: # If anything goes wrong, keep the existing observe as-is. 
pass # Add parent directory to path sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'RepoKnowledgeGraphLib')) from RepoKnowledgeGraphLib.RepoKnowledgeGraph import RepoKnowledgeGraph # Global knowledge graph instance knowledge_graph = None def initialize_knowledge_graph( hf_dataset: str, hf_token: Optional[str] = None, index_nodes: bool = True, code_index_kwargs: Optional[dict] = None ): """Initialize the knowledge graph from a HuggingFace dataset.""" global knowledge_graph model_service_kwargs = { "embedder_type": "sentence-transformers", "embed_model_name": "Salesforce/SFR-Embedding-Code-400M_R", } print(f"Loading knowledge graph from HuggingFace dataset: {hf_dataset}") knowledge_graph = RepoKnowledgeGraph.from_hf_dataset( repo_id=hf_dataset, index_nodes=index_nodes, model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs, token=hf_token ) # ==================== Tool Functions ==================== @observe(as_type="tool") def get_node_info(node_id: str) -> str: """ Retrieve comprehensive details about any node in the Transformers library knowledge graph. PURPOSE: Use this tool to inspect the full metadata and content of a specific node when you need to understand what a particular code element contains, what entities it declares or calls, and how it fits into the codebase structure. 
    WHEN TO USE:
    - After finding a node ID from search_nodes, list_nodes_by_type, or get_neighbors
    - To see the actual code content of a chunk node
    - To understand what entities (classes, functions, variables) are declared in a file or chunk
    - To examine entity metadata including aliases, declaration locations, and usage locations
    - To get file metadata like language and path information

    NODE TYPES SUPPORTED:
    - 'chunk': Code segments with content, declared/called entities, and file position
    - 'file': Source files with path, language, and entity summaries
    - 'directory': Folder nodes with path information
    - 'entity': Programming constructs (classes, functions, methods, variables) with declaration/usage tracking
    - 'repo': Repository root node

    TYPICAL WORKFLOW:
    1. search_nodes("attention mechanism") -> get node IDs
    2. get_node_info(node_id) -> see full content and metadata
    3. get_neighbors(node_id) or find_usages(entity_name) -> explore relationships

    Args:
        node_id: The unique identifier of the node
                 (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_3' for chunks,
                 or 'BertModel' for entities)

    Returns:
        str: Formatted details including node type, name, description, content (for chunks),
             declared entities, called entities, and type-specific metadata

    Example node_ids:
    - Chunk: 'src/transformers/models/bert/modeling_bert.py::chunk_5'
    - File: 'src/transformers/models/bert/modeling_bert.py'
    - Entity: 'BertModel', 'forward', 'attention_mask'
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        # Node payloads are stored under the 'data' attribute of each graph node.
        node = knowledge_graph.graph.nodes[node_id]['data']
        node_type = getattr(node, 'node_type', 'Unknown')
        node_class = node.__class__.__name__
        node_name = getattr(node, 'name', 'Unknown')
        description = getattr(node, 'description', None)

        result = f"Node Information:\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        result += f"Node ID: {node_id}\nClass: {node_class}\nName: {node_name}\nType: {node_type}\n"
        result += f"Description: {description or 'N/A'}\n"

        if node_class == 'EntityNode' or node_type == 'entity':
            # Entity nodes: report declaration/usage chunk lists (capped at 5 each).
            entity_type = getattr(node, 'entity_type', 'Unknown')
            declaring_chunk_ids = getattr(node, 'declaring_chunk_ids', [])
            calling_chunk_ids = getattr(node, 'calling_chunk_ids', [])
            aliases = getattr(node, 'aliases', [])
            result += f"\nEntity Type: {entity_type}\n"
            result += f"Aliases: {', '.join(aliases) if aliases else 'None'}\n"
            result += f"Declared in {len(declaring_chunk_ids)} chunk(s):\n"
            for cid in declaring_chunk_ids[:5]:
                result += f" - {cid}\n"
            if len(declaring_chunk_ids) > 5:
                result += f" ... and {len(declaring_chunk_ids) - 5} more\n"
            result += f"Called in {len(calling_chunk_ids)} chunk(s):\n"
            for cid in calling_chunk_ids[:5]:
                result += f" - {cid}\n"
            if len(calling_chunk_ids) > 5:
                result += f" ... and {len(calling_chunk_ids) - 5} more\n"
            result += f"\nSummary: Entity {node_id} ({node_name}) — {entity_type} declared in {len(declaring_chunk_ids)} chunk(s) and called in {len(calling_chunk_ids)} chunk(s).\n"
        else:
            # Non-entity nodes (chunk/file/directory/repo): list declared and
            # called entities (capped at 10 each).
            declared_entities = getattr(node, 'declared_entities', [])
            called_entities = getattr(node, 'called_entities', [])
            result += f"\nDeclared Entities ({len(declared_entities)}):\n"
            for entity in declared_entities[:10]:
                result += f" - {entity}\n"
            if len(declared_entities) > 10:
                result += f" ... and {len(declared_entities) - 10} more\n"
            result += f"\nCalled Entities ({len(called_entities)}):\n"
            for entity in called_entities[:10]:
                result += f" - {entity}\n"
            if len(called_entities) > 10:
                result += f" ... and {len(called_entities) - 10} more\n"

            # Add content preview for file/chunk nodes
            if node_type in ['file', 'chunk']:
                content = getattr(node, 'content', None)
                result += f"\nContent:\n{content or 'N/A'}\n"
                if hasattr(node, 'path'):
                    result += f"Path: {node.path}\n"
                if hasattr(node, 'language'):
                    result += f"Language: {node.language}\n"
                if node_type == 'chunk' and hasattr(node, 'order_in_file'):
                    result += f"Order in File: {node.order_in_file}\n"
            elif node_type == 'directory':
                if hasattr(node, 'path'):
                    result += f"Path: {node.path}\n"

            # NOTE(review): this summary uses declared_entities/called_entities,
            # which only exist in this branch — it must stay inside the else so
            # entity nodes (which print their own summary above) never reach it.
            result += f"\nSummary: Node {node_id} ({node_name}) — {node_type} with {len(declared_entities)} declared and {len(called_entities)} called entities.\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def get_node_edges(node_id: str) -> str:
    """
    List all graph edges (relationships) connected to a specific node in the knowledge graph.

    PURPOSE:
    Use this tool to understand how a node is connected to other parts of the codebase.
    Reveals the dependency structure and relationships that link code elements together.
    WHEN TO USE:
    - To discover what code calls or depends on a specific function/class
    - To find parent-child relationships (e.g., which file contains a chunk)
    - To trace declaration and usage patterns through the codebase
    - To understand the connectivity of an entity in the dependency graph
    - When you need a raw view of all relationships without filtering

    EDGE TYPES YOU'LL SEE:
    - 'contains': Parent-child (file→chunk, directory→file, repo→directory)
    - 'calls': Entity usage relationships (chunk→entity it calls)
    - 'declares': Entity declaration relationships (chunk→entity it defines)

    DIRECTION MEANINGS:
    - Incoming edges (←): Other nodes pointing TO this node (e.g., "who calls me?")
    - Outgoing edges (→): This node pointing TO others (e.g., "what do I call?")

    COMPARISON WITH get_neighbors:
    - get_node_edges: Shows edge metadata and direction, raw relationship view
    - get_neighbors: Shows neighboring node details, easier for exploration

    Args:
        node_id: The unique identifier of the node to inspect edges for

    Returns:
        str: List of incoming and outgoing edges with source/target node IDs
             and relationship types
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        g = knowledge_graph.graph
        # Directed edge views; 'relation' is the edge label set at graph-build time.
        incoming = [
            {"source": src, "target": tgt, "relation": data.get("relation", "?")}
            for src, tgt, data in g.in_edges(node_id, data=True)
        ]
        outgoing = [
            {"source": src, "target": tgt, "relation": data.get("relation", "?")}
            for src, tgt, data in g.out_edges(node_id, data=True)
        ]

        result = f"""Node Edges for '{node_id}':
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Incoming Edges ({len(incoming)}):
"""
        # Cap the listing at 20 edges per direction to keep output readable.
        for edge in incoming[:20]:
            result += f" ← {edge['source']} [{edge['relation']}]\n"
        if len(incoming) > 20:
            result += f" ... and {len(incoming) - 20} more\n"

        result += f"\nOutgoing Edges ({len(outgoing)}):\n"
        for edge in outgoing[:20]:
            result += f" → {edge['target']} [{edge['relation']}]\n"
        if len(outgoing) > 20:
            result += f" ... and {len(outgoing) - 20} more\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def search_nodes(query: str, limit: int = 10, page: int = 1) -> str:
    """
    Search the Transformers codebase using keyword matching against code content and metadata.

    PURPOSE:
    This is your PRIMARY SEARCH TOOL for exploring the codebase. Use it to find relevant
    code chunks based on natural language queries, function names, class names, comments,
    or any text that might appear in the source code.

    WHEN TO USE:
    - FIRST STEP when investigating any topic in the Transformers library
    - To find implementations of specific features (e.g., "rotary embeddings", "flash attention")
    - To locate code by function/class name when you don't have the exact node ID
    - To discover code related to a concept (e.g., "gradient checkpointing", "tokenization")
    - When you don't know where something is implemented

    SEARCH TIPS:
    - Use specific technical terms: "rope embedding" rather than just "embedding"
    - Include class/function names if known: "BertSelfAttention forward"
    - Try multiple related queries if first results aren't satisfactory
    - Results are ranked by relevance to your query

    TYPICAL WORKFLOW:
    1. search_nodes("attention mask handling") -> find relevant chunks
    2. get_node_info(chunk_id) -> examine the code content
    3. get_chunk_context(chunk_id) -> see surrounding code for fuller picture
    4. go_to_definition(entity_name) -> find where an entity is defined

    Args:
        query: Search terms to match against code content. Can be natural language,
               function names, class names, or code snippets. More specific queries
               yield better results.
        limit: Results per page (default: 10, max recommended: 50). Use smaller
               limits for faster responses.
        page: Page number starting from 1.
              Use pagination to browse through many results.

    Returns:
        str: Ranked list of matching code chunks with IDs and content previews.
             Use the returned IDs with get_node_info or get_chunk_context for full details.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Convert limit to int if it's a string (MCP may pass strings)
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"

        # Convert page to int if it's a string
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        # Fetch more results to support pagination: the index is queried for
        # everything up to the requested page, then sliced locally.
        max_fetch = limit * page
        results = knowledge_graph.code_index.query(query, n_results=max_fetch)
        # Query results follow the nested [[...]] layout (one inner list per query).
        metadatas = results.get("metadatas", [[]])[0]

        if not metadatas:
            return f"No results found for '{query}'."

        # NOTE(review): 'total' counts only the fetched results (capped at
        # limit*page), not everything in the index.
        total = len(metadatas)

        # Pagination
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} results at {limit} per page)"

        start_idx = (page - 1) * limit
        end_idx = start_idx + limit
        page_slice = metadatas[start_idx:end_idx]

        result = f"Search Results for '{query}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        for i, res in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. ID: {res.get('id', 'N/A')}\n"
            content = res.get('content', '')
            if content:
                result += f" Content: {content}\n"
            result += "\n"

        # Pagination hint
        if page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def get_graph_stats() -> str:
    """
    Get a comprehensive statistical overview of the Transformers library knowledge graph.

    PURPOSE:
    Use this tool to understand the scope and structure of the knowledge graph. Provides
    counts and breakdowns of all node types, entity types, and relationship types.

    WHEN TO USE:
    - At the START of an exploration session to understand the codebase scope
    - To learn what types of entities and relationships are available for querying
    - To understand the terminology used in this knowledge graph (chunks, entities, edges)
    - When you need to report on the overall structure of the Transformers library

    WHAT YOU'LL LEARN:
    - Total number of nodes and edges in the graph
    - Breakdown of node types (chunks, files, directories, entities)
    - Entity type distribution (classes, functions, methods, variables, etc.)
    - Edge relationship types (contains, calls, declares)
    - Definitions of key concepts used throughout the tools

    GRAPH TERMINOLOGY:
    - Chunks: Logical code segments (a function body, a class definition, etc.)
    - Entities: Named programming constructs tracked across the codebase
    - Edges: Relationships connecting nodes (contains, calls, declares)

    Returns:
        str: Detailed statistics including node counts by type, entity breakdown,
             edge relation counts, and concept definitions to help you use other
             tools effectively.
""" if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: g = knowledge_graph.graph num_nodes = g.number_of_nodes() num_edges = g.number_of_edges() # Count node types node_types = {} entity_breakdown = {} for _, node_attrs in g.nodes(data=True): node_type = getattr(node_attrs['data'], 'node_type', 'Unknown') node_types[node_type] = node_types.get(node_type, 0) + 1 # For entity nodes, get entity_type breakdown if node_type == 'entity': entity_type = getattr(node_attrs['data'], 'entity_type', 'Unknown') # Fallback: if entity_type is empty, check entities dictionary if not entity_type: node_id = node_attrs['data'].id if hasattr(node_attrs['data'], 'id') else None if node_id and node_id in knowledge_graph.entities: entity_types = knowledge_graph.entities[node_id].get('type', []) entity_type = entity_types[0] if entity_types else 'Unknown' entity_breakdown[entity_type] = entity_breakdown.get(entity_type, 0) + 1 # Count edge relations edge_relations = {} for _, _, attrs in g.edges(data=True): relation = attrs.get('relation', 'Unknown') edge_relations[relation] = edge_relations.get(relation, 0) + 1 # Build result result = f"""Knowledge Graph Statistics: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 📊 Overview: Total Nodes: {num_nodes:,} Total Edges: {num_edges:,} ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 📦 Node Types: """ # Sort node types by count for ntype, count in sorted(node_types.items(), key=lambda x: x[1], reverse=True): result += f" • {ntype}: {count:,}\n" # If this is entity type, show breakdown if ntype == 'entity' and entity_breakdown: result += f" └─ Entity Breakdown:\n" for etype, ecount in sorted(entity_breakdown.items(), key=lambda x: x[1], reverse=True): percentage = (ecount / count * 100) if count > 0 else 0 result += f" ├─ {etype}: {ecount:,} ({percentage:.1f}%)\n" result += f""" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 🔗 Edge Relations: """ for relation, count in 
sorted(edge_relations.items(), key=lambda x: x[1], reverse=True): result += f" • {relation}: {count:,}\n" # Add explanation section result += f""" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ ℹ️ Definitions: Chunks: Code segments representing logical portions of files. Each chunk contains a section of code (like a function, class, or code block) along with metadata about what entities it declares and calls. Entities: Programming constructs identified in the code including: - Classes: Class definitions - Functions: Function definitions - Methods: Class method definitions - Variables: Variable declarations - Parameters: Function/method parameters - Function_call/Method_call: Usage references Files: Source code files in the repository Directories: Folder structure containing files Repo: Root repository node Edge Relations: - contains: Parent-child relationships (file contains chunks) - declares: Entity declaration relationships - calls: Entity usage/invocation relationships """ return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def list_nodes_by_type(node_type: str, limit: int = 20, page: int = 1) -> str: """ List all nodes of a specific type in the Transformers knowledge graph with pagination. PURPOSE: Use this tool to browse and discover nodes by their type. Helpful when you want to see what classes, functions, files, or other constructs exist in the codebase. 
    WHEN TO USE:
    - To get a list of all classes in the Transformers library: node_type='class'
    - To see all Python files: node_type='file'
    - To list all functions: node_type='function'
    - To browse all methods: node_type='method'
    - When you need to find node IDs for further exploration

    VALID node_type VALUES:
    For entities (programming constructs):
    - 'class': Class definitions (e.g., BertModel, GPT2LMHeadModel)
    - 'function': Standalone function definitions
    - 'method': Class method definitions
    - 'variable': Variable declarations
    - 'parameter': Function/method parameters

    For structural nodes:
    - 'file': Source code files
    - 'chunk': Code segments within files
    - 'directory': Folder structure nodes
    - 'repo': Repository root (typically one)

    COMPARISON WITH search_by_type_and_name:
    - list_nodes_by_type: Browse ALL nodes of a type (no name filter)
    - search_by_type_and_name: Filter by type AND search by name substring

    Args:
        node_type: The type to filter by. Use lowercase: 'class', 'function',
                   'method', 'file', 'chunk', 'directory'
        limit: Maximum results per page (default: 20). Increase for broader browsing.
        page: Page number starting from 1 for pagination through large result sets.

    Returns:
        str: Alphabetically sorted list of matching nodes with their IDs and types.
             Use IDs with get_node_info for details.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Convert limit/page to int if they're strings (MCP/Gradio may pass strings)
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        g = knowledge_graph.graph
        matching_nodes = []
        # Full scan of the graph; entity nodes match on entity_type (case-insensitive),
        # all other nodes match on node_type exactly.
        for node_id, data in g.nodes(data=True):
            node = data['data']
            current_node_type = getattr(node, 'node_type', None)
            node_name = getattr(node, 'name', 'Unknown')

            # For entity nodes, check entity_type instead of node_type
            if current_node_type == 'entity':
                entity_type = getattr(node, 'entity_type', '')
                # Fallback: if entity_type is empty, check the entities dictionary
                if not entity_type and node_id in knowledge_graph.entities:
                    entity_types = knowledge_graph.entities[node_id].get('type', [])
                    entity_type = entity_types[0] if entity_types else ''
                if entity_type and entity_type.lower() == node_type.lower():
                    matching_nodes.append({
                        "id": node_id,
                        "name": node_name,
                        "type": f"entity ({entity_type})"
                    })
            # For other nodes, check node_type directly
            elif current_node_type == node_type:
                matching_nodes.append({
                    "id": node_id,
                    "name": node_name,
                    "type": current_node_type
                })

        # Sort by name for consistent ordering
        matching_nodes.sort(key=lambda x: x['name'].lower())

        total = len(matching_nodes)
        if total == 0:
            return f"No nodes found of type '{node_type}'."

        # Pagination
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} nodes at {limit} per page)"

        start_idx = (page - 1) * limit
        end_idx = start_idx + limit
        page_slice = matching_nodes[start_idx:end_idx]

        result = f"Nodes of type '{node_type}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        for i, node in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. {node['name']}\n"
            result += f" ID: {node['id']}\n"
            result += f" Type: {node['type']}\n\n"

        # Pagination hint
        if page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def get_neighbors(node_id: str, limit: int = 20, page: int = 1) -> str:
    """
    Get all nodes directly connected to a given node with their relationship information.

    PURPOSE:
    Use this tool to explore the local neighborhood of any node in the knowledge graph.
    Shows what's connected to a node and how, making it easy to navigate the codebase
    structure.

    WHEN TO USE:
    - To explore what a node is connected to (files, chunks, entities)
    - To navigate from one code element to related elements
    - To understand the local structure around a specific node
    - After using get_node_info when you want to explore connected nodes
    - To discover related code without knowing exact names

    WHAT YOU'LL SEE:
    - Neighbor node IDs and names
    - Node types (chunk, file, entity, etc.)
    - Relationship direction (→ outgoing, ← incoming)
    - Relationship type (contains, calls, declares)

    TYPICAL NAVIGATION PATTERNS:
    - From a file: see its chunks and declared entities
    - From a chunk: see entities it declares/calls and its parent file
    - From an entity: see chunks that declare or call it
    - From a directory: see contained files and subdirectories

    COMPARISON WITH get_node_edges:
    - get_neighbors: Shows neighboring NODE details (name, type) - better for exploration
    - get_node_edges: Shows raw EDGE information - better for understanding relationships

    Args:
        node_id: The ID of the node to explore neighbors for
        limit: Maximum neighbors to return per page (default: 20)
        page: Page number for pagination when node has many connections

    Returns:
        str: List of connected nodes with their IDs, names, types, and the
             relationships connecting them
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        # Convert limit/page to int if they're strings
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        neighbors = knowledge_graph.get_neighbors(node_id)
        if not neighbors:
            return f"No neighbors found for node '{node_id}'"

        total = len(neighbors)

        # Pagination
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} neighbors at {limit} per page)"

        start_idx = (page - 1) * limit
        end_idx = start_idx + limit
        page_slice = neighbors[start_idx:end_idx]

        result = f"Neighbors of '{node_id}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        for i, neighbor in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. {neighbor.id}\n"
            result += f" Name: {getattr(neighbor, 'name', 'Unknown')}\n"
            result += f" Type: {neighbor.node_type}\n"
            # Determine edge direction relative to the queried node: → means the
            # queried node points at the neighbor, ← means the neighbor points back.
            if knowledge_graph.graph.has_edge(node_id, neighbor.id):
                edge_data = knowledge_graph.graph.get_edge_data(node_id, neighbor.id)
                result += f" → Relation: {edge_data.get('relation', 'Unknown')}\n"
            elif knowledge_graph.graph.has_edge(neighbor.id, node_id):
                edge_data = knowledge_graph.graph.get_edge_data(neighbor.id, node_id)
                result += f" ← Relation: {edge_data.get('relation', 'Unknown')}\n"
            result += "\n"

        # Pagination hint
        if page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def go_to_definition(entity_name: str) -> str:
    """
    Jump to the source code location(s) where an entity is defined/declared.

    PURPOSE:
    Use this tool to find WHERE in the codebase a class, function, method, or variable
    is defined. Returns the actual code content of the definition along with file location.
    WHEN TO USE:
    - To see the implementation of a class like 'BertModel' or 'GPT2Attention'
    - To find where a function is defined when you know its name
    - To examine the source code of any entity found through search or listing
    - When you need to understand HOW something is implemented (not just WHERE it's used)
    - To get the actual code definition for analysis or explanation

    WHAT YOU'LL GET:
    - Entity type (class, function, method, variable)
    - Data type if available
    - List of all locations where the entity is declared (some entities may be
      defined in multiple places)
    - For each location: file path, chunk order, and FULL CODE CONTENT

    TYPICAL WORKFLOW:
    1. search_nodes("attention") -> find entity names
    2. go_to_definition("BertSelfAttention") -> see the class implementation
    3. find_usages("BertSelfAttention") -> see where it's used

    COMPARISON WITH find_usages:
    - go_to_definition: Shows WHERE entity is DEFINED (the implementation)
    - find_usages: Shows WHERE entity is USED/CALLED (the consumers)

    Args:
        entity_name: Exact name of the entity (case-sensitive). Examples: 'BertModel',
                     'forward', 'attention_mask', 'get_extended_attention_mask'

    Returns:
        str: Entity type, file location(s), and complete source code of the definition(s).
             Returns error if entity not found.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Entities are looked up by exact (case-sensitive) name in the registry.
        if entity_name not in knowledge_graph.entities:
            return f"Error: Entity '{entity_name}' not found in knowledge graph"

        entity_info = knowledge_graph.entities[entity_name]
        declaring_chunks = entity_info.get('declaring_chunk_ids', [])

        if not declaring_chunks:
            return f"Entity '{entity_name}' found but no declarations identified."

        result = f"Definition(s) for '{entity_name}':\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
        result += f"Type: {', '.join(entity_info.get('type', ['Unknown']))}\n"
        if entity_info.get('dtype'):
            result += f"Data Type: {entity_info['dtype']}\n"
        result += f"\nDeclared in {len(declaring_chunks)} location(s):\n\n"

        # Show at most the first 5 declaration sites in full; stale chunk ids
        # (no longer in the graph) are silently skipped.
        for i, chunk_id in enumerate(declaring_chunks[:5], 1):
            if chunk_id in knowledge_graph.graph:
                chunk = knowledge_graph.graph.nodes[chunk_id]['data']
                result += f"{i}. Chunk: {chunk_id}\n"
                result += f" File: {chunk.path}\n"
                result += f" Order: {chunk.order_in_file}\n"
                result += f" Content:\n{chunk.content}\n\n"

        if len(declaring_chunks) > 5:
            result += f"... and {len(declaring_chunks) - 5} more locations\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def find_usages(entity_name: str, limit: int = 20, page: int = 1) -> str:
    """
    Find all locations in the codebase where an entity is used or called.

    PURPOSE:
    Use this tool to understand the impact and usage patterns of any entity. Shows every
    place where a class is instantiated, a function is called, or a variable is referenced
    throughout the Transformers library.

    WHEN TO USE:
    - To understand how widely used a class or function is
    - To see usage examples of a particular API or function
    - To assess the impact of changing an entity (who depends on it?)
    - To learn how to use a class/function by seeing real examples
    - To trace data flow through the codebase

    WHAT YOU'LL GET:
    - Total count of usage locations
    - For each usage: file path, chunk position, and full code context showing the usage
    - Paginated results for entities with many usages

    TYPICAL WORKFLOWS:
    Impact Analysis:
    1. go_to_definition("deprecated_function") -> understand what it does
    2. find_usages("deprecated_function") -> see all code that needs updating

    Learning by Example:
    1. list_nodes_by_type('class') -> find interesting classes
    2.
find_usages("BertModel") -> see how it's instantiated and used COMPARISON WITH go_to_definition: - find_usages: WHERE is this entity CALLED/USED (consumers) - go_to_definition: WHERE is this entity DEFINED (implementation) Args: entity_name: Exact name of the entity to find usages for (case-sensitive) limit: Usages per page (default: 20). Many popular classes have 100+ usages. page: Page number for pagination (starts at 1) Returns: str: List of code chunks that use this entity, with file paths and full code content showing the usage in context """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: # Convert limit to int if it's a string (MCP may pass strings) if isinstance(limit, str): try: limit = int(limit) except ValueError: return f"Error: 'limit' must be an integer, got '{limit}'" # Convert page to int if it's a string if isinstance(page, str): try: page = int(page) except ValueError: return f"Error: 'page' must be an integer, got '{page}'" if entity_name not in knowledge_graph.entities: return f"Error: Entity '{entity_name}' not found in knowledge graph" if limit <= 0: return "Error: limit must be a positive integer" if page < 1: return "Error: 'page' must be a positive integer (1 or greater)" entity_info = knowledge_graph.entities[entity_name] calling_chunks = entity_info.get('calling_chunk_ids', []) if not calling_chunks: return f"Entity '{entity_name}' found but no usages identified." total = len(calling_chunks) # Pagination total_pages = (total + limit - 1) // limit if page > total_pages: return f"Error: Page {page} does not exist. 
Total pages: {total_pages} (with {total} usages at {limit} per page)" start_idx = (page - 1) * limit end_idx = start_idx + limit page_slice = calling_chunks[start_idx:end_idx] result = f"Usages of '{entity_name}' (Page {page}/{total_pages}, {total} total):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for i, chunk_id in enumerate(page_slice, start=start_idx + 1): if chunk_id in knowledge_graph.graph: chunk = knowledge_graph.graph.nodes[chunk_id]['data'] result += f"{i}. {chunk.path} (chunk {chunk.order_in_file})\n" result += f" Content:\n{chunk.content}\n\n" # Pagination hint if page < total_pages: result += f"Use page={page + 1} to see the next page\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def get_file_structure(file_path: str) -> str: """ Get a structural overview of a source file showing its chunks and declared entities. PURPOSE: Use this tool to understand the organization of a specific file. Shows what classes, functions, and other entities are defined in the file, plus how the file is divided into chunks. WHEN TO USE: - To get a table of contents for a file before diving into specifics - To see what classes and functions a file defines - To understand how code is organized within a file - To find chunk IDs for further exploration with get_node_info or get_chunk_context - When you know the file path but need to understand its contents WHAT YOU'LL SEE: - File path and detected programming language - Total number of code chunks in the file - List of declared entities (classes, functions, methods, variables) with their types - Ordered list of chunks with their IDs and descriptions HOW TO GET FILE PATHS: - Use list_files_in_directory() to browse files - Use search_nodes() and look at file paths in results - Use list_nodes_by_type('file') to get file node IDs (which are the paths) TYPICAL WORKFLOW: 1. list_files_in_directory('src/transformers/models/bert') -> find files 2. 
get_file_structure('src/transformers/models/bert/modeling_bert.py') -> see structure 3. get_node_info(chunk_id) -> examine specific code chunks Args: file_path: The full path to the file (e.g., 'src/transformers/models/bert/modeling_bert.py'). Must match exactly as stored in the knowledge graph. Returns: str: File overview including language, chunk count, declared entities list, and chunk descriptions """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if file_path not in knowledge_graph.graph: return f"Error: File '{file_path}' not found in knowledge graph" file_node = knowledge_graph.graph.nodes[file_path]['data'] chunks = knowledge_graph.get_chunks_of_file(file_path) result = f"File Structure: {file_node.name}\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" result += f"Path: {file_path}\n" result += f"Language: {getattr(file_node, 'language', 'Unknown')}\n" result += f"Total Chunks: {len(chunks)}\n\n" if hasattr(file_node, 'declared_entities') and file_node.declared_entities: result += f"Declared Entities ({len(file_node.declared_entities)}):\n" for entity in file_node.declared_entities[:15]: if isinstance(entity, dict): result += f" - {entity.get('name', '?')} ({entity.get('type', '?')})\n" else: result += f" - {entity}\n" if len(file_node.declared_entities) > 15: result += f" ... and {len(file_node.declared_entities) - 15} more\n" result += f"\nChunks:\n" for chunk in chunks[:10]: result += f" [{chunk.order_in_file}] {chunk.id}\n" if chunk.description: desc = chunk.description[:80] + "..." if len(chunk.description) > 80 else chunk.description result += f" {desc}\n" if len(chunks) > 10: result += f" ... and {len(chunks) - 10} more chunks\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def get_related_chunks(chunk_id: str, relation_type: str = "calls", limit: int = 20, page: int = 1) -> str: """ Find code chunks connected to a given chunk through a specific relationship type. 
PURPOSE: Use this tool to trace code dependencies by following relationship edges from a chunk. Helps understand what code a chunk depends on or what depends on it. WHEN TO USE: - To find what entities/code a chunk calls or uses (relation_type='calls') - To trace dependencies from a specific piece of code - To explore the call graph emanating from a chunk - When you have a chunk ID and want to see connected code RELATIONSHIP TYPES: - 'calls': Entities/chunks that this chunk calls or references (most common) - 'contains': Child nodes contained by this node (for files/directories) - 'declares': Entities declared by this chunk - 'all' or '': Get all outgoing relationships regardless of type TYPICAL WORKFLOW: 1. search_nodes("BertAttention forward") -> find a chunk 2. get_related_chunks(chunk_id, 'calls') -> see what it calls 3. get_node_info(related_chunk_id) -> examine called code COMPARISON WITH OTHER TOOLS: - get_neighbors: All connected nodes (any direction, any type) - get_related_chunks: Outgoing edges only, filtered by relationship type - entity_relationships: Focused on entity nodes and their relationships Args: chunk_id: The ID of the chunk to explore from (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_5') relation_type: Filter by relationship type: 'calls', 'contains', 'declares', or 'all' for everything (default: 'calls') limit: Maximum results per page (default: 20) page: Page number for pagination Returns: str: List of related chunks with their IDs, file paths, and entity names involved in the relationship """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if chunk_id not in knowledge_graph.graph: return f"Error: Chunk '{chunk_id}' not found in knowledge graph" # Convert limit/page to int if they're strings if isinstance(limit, str): try: limit = int(limit) except ValueError: return f"Error: 'limit' must be an integer, got '{limit}'" if isinstance(page, str): try: page = int(page) except ValueError: return 
f"Error: 'page' must be an integer, got '{page}'" if limit <= 0: return "Error: limit must be a positive integer" if page < 1: return "Error: 'page' must be a positive integer (1 or greater)" related = [] if relation_type == "" or relation_type == "all": # Get all outgoing edges regardless of relation type for _, target, attrs in knowledge_graph.graph.out_edges(chunk_id, data=True): target_node = knowledge_graph.graph.nodes[target]['data'] related.append({ "id": target, "file_path": getattr(target_node, 'path', 'Unknown'), "entity_name": attrs.get('entity_name') }) else: for _, target, attrs in knowledge_graph.graph.out_edges(chunk_id, data=True): if attrs.get('relation') == relation_type: target_node = knowledge_graph.graph.nodes[target]['data'] related.append({ "id": target, "file_path": getattr(target_node, 'path', 'Unknown'), "entity_name": attrs.get('entity_name') }) if not related: return f"No chunks found with '{relation_type}' relationship from '{chunk_id}'" total = len(related) # Pagination total_pages = (total + limit - 1) // limit if page > total_pages: return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} results at {limit} per page)" start_idx = (page - 1) * limit end_idx = start_idx + limit page_slice = related[start_idx:end_idx] result = f"Chunks related to '{chunk_id}' via '{relation_type}' (Page {page}/{total_pages}, {total} total):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for i, chunk in enumerate(page_slice, start=start_idx + 1): result += f"{i}. 
{chunk['id']}\n" result += f" File: {chunk['file_path']}\n" if chunk['entity_name']: result += f" Entity: {chunk['entity_name']}\n" result += "\n" # Pagination hint if page < total_pages: result += f"Use page={page + 1} to see the next page\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def list_all_entities( limit: int = 50, page: int = 1, entity_type: Optional[str] = None, declared_in_repo: Optional[bool] = None, called_in_repo: Optional[bool] = None ) -> str: """ Browse all programming entities (classes, functions, methods, variables) tracked in the knowledge graph. PURPOSE: Use this tool to explore the full inventory of code entities in the Transformers library. Supports filtering by type and usage patterns, making it powerful for targeted exploration. WHEN TO USE: - To browse all classes, functions, or methods in the codebase - To find entities that are defined but never used (dead code analysis) - To find external entities that are called but not defined in the repo - To get an overview of entity distribution in the codebase - When you need entity names for use with go_to_definition or find_usages FILTERING OPTIONS: By entity_type: - 'class': Class definitions (BertModel, GPT2Config, etc.) 
- 'function': Standalone functions - 'method': Class methods - 'variable': Variable declarations - 'parameter': Function/method parameters - None: All entity types By declaration status (declared_in_repo): - True: Only entities DEFINED in this repo (has source code) - False: Only external entities (imported from other packages) - None: All entities By usage status (called_in_repo): - True: Only entities that ARE USED somewhere in the code - False: Only entities that are NEVER USED (potential dead code) - None: All entities USEFUL FILTER COMBINATIONS: - All classes: entity_type='class' - Defined classes: entity_type='class', declared_in_repo=True - Unused functions: entity_type='function', called_in_repo=False - External dependencies: declared_in_repo=False, called_in_repo=True Args: limit: Entities per page (default: 50). Use larger values for comprehensive listings. page: Page number starting from 1 for pagination entity_type: Filter by type: 'class', 'function', 'method', 'variable', 'parameter', or None for all declared_in_repo: True=defined in repo, False=external only, None=all called_in_repo: True=has usages, False=never used, None=all Returns: str: List of entities with their types, declaration count, and usage count. Use entity names with go_to_definition or find_usages. 
""" if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: # Convert limit to int if it's a string (MCP may pass strings) if isinstance(limit, str): try: limit = int(limit) except ValueError: return f"Error: 'limit' must be an integer, got '{limit}'" # Convert page to int if it's a string (MCP may pass strings) if isinstance(page, str): try: page = int(page) except ValueError: return f"Error: 'page' must be an integer, got '{page}'" if page < 1: return "Error: 'page' must be a positive integer (1 or greater)" # Handle entity_type - empty string should be treated as None if entity_type == "" or entity_type == "null": entity_type = None # Handle declared_in_repo - convert string to bool if needed if isinstance(declared_in_repo, str): if declared_in_repo.lower() in ("true", "1", "yes"): declared_in_repo = True elif declared_in_repo.lower() in ("false", "0", "no"): declared_in_repo = False elif declared_in_repo.lower() in ("none", "null", "all", ""): declared_in_repo = None # Handle called_in_repo - convert string to bool if needed if isinstance(called_in_repo, str): if called_in_repo.lower() in ("true", "1", "yes"): called_in_repo = True elif called_in_repo.lower() in ("false", "0", "no"): called_in_repo = False elif called_in_repo.lower() in ("none", "null", "all", ""): called_in_repo = None if not knowledge_graph.entities: return "No entities found in the knowledge graph." 
# Filter entities based on criteria filtered_entities = {} for entity_name, info in knowledge_graph.entities.items(): # Filter by entity type if specified if entity_type is not None: entity_types = [t.lower() if t else '' for t in info.get('type', [])] if entity_type.lower() not in entity_types: continue # Filter by declared_in_repo if specified if declared_in_repo is not None: has_declaration = len(info.get('declaring_chunk_ids', [])) > 0 if declared_in_repo and not has_declaration: continue if not declared_in_repo and has_declaration: continue # Filter by called_in_repo (usages) if specified if called_in_repo is not None: has_calls = len(info.get('calling_chunk_ids', [])) > 0 if called_in_repo and not has_calls: continue if not called_in_repo and has_calls: continue filtered_entities[entity_name] = info # Build the response with filtered entities if not filtered_entities: filter_desc = [] if entity_type: filter_desc.append(f"type={entity_type}") if declared_in_repo is not None: filter_desc.append(f"declared_in_repo={declared_in_repo}") if called_in_repo is not None: filter_desc.append(f"called_in_repo={called_in_repo}") filter_text = f" (filtered by {', '.join(filter_desc)})" if filter_desc else "" return f"No entities found{filter_text}." # Calculate pagination total_entities = len(filtered_entities) total_pages = (total_entities + limit - 1) // limit # Ceiling division if page > total_pages: return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total_entities} entities at {limit} per page)" start_idx = (page - 1) * limit end_idx = start_idx + limit # Get the paginated slice of entities entity_items = list(filtered_entities.items()) paginated_items = entity_items[start_idx:end_idx] result = f"All Entities (Page {page}/{total_pages}, {total_entities} total):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for i, (entity_name, info) in enumerate(paginated_items, start=start_idx + 1): result += f"{i}. 
{entity_name}\n" result += f" Types: {', '.join(info.get('type', ['Unknown']))}\n" result += f" Declarations: {len(info.get('declaring_chunk_ids', []))}\n" result += f" Usages: {len(info.get('calling_chunk_ids', []))}\n\n" # Add pagination info result += f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n" result += f"Showing {start_idx + 1}-{min(end_idx, total_entities)} of {total_entities} entities\n" result += f"Page {page} of {total_pages}\n" if page < total_pages: result += f"Use page={page + 1} to see the next page\n" # Add filter information if entity_type: result += f"\n(Filtered by type={entity_type})\n" if declared_in_repo is not None: result += f"(Filtered by declared_in_repo={declared_in_repo})\n" if called_in_repo is not None: result += f"(Filtered by called_in_repo={called_in_repo})\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def diff_chunks(node_id_1: str, node_id_2: str) -> str: """ Compare two code chunks and show their differences in unified diff format. PURPOSE: Use this tool to compare two pieces of code side-by-side. Shows exactly what's different between them using standard unified diff format (like git diff). WHEN TO USE: - To compare similar implementations (e.g., two attention mechanisms) - To understand differences between related classes or functions - To analyze variations in code patterns across the codebase - To compare two versions or implementations of similar functionality - When you suspect code duplication and want to see exact differences DIFF FORMAT: - Lines starting with '-' are only in the first chunk - Lines starting with '+' are only in the second chunk - Lines without prefix are common to both - @@ markers show line number context TYPICAL WORKFLOW: 1. search_nodes("attention") -> find attention implementations 2. Get chunk IDs from two different attention classes 3. 
diff_chunks(chunk_id_1, chunk_id_2) -> compare implementations COMPARISON IDEAS: - BertAttention vs GPT2Attention - Different forward() implementations - Similar utility functions in different modules Args: node_id_1: ID of the first chunk/node to compare node_id_2: ID of the second chunk/node to compare Returns: str: Unified diff output showing line-by-line differences. Returns 'No differences found' if chunks are identical. """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if node_id_1 not in knowledge_graph.graph: return f"Error: Node '{node_id_1}' not found in knowledge graph" if node_id_2 not in knowledge_graph.graph: return f"Error: Node '{node_id_2}' not found in knowledge graph" g = knowledge_graph.graph content1 = getattr(g.nodes[node_id_1]['data'], 'content', None) content2 = getattr(g.nodes[node_id_2]['data'], 'content', None) if not content1 or not content2: return "Error: One or both nodes have no content." diff = list(difflib.unified_diff( content1.splitlines(), content2.splitlines(), fromfile=node_id_1, tofile=node_id_2, lineterm="" )) if not diff: return "No differences found between the two chunks." return "\n".join(diff) except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def print_tree(root_id: str = "root", max_depth: int = 3) -> str: """ Display a hierarchical tree view of the repository structure starting from any node. PURPOSE: Use this tool to visualize the structure of the codebase. Shows parent-child relationships in a familiar tree format, helping you understand how files and directories are organized. 
WHEN TO USE: - To explore the directory structure of the Transformers repository - To see what's inside a specific directory (use directory as root_id) - To understand the file organization for a component - To get an overview of the codebase hierarchy - When you need to understand where files are located TREE VISUALIZATION: - Each level shows node name and type (repo, directory, file, chunk) - Indentation represents depth in the hierarchy - Children are limited to prevent overwhelming output TIPS: - Start with max_depth=2 for a high-level overview - Increase max_depth to see more detail (but output gets larger) - Use a directory path as root_id to focus on a specific area - Use list_files_in_directory for more detailed file listings TYPICAL USAGE: - print_tree('root', max_depth=2) -> see top-level structure - print_tree('src/transformers/models', max_depth=2) -> see model organization - print_tree('src/transformers/models/bert', max_depth=3) -> see bert module structure Args: root_id: Starting node ID. Use 'root' for repository root, or a directory/file path to start from a specific location. max_depth: How many levels deep to show (default: 3). Higher values show more detail but larger output. 
Returns: str: ASCII tree visualization showing the hierarchical structure with node names and types """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: # Convert max_depth to int if it's a string (MCP may pass strings) if isinstance(max_depth, str): try: max_depth = int(max_depth) except ValueError: return f"Error: 'max_depth' must be an integer, got '{max_depth}'" g = knowledge_graph.graph if root_id not in g: # Try to find a suitable root roots = [n for n, d in g.nodes(data=True) if getattr(d['data'], 'node_type', None) in ('repo', 'directory', 'file')] if roots: root_id = roots[0] else: return f"Error: Node '{root_id}' not found and no suitable root found" result = f"Tree View (starting from '{root_id}', max depth: {max_depth}):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" def format_node(node_id, depth): if depth > max_depth: return "" node = g.nodes[node_id]['data'] name = getattr(node, 'name', node_id) node_type = getattr(node, 'node_type', '?') line = " " * depth + f"- {name} ({node_type})\n" children = [t for s, t in g.out_edges(node_id)] for child in children[:20]: # Limit children to prevent huge output line += format_node(child, depth + 1) if len(children) > 20: line += " " * (depth + 1) + f"... and {len(children) - 20} more\n" return line result += format_node(root_id, 0) return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def entity_relationships(node_id: str) -> str: """ Display all incoming and outgoing relationships for any node, with relationship types. PURPOSE: Use this tool to get a complete picture of how a node connects to the rest of the knowledge graph. Shows both what points TO this node and what this node points TO. 
WHEN TO USE: - To understand all dependencies of an entity - To see what declares or calls a specific entity - To trace the full relationship network around any node - When you need more detail than get_neighbors provides about relationship types - For entity-centric analysis (understanding a class or function's connections) WHAT YOU'LL SEE: - Incoming relationships: Other nodes that have edges pointing TO this node (e.g., chunks that CALL this function, files that CONTAIN this chunk) - Outgoing relationships: This node's edges pointing TO other nodes (e.g., entities this chunk CALLS, chunks this file CONTAINS) - Relationship types for each edge (calls, declares, contains) COMPARISON WITH SIMILAR TOOLS: - get_node_edges: Same information but different formatting - get_neighbors: Shows neighbor node details, not edge details - get_related_chunks: Filtered by relationship type, chunks only TYPICAL WORKFLOW: 1. go_to_definition("BertModel") -> find entity 2. entity_relationships("BertModel") -> see what calls/uses BertModel Args: node_id: The ID of any node (entity, chunk, file, directory) Returns: str: Complete list of incoming and outgoing relationships with source/target IDs and relationship types """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if node_id not in knowledge_graph.graph: return f"Error: Node '{node_id}' not found in knowledge graph" g = knowledge_graph.graph result = f"Relationships for '{node_id}':\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" incoming = list(g.in_edges(node_id, data=True)) outgoing = list(g.out_edges(node_id, data=True)) if incoming: result += f"Incoming Relationships ({len(incoming)}):\n" for source, target, data in incoming[:20]: result += f" ← {source} [{data.get('relation', '?')}]\n" if len(incoming) > 20: result += f" ... 
and {len(incoming) - 20} more\n" result += "\n" if outgoing: result += f"Outgoing Relationships ({len(outgoing)}):\n" for source, target, data in outgoing[:20]: result += f" → {target} [{data.get('relation', '?')}]\n" if len(outgoing) > 20: result += f" ... and {len(outgoing) - 20} more\n" if not incoming and not outgoing: result += "No relationships found.\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def search_by_type_and_name(node_type: str, name_query: str, limit: int = 10, page: int = 1, partial_allowed: bool = True) -> str: """ Search for nodes by combining type filtering with name pattern matching. PURPOSE: Use this tool for precise, targeted searches when you know the type of node you're looking for and have a partial name. More efficient than list_nodes_by_type when you have name hints. WHEN TO USE: - To find all classes containing 'Attention': search_by_type_and_name('class', 'Attention') - To find functions with 'forward' in name: search_by_type_and_name('function', 'forward') - To find files named 'config': search_by_type_and_name('file', 'config') - When you know the type AND have a partial name to search for - For pattern-based discovery of related components SEARCH BEHAVIOR: - Case-insensitive matching - partial_allowed=True (default): Fuzzy matching, finds 'BertEmbeddings' when searching 'Embed' - partial_allowed=False: Requires exact substring match - Results sorted by match quality (exact matches first, then substring, then fuzzy) VALID node_type VALUES: For entities: 'class', 'function', 'method', 'variable', 'parameter' For structural: 'file', 'chunk', 'directory' SEARCH EXAMPLES: - All Attention classes: search_by_type_and_name('class', 'Attention') - All Embedding classes: search_by_type_and_name('class', 'Embedding') - Config files: search_by_type_and_name('file', 'config') - Forward methods: search_by_type_and_name('method', 'forward') - Test files: search_by_type_and_name('file', 'test_') 
COMPARISON WITH SIMILAR TOOLS: - search_nodes: Full-text search in code content (doesn't filter by type) - list_nodes_by_type: Lists all of a type (no name filter) - search_by_type_and_name: Combines type filter + name search (best of both) Args: node_type: Type to filter by: 'class', 'function', 'method', 'file', 'chunk', 'directory', etc. name_query: Name pattern to search for (case-insensitive). Can be partial. limit: Results per page (default: 10) page: Page number for pagination partial_allowed: Enable fuzzy matching (default: True). Set False for stricter matching. Returns: str: Matching nodes sorted by relevance, with IDs and types. Use IDs with get_node_info for details. """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: # Convert limit/page to int if they're strings (MCP/Gradio may pass strings) if isinstance(limit, str): try: limit = int(limit) except ValueError: return f"Error: 'limit' must be an integer, got '{limit}'" if isinstance(page, str): try: page = int(page) except ValueError: return f"Error: 'page' must be an integer, got '{page}'" # Convert partial_allowed to bool if it's a string if isinstance(partial_allowed, str): partial_allowed = partial_allowed.lower() in ('true', '1', 'yes') if limit <= 0: return "Error: limit must be a positive integer" if page < 1: return "Error: 'page' must be a positive integer (1 or greater)" g = knowledge_graph.graph matches = [] query_lower = name_query.lower() # Build regex pattern for partial_allowed matching # This will match names containing all characters of the query in order if partial_allowed: # Create pattern that matches query as substring or with characters spread out # e.g., "Embed" matches "Embedding", "BertEmbeddings", "EmbedLayer" partial_pattern = '.*'.join(re.escape(c) for c in query_lower) partial_regex = re.compile(partial_pattern, re.IGNORECASE) for nid, n in g.nodes(data=True): node = n['data'] node_name = getattr(node, 'name', '') if not node_name: continue 
# Check if name matches the query name_matches = False if partial_allowed: # Partial match: substring match OR regex pattern match if query_lower in node_name.lower() or partial_regex.search(node_name): name_matches = True else: # Exact substring match if query_lower in node_name.lower(): name_matches = True if not name_matches: continue # Check type based on node_type current_node_type = getattr(node, 'node_type', None) # For entity nodes, check entity_type instead of node_type if current_node_type == 'entity': entity_type = getattr(node, 'entity_type', '') # Fallback: if entity_type is empty, check the entities dictionary # This handles cases where EntityNode was created before the fix if not entity_type and nid in knowledge_graph.entities: entity_types = knowledge_graph.entities[nid].get('type', []) entity_type = entity_types[0] if entity_types else '' if entity_type and entity_type.lower() == node_type.lower(): # Calculate match score for sorting (exact matches first) score = 0 if query_lower == node_name.lower() else (1 if query_lower in node_name.lower() else 2) matches.append({ "id": nid, "name": node_name, "type": f"entity ({entity_type})", "score": score }) # For other nodes, check node_type directly elif current_node_type == node_type: score = 0 if query_lower == node_name.lower() else (1 if query_lower in node_name.lower() else 2) matches.append({ "id": nid, "name": node_name, "type": current_node_type, "score": score }) # Sort by match score (best matches first) matches.sort(key=lambda x: (x['score'], x['name'].lower())) total = len(matches) if total == 0: return f"No matches for type '{node_type}' and name containing '{name_query}'." # Pagination total_pages = (total + limit - 1) // limit if page > total_pages: return f"Error: Page {page} does not exist. 
Total pages: {total_pages} (with {total} results at {limit} per page)" start_idx = (page - 1) * limit end_idx = start_idx + limit page_slice = matches[start_idx:end_idx] result = f"Matches for type '{node_type}' and name '{name_query}' (Page {page}/{total_pages}, {total} total):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for i, match in enumerate(page_slice, start=start_idx + 1): result += f"{i}. {match['name']}\n" result += f" ID: {match['id']}\n" result += f" Type: {match['type']}\n\n" if page < total_pages: result += f"Use page={page + 1} to see the next page\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def get_chunk_context(node_id: str) -> str: """ Get expanded code context by retrieving a chunk along with its previous and next chunks. PURPOSE: Use this tool when you need to see MORE CODE CONTEXT around a specific chunk. Chunks are logical code segments, but sometimes you need to see surrounding code to fully understand the implementation. WHEN TO USE: - After search_nodes or get_node_info when you need more surrounding context - When a chunk shows a partial function/class and you need the complete picture - To understand code flow across chunk boundaries - To see imports or setup code that precedes a chunk - To see what code follows after a chunk WHAT YOU'LL GET: - The previous chunk's content (if it exists) - The target chunk's content - The next chunk's content (if it exists) - All organized by file and joined together seamlessly CONTEXT EXPANSION: - Shows up to 3 consecutive chunks (prev + current + next) - Useful for understanding function bodies that span chunks - Helps see class context when looking at individual methods TYPICAL WORKFLOW: 1. search_nodes("attention forward") -> find relevant chunk 2. get_node_info(chunk_id) -> see chunk content 3. 
get_chunk_context(chunk_id) -> see surrounding code for fuller understanding COMPARISON WITH get_node_info: - get_node_info: Single chunk content + full metadata - get_chunk_context: Expanded code view (prev + current + next chunks), less metadata Args: node_id: The chunk ID to get context for (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_5') Returns: str: Combined content of previous, current, and next chunks organized by file. Provides seamless code view. """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if node_id not in knowledge_graph.graph: return f"Error: Node '{node_id}' not found in knowledge graph" g = knowledge_graph.graph current_chunk = g.nodes[node_id]['data'] previous_chunk = knowledge_graph.get_previous_chunk(node_id) next_chunk = knowledge_graph.get_next_chunk(node_id) # Collect all chunks (previous, current, next) chunks = [] if previous_chunk: chunks.append(previous_chunk) chunks.append(current_chunk) if next_chunk: chunks.append(next_chunk) # Organize and join chunks organized = organize_chunks_by_file_name(chunks) full_content = join_organized_chunks(organized) return full_content except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def get_file_stats(path: str) -> str: """ Get detailed statistics and metrics for a specific file or directory. PURPOSE: Use this tool to get quantitative metrics about a file including line counts, entity counts, and chunk counts. Useful for understanding file complexity. 
WHEN TO USE: - To assess the size and complexity of a file - To see summary counts of entities declared and called - To understand how a file is chunked - For code metrics and analysis tasks - When deciding which files to explore further METRICS PROVIDED: - Line count (total lines in the file) - Declared entities count with a sample list - Called entities count with a sample list - Number of chunks the file is divided into COMPARISON WITH get_file_structure: - get_file_stats: Quantitative metrics (counts, numbers) - get_file_structure: Qualitative overview (entity names, chunk IDs) TYPICAL USAGE: - get_file_stats('src/transformers/models/bert/modeling_bert.py') -> see metrics - Use this to identify large/complex files before diving in Args: path: The file path to analyze. Must match the path as stored in the knowledge graph. Returns: str: Statistics including line count, declared entities, called entities, and chunk count """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: g = knowledge_graph.graph nodes = [n for n, d in g.nodes(data=True) if getattr(d['data'], 'path', None) == path] if not nodes: return f"No nodes found for path '{path}'." result = f"Statistics for '{path}':\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for node_id in nodes: node = g.nodes[node_id]['data'] content = getattr(node, 'content', '') declared = getattr(node, 'declared_entities', []) called = getattr(node, 'called_entities', []) chunks = [t for s, t in g.out_edges(node_id) if getattr(g.nodes[t]['data'], 'node_type', None) == 'chunk'] result += f"Node: {node_id} ({getattr(node, 'node_type', '?')})\n" result += f" Lines: {len(content.splitlines()) if content else 0}\n" result += f" Declared entities: {len(declared)}\n" if declared: for entity in declared[:10]: if isinstance(entity, dict): result += f" - {entity.get('name', '?')} ({entity.get('type', '?')})\n" else: result += f" - {entity}\n" if len(declared) > 10: result += f" ... 
@observe(as_type="tool")
def find_path(source_id: str, target_id: str, max_depth: int = 5) -> str:
    """
    Find the shortest path between two nodes in the knowledge graph.

    PURPOSE: Use this tool to discover how two code elements are connected through the graph.
    Reveals the chain of relationships linking two seemingly unrelated pieces of code.

    WHEN TO USE:
    - To understand how two classes/functions are related
    - To trace dependency chains between components
    - To discover indirect connections between code elements
    - To verify if two nodes are connected at all
    - For understanding code architecture and coupling

    WHAT YOU'LL GET:
    - Path length (number of hops)
    - Ordered list of nodes from source to target
    - Visual representation of the path

    LIMITATIONS:
    - max_depth limits search to avoid long computations
    - If no path found within max_depth, nodes may still be connected via longer path
    - Very distant nodes may require increasing max_depth

    EXAMPLE QUERIES:
    - How is BertModel connected to GPT2Model?
    - What's the path from a utility function to a model class?
    - How many hops between two files?

    TYPICAL WORKFLOW:
    1. Identify two node IDs of interest
    2. find_path(source, target) -> discover connection
    3. get_node_info for nodes in the path to understand the relationship

    Args:
        source_id: Starting node ID (any node type)
        target_id: Destination node ID (any node type)
        max_depth: Maximum path length to search (default: 5). Increase for distant nodes.

    Returns:
        str: Path from source to target showing each node in sequence, or message if no path found
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit numeric parameters as strings; coerce first.
        if isinstance(max_depth, str):
            try:
                max_depth = int(max_depth)
            except ValueError:
                return f"Error: 'max_depth' must be an integer, got '{max_depth}'"

        path_result = knowledge_graph.find_path(source_id, target_id, max_depth)
        if "error" in path_result:
            return f"Error: {path_result['error']}"
        if not path_result.get("path"):
            return f"No path found from '{source_id}' to '{target_id}' within depth {max_depth}"

        # Assemble the report as a list of fragments, joined once at the end.
        hops = path_result['path']
        pieces = [
            f"Path from '{source_id}' to '{target_id}':\n",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n",
            f"Length: {path_result['length']}\n\n",
        ]
        last_index = len(hops) - 1
        for position, hop in enumerate(hops):
            pieces.append(f"{position}. {hop}\n")
            if position != last_index:
                pieces.append(" ↓\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_subgraph(node_id: str, depth: int = 2, edge_types: Optional[str] = None) -> str:
    """
    Extract a local subgraph around a node up to a specified depth.

    PURPOSE: Use this tool to get a bounded view of the graph neighborhood around any node.
    Shows all nodes reachable within a certain number of hops, optionally filtered by edge type.

    WHEN TO USE:
    - To understand the local network around a class or function
    - To extract a bounded region of the knowledge graph for analysis
    - To see all nodes within N hops of a target node
    - To analyze the dependency neighborhood of a component
    - When get_neighbors isn't enough and you need multi-hop exploration

    DEPTH EXPLANATION:
    - depth=1: Only immediate neighbors (same as get_neighbors)
    - depth=2: Neighbors and their neighbors (2 hops)
    - depth=3+: Larger neighborhood (exponentially more nodes)

    EDGE TYPE FILTERING:
    - Pass comma-separated edge types to filter: 'calls,declares'
    - Common types: 'calls', 'contains', 'declares'
    - Leave empty or None for all edge types

    OUTPUT:
    - Node count and edge count in the subgraph
    - List of all node IDs in the extracted subgraph
    - Filtered by edge types if specified

    TYPICAL WORKFLOW:
    1. Find a central node of interest
    2. get_subgraph(node_id, depth=2) -> see local neighborhood
    3. Use node IDs from result with get_node_info for details

    COMPARISON WITH get_neighbors:
    - get_neighbors: Single hop, shows node details
    - get_subgraph: Multi-hop, shows subgraph structure and counts

    Args:
        node_id: Central node to build subgraph around
        depth: Radius in hops from central node (default: 2). Higher = larger subgraph.
        edge_types: Optional comma-separated filter: 'calls,contains,declares' or None for all

    Returns:
        str: Subgraph summary with node/edge counts and list of included node IDs
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit numeric parameters as strings; coerce first.
        if isinstance(depth, str):
            try:
                depth = int(depth)
            except ValueError:
                return f"Error: 'depth' must be an integer, got '{depth}'"

        selected_types = edge_types.split(",") if edge_types else None
        extraction = knowledge_graph.get_subgraph(node_id, depth, selected_types)
        if "error" in extraction:
            return f"Error: {extraction['error']}"

        pieces = [
            f"Subgraph around '{node_id}' (depth: {depth}):\n",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n",
            f"Nodes: {extraction['node_count']}\n",
            f"Edges: {extraction['edge_count']}\n",
        ]
        if selected_types:
            pieces.append(f"Filtered by edge types: {', '.join(selected_types)}\n")
        pieces.append("\nNodes in subgraph:\n")
        member_ids = extraction['nodes']
        for member in member_ids[:30]:
            pieces.append(f" - {member}\n")
        overflow = len(member_ids) - 30
        if overflow > 0:
            pieces.append(f" ... and {overflow} more\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def list_files_in_directory(directory_path: str = "", pattern: str = "*", recursive: bool = True, limit: int = 50, page: int = 1) -> str:
    """
    Browse and list files in the repository with flexible filtering options.

    PURPOSE: Use this tool to explore the file structure of the Transformers library.
    Supports directory scoping, glob patterns, and recursive/non-recursive modes.

    WHEN TO USE:
    - To see what files exist in a directory
    - To find files by pattern (e.g., all Python files, all test files)
    - To explore the repository structure directory by directory
    - To find specific file types in specific locations
    - When you need file paths for use with other tools

    FILTERING OPTIONS:
    directory_path:
    - Empty string '': Search all files in the repository
    - 'src/transformers/models': Only files under this directory
    - 'src/transformers/models/bert': Focus on a specific model
    pattern (glob patterns):
    - '*': All files (default)
    - '*.py': Python files only
    - 'test_*.py': Test files
    - '*config*': Files with 'config' in name
    - 'modeling_*.py': Modeling files
    recursive:
    - True (default): Include files in subdirectories
    - False: Only files directly in the specified directory

    COMMON USE CASES:
    - All files: list_files_in_directory()
    - Bert model files: list_files_in_directory('src/transformers/models/bert')
    - All Python files: list_files_in_directory(pattern='*.py')
    - Test files only: list_files_in_directory(pattern='test_*.py')
    - Config files: list_files_in_directory(pattern='*config*')

    COMPARISON WITH print_tree:
    - print_tree: Visual hierarchy, includes directories
    - list_files_in_directory: Flat file list with details, better for finding specific files

    Args:
        directory_path: Directory to search in. Empty string for entire repository.
        pattern: Glob pattern for filename filtering (default: '*' matches all)
        recursive: Search subdirectories (default: True)
        limit: Files per page (default: 50)
        page: Page number for pagination

    Returns:
        str: List of matching files with paths, languages, and entity counts
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit parameters as strings; coerce each one.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"
        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"
        if isinstance(recursive, str):
            recursive = recursive.lower() in ('true', '1', 'yes')

        use_pattern = bool(pattern) and pattern != '*'
        scoped = directory_path.rstrip('/') if directory_path else ''

        def _in_scope(candidate_path):
            # Directory filter: prefix match when recursive, exact parent otherwise.
            if not directory_path:
                return True
            if recursive:
                return candidate_path.startswith(scoped + '/') or candidate_path == directory_path
            parent = '/'.join(candidate_path.rsplit('/', 1)[:-1]) if '/' in candidate_path else ''
            return parent == scoped

        def _name_matches(candidate_path, candidate_name):
            # Glob filter applied to the full path, the bare name, and a **/ variant.
            if not use_pattern:
                return True
            return (fnmatch.fnmatch(candidate_path, pattern)
                    or fnmatch.fnmatch(candidate_name, pattern)
                    or fnmatch.fnmatch(candidate_path, f'**/{pattern}'))

        graph = knowledge_graph.graph
        matching_files = []
        for node_key, payload in graph.nodes(data=True):
            record = payload['data']
            if getattr(record, 'node_type', None) != 'file':
                continue
            file_path = getattr(record, 'path', node_key)
            file_name = getattr(record, 'name', '')
            if not _in_scope(file_path):
                continue
            if not _name_matches(file_path, file_name):
                continue
            matching_files.append({
                'path': file_path,
                'name': file_name,
                'language': getattr(record, 'language', 'Unknown'),
                'entity_count': len(getattr(record, 'declared_entities', []))
            })

        # Deterministic ordering for stable pagination.
        matching_files.sort(key=lambda x: x['path'])

        if not matching_files:
            filter_desc = f" in '{directory_path}'" if directory_path else ""
            pattern_desc = f" matching '{pattern}'" if use_pattern else ""
            return f"No files found{filter_desc}{pattern_desc}."

        total = len(matching_files)
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} files at {limit} per page)"

        start_idx = (page - 1) * limit
        page_slice = matching_files[start_idx:start_idx + limit]

        header = "Files"
        if directory_path:
            header += f" in '{directory_path}'"
        if use_pattern:
            header += f" matching '{pattern}'"
        header += f" (Page {page}/{total_pages}, {total} total):\n"

        pieces = [header, "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"]
        for ordinal, entry in enumerate(page_slice, start=start_idx + 1):
            pieces.append(f"{ordinal}. {entry['path']}\n")
            pieces.append(f" Language: {entry['language']}, Entities: {entry['entity_count']}\n\n")
        if page < total_pages:
            pieces.append(f"Use page={page + 1} to see the next page\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def find_files_importing(module_or_entity: str, limit: int = 30, page: int = 1) -> str:
    """
    Find all files that import or use a specific module, class, or function.

    PURPOSE: Use this tool to trace import dependencies and understand which parts
    of the codebase depend on a particular module or entity.

    WHEN TO USE:
    - To find all files that import a specific module (e.g., 'torch', 'numpy')
    - To trace dependencies on a class or function
    - To understand the impact scope of a module
    - To find usage patterns of external libraries
    - For dependency analysis and impact assessment

    SEARCH BEHAVIOR:
    - Searches through 'called_entities' metadata
    - Also scans code chunks for import statement patterns
    - Matches import, from...import, require, use patterns
    - Case-insensitive matching

    WHAT YOU'LL GET:
    - List of files that import/use the specified module or entity
    - Match type (called_entity or import_statement)
    - Matched entity names when applicable

    EXAMPLE QUERIES:
    - find_files_importing('torch') -> files using PyTorch
    - find_files_importing('numpy') -> files using NumPy
    - find_files_importing('BertModel') -> files using BertModel
    - find_files_importing('attention') -> files related to attention

    LIMITATIONS:
    - May not catch all dynamic imports
    - Pattern matching may have false positives/negatives
    - For comprehensive search, combine with search_nodes

    Args:
        module_or_entity: Name of the module, class, or function to search for (case-insensitive)
        limit: Maximum results per page (default: 30)
        page: Page number for pagination

    Returns:
        str: List of files that import or use the specified module/entity, with match details
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit numeric parameters as strings; coerce first.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"
        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        graph = knowledge_graph.graph
        needle = module_or_entity.lower()
        escaped = re.escape(module_or_entity)
        # Compile the import-statement patterns once instead of per chunk.
        import_regexes = [
            re.compile(rf'import\s+.*{escaped}', re.IGNORECASE),
            re.compile(rf'from\s+.*{escaped}.*\s+import', re.IGNORECASE),
            re.compile(rf'require\s*\(\s*["\'].*{escaped}', re.IGNORECASE),
            re.compile(rf'use\s+.*{escaped}', re.IGNORECASE),
        ]
        can_read_chunks = hasattr(knowledge_graph, 'get_chunks_of_file')
        importing_files = []

        for node_key, payload in graph.nodes(data=True):
            record = payload['data']
            if getattr(record, 'node_type', None) != 'file':
                continue
            file_path = getattr(record, 'path', node_key)
            display_name = getattr(record, 'name', '')

            # First pass: look for the needle among the file's called entities.
            lowered = [
                entity.get('name', '').lower() if isinstance(entity, dict) else str(entity).lower()
                for entity in getattr(record, 'called_entities', [])
            ]
            hits = [s for s in lowered if needle in s]
            if hits:
                importing_files.append({
                    'path': file_path,
                    'name': display_name,
                    'matched_entities': hits[:5],
                    'match_type': 'called_entity'
                })
                continue

            # Second pass: scan the leading chunks (imports usually live there).
            chunks = knowledge_graph.get_chunks_of_file(file_path) if can_read_chunks else []
            for chunk in chunks[:3]:
                body = getattr(chunk, 'content', '')
                for regex in import_regexes:
                    if regex.search(body):
                        if not any(f['path'] == file_path for f in importing_files):
                            importing_files.append({
                                'path': file_path,
                                'name': display_name,
                                'matched_entities': [],
                                'match_type': 'import_statement'
                            })
                        break

        importing_files.sort(key=lambda x: x['path'])

        if not importing_files:
            return f"No files found importing '{module_or_entity}'.\n\nTip: Try searching for the module name in code content using search_nodes."

        total = len(importing_files)
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} files at {limit} per page)"

        start_idx = (page - 1) * limit
        page_slice = importing_files[start_idx:start_idx + limit]

        pieces = [
            f"Files importing '{module_or_entity}' (Page {page}/{total_pages}, {total} total):\n",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n",
        ]
        for ordinal, entry in enumerate(page_slice, start=start_idx + 1):
            pieces.append(f"{ordinal}. {entry['path']}\n")
            pieces.append(f" Match type: {entry['match_type']}\n")
            if entry['matched_entities']:
                pieces.append(f" Matched: {', '.join(entry['matched_entities'][:3])}\n")
            pieces.append("\n")
        if page < total_pages:
            pieces.append(f"Use page={page + 1} to see the next page\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_concept_overview(concept: str, limit: int = 15) -> str:
    """
    Get a high-level overview of how a concept is implemented across the Transformers codebase.

    PURPOSE: Use this tool for broad exploration of a concept or feature. Aggregates related
    classes, functions, files, and code snippets into a single comprehensive view.
    Ideal for initial investigation of a topic.

    WHEN TO USE:
    - FIRST STEP when exploring a new concept (before detailed searches)
    - To understand how a feature is implemented across the codebase
    - To discover all components related to a concept
    - To get a bird's-eye view before diving into specifics
    - When you're not sure where to start investigating

    SEARCH STRATEGY: This tool combines multiple search approaches:
    - Searches entity names (classes, functions, methods) containing the concept
    - Searches file names and paths
    - Searches chunk content and descriptions
    - Aggregates results into categorized sections

    CONCEPT EXAMPLES:
    - 'attention' -> attention mechanisms across all models
    - 'embedding' -> embedding layers and utilities
    - 'tokenizer' -> tokenization components
    - 'generation' -> text generation utilities
    - 'config' -> configuration classes
    - 'cache' -> caching mechanisms
    - 'rope' -> rotary position embeddings
    - 'flash' -> flash attention implementations

    OUTPUT STRUCTURE:
    - Related Classes: Class definitions matching the concept
    - Related Functions/Methods: Functions matching the concept
    - Related Files: Files with concept in path/name
    - Code Snippets: Relevant code chunks

    TYPICAL WORKFLOW:
    1. get_concept_overview('attention') -> see all attention-related components
    2. Identify specific classes/functions of interest
    3. go_to_definition or search_nodes for detailed exploration

    Args:
        concept: The concept to explore (e.g., 'attention', 'embedding', 'generation', 'tokenizer')
        limit: Maximum items per category (default: 15)

    Returns:
        str: Categorized overview with related classes, functions, files, and code snippets
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit numeric parameters as strings; coerce first.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"

        graph = knowledge_graph.graph
        needle = concept.lower()
        chunk_cap = limit // 2

        related_classes, related_functions = [], []
        related_files, related_chunks = [], []

        # Single pass over the graph, bucketing matches by node type.
        for node_key, payload in graph.nodes(data=True):
            record = payload['data']
            node_type = getattr(record, 'node_type', None)
            node_name = getattr(record, 'name', '')
            name_match = needle in node_name.lower()

            if node_type == 'entity':
                entity_type = getattr(record, 'entity_type', '')
                if not name_match:
                    continue
                declaring = getattr(record, 'declaring_chunk_ids', [])
                origin = declaring[0] if declaring else 'Unknown'
                kind = entity_type.lower()
                if kind == 'class' and len(related_classes) < limit:
                    related_classes.append({'name': node_name, 'id': node_key, 'file': origin})
                elif kind in ('function', 'method') and len(related_functions) < limit:
                    related_functions.append({'name': node_name, 'id': node_key, 'type': entity_type, 'file': origin})
            elif node_type == 'file' and len(related_files) < limit:
                file_path = getattr(record, 'path', '')
                if needle in file_path.lower() or name_match:
                    declared = getattr(record, 'declared_entities', [])
                    related_files.append({'path': file_path, 'name': node_name, 'entity_count': len(declared)})
            elif node_type == 'chunk' and len(related_chunks) < chunk_cap:
                content = getattr(record, 'content', '')
                description = getattr(record, 'description', '')
                if needle in content.lower() or needle in (description or '').lower():
                    related_chunks.append({'id': node_key, 'file': getattr(record, 'path', ''), 'content': content})

        total = len(related_classes) + len(related_functions) + len(related_files) + len(related_chunks)

        pieces = [
            f"Concept Overview: '{concept}'\n",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n",
            f"Found {total} related items across the codebase.\n\n",
        ]

        if related_classes:
            pieces.append(f"📦 Related Classes ({len(related_classes)}):\n")
            for cls in related_classes[:10]:
                pieces.append(f" • {cls['name']}\n")
                pieces.append(f" File: {cls['file']}\n")
            if len(related_classes) > 10:
                pieces.append(f" ... and {len(related_classes) - 10} more\n")
            pieces.append("\n")

        if related_functions:
            pieces.append(f"⚡ Related Functions/Methods ({len(related_functions)}):\n")
            for func in related_functions[:10]:
                pieces.append(f" • {func['name']} ({func['type']})\n")
                pieces.append(f" File: {func['file']}\n")
            if len(related_functions) > 10:
                pieces.append(f" ... and {len(related_functions) - 10} more\n")
            pieces.append("\n")

        if related_files:
            pieces.append(f"📄 Related Files ({len(related_files)}):\n")
            for entry in related_files[:10]:
                pieces.append(f" • {entry['path']}\n")
                pieces.append(f" Entities: {entry['entity_count']}\n")
            if len(related_files) > 10:
                pieces.append(f" ... and {len(related_files) - 10} more\n")
            pieces.append("\n")

        if related_chunks:
            pieces.append(f"📝 Code Snippets ({len(related_chunks)}):\n")
            for chunk in related_chunks[:5]:
                pieces.append(f" • {chunk['id']}\n")
                pieces.append(f" Content:\n{chunk['content']}\n\n")
            if len(related_chunks) > 5:
                pieces.append(f" ... and {len(related_chunks) - 5} more\n")

        if total == 0:
            pieces.append("No direct matches found.\n\n")
            pieces.append("Suggestions:\n")
            pieces.append(f" • Try searching with: search_nodes('{concept}')\n")
            pieces.append(f" • Try partial name: search_by_type_and_name('class', '{concept[:4]}')\n")
            pieces.append(f" • Check entity list: list_all_entities(entity_type='class')\n")

        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
def _get_header_explorer() -> str:
    # Static banner rendered at the top of the Gradio app via gr.HTML().
    # NOTE(review): the markup below appears to have lost its HTML tags in
    # transit — confirm against the deployed Space before editing the text.
    html = """
Code Knowledge Graph Explorer — 🤗 Transformers Library
Explore, query, and understand the structure of the Hugging Face Transformers codebase.
"""
    return html
# ==================== Gradio App ====================
def create_gradio_app():
    """Create and configure the Gradio interface.

    Builds one tab per tool family and wires each button to the corresponding
    module-level tool function. Returns the gr.Blocks app (launched by main()).
    """
    with gr.Blocks(title="", theme=gr.themes.Soft()) as demo:
        gr.HTML(_get_header_explorer())

        # Helper to render tool docstrings in the UI
        def _tool_doc_md(func):
            doc = (func.__doc__ or "No description available.").strip()
            # Render as a fenced code block for readability
            return f"**Description:**\n\n```\n{doc}\n```"

        gr.Markdown("""
    Understanding large codebases is essential for software engineers. This Space presents a Code Knowledge Graph MCP Server built around the Hugging Face Transformers library (4,000+ files, 400k+ lines of code). It enables LLM-based coding agents to analyze code structure, follow dependencies, and spot potential improvements. Developed initially for EPITA coding courses, these capabilities make it easier to review, navigate, and understand complex projects such as the Transformers library.
    """)

        # Whole-graph statistics.
        with gr.Tab("📊 Graph Overview"):
            stats_btn = gr.Button("Get Graph Statistics", variant="primary")
            stats_output = gr.Textbox(label="Statistics", lines=20, max_lines=30)
            stats_btn.click(fn=get_graph_stats, outputs=stats_output)
            gr.Markdown(_tool_doc_md(get_graph_stats))

        # Free-text search over the graph.
        with gr.Tab("🔎 Search"):
            with gr.Row():
                with gr.Column():
                    search_query = gr.Textbox(label="Search Query", placeholder="Enter search query...")
                    search_limit = gr.Slider(1, 50, value=10, step=1, label="Results per Page")
                    search_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    search_btn = gr.Button("Search", variant="primary")
                with gr.Column():
                    search_output = gr.Textbox(label="Search Results", lines=20, max_lines=30)
            search_btn.click(fn=search_nodes, inputs=[search_query, search_limit, search_page], outputs=search_output)
            gr.Markdown(_tool_doc_md(search_nodes))

        # Inspect a single node (metadata or edges) — both buttons share one output box.
        with gr.Tab("📝 Node Info"):
            with gr.Row():
                with gr.Column():
                    node_id_input = gr.Textbox(label="Node ID", placeholder="Enter node ID...")
                    node_info_btn = gr.Button("Get Node Info", variant="primary")
                    node_edges_btn = gr.Button("Get Node Edges", variant="secondary")
                with gr.Column():
                    node_output = gr.Textbox(label="Node Information", lines=20, max_lines=30)
            node_info_btn.click(fn=get_node_info, inputs=node_id_input, outputs=node_output)
            gr.Markdown("#Get Node Info:" + _tool_doc_md(get_node_info))
            node_edges_btn.click(fn=get_node_edges, inputs=node_id_input, outputs=node_output)
            gr.Markdown("#Get Node Edges:" + _tool_doc_md(get_node_edges))

        # Repository tree and per-file structure views.
        with gr.Tab("🏗️ Structure"):
            gr.Markdown("### Repository Tree")
            with gr.Row():
                with gr.Column():
                    tree_root = gr.Textbox(label="Root Node ID", value="root", placeholder="root")
                    tree_depth = gr.Slider(1, 10, value=3, step=1, label="Max Depth")
                    tree_btn = gr.Button("Show Tree", variant="primary")
                with gr.Column():
                    tree_output = gr.Textbox(label="Tree View", lines=20, max_lines=40)
            tree_btn.click(fn=print_tree, inputs=[tree_root, tree_depth], outputs=tree_output)
            gr.Markdown(_tool_doc_md(print_tree))
            gr.Markdown("---")
            gr.Markdown("### File Structure")
            with gr.Row():
                with gr.Column():
                    file_path_input = gr.Textbox(label="File Path", placeholder="Enter file path...")
                    file_structure_btn = gr.Button("Get File Structure", variant="primary")
                with gr.Column():
                    file_structure_output = gr.Textbox(label="File Structure", lines=20, max_lines=30)
            file_structure_btn.click(fn=get_file_structure, inputs=file_path_input, outputs=file_structure_output)
            gr.Markdown(_tool_doc_md(get_file_structure))

        # Entity listing, definitions, and usages.
        with gr.Tab("🎯 Entities"):
            gr.Markdown("### List All Entities")
            with gr.Row():
                with gr.Column():
                    entity_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    entity_limit = gr.Slider(10, 100, value=50, step=10, label="Per Page")
                    entity_type_filter = gr.Dropdown(
                        choices=["", "class", "function", "method", "variable", "parameter"],
                        label="Filter by Type (optional)",
                        value=""
                    )
                    declared_in_repo = gr.Dropdown(
                        choices=["", "true", "false"],
                        label="Declared in Repo (optional)",
                        value=""
                    )
                    called_in_repo = gr.Dropdown(
                        choices=["", "true", "false"],
                        label="Called in Repo (optional)",
                        value=""
                    )
                    list_entities_btn = gr.Button("List Entities", variant="primary")
                with gr.Column():
                    list_entities_output = gr.Textbox(label="Entities", lines=20, max_lines=30)
            list_entities_btn.click(
                fn=list_all_entities,
                inputs=[entity_limit, entity_page, entity_type_filter, declared_in_repo, called_in_repo],
                outputs=list_entities_output,
            )
            gr.Markdown(_tool_doc_md(list_all_entities))
            gr.Markdown("---")
            gr.Markdown("### Go to Definition")
            with gr.Row():
                with gr.Column():
                    entity_name_def = gr.Textbox(label="Entity Name", placeholder="Enter entity name...")
                    def_btn = gr.Button("Go to Definition", variant="primary")
                with gr.Column():
                    def_output = gr.Textbox(label="Definition", lines=15, max_lines=25)
            def_btn.click(fn=go_to_definition, inputs=entity_name_def, outputs=def_output)
            gr.Markdown(_tool_doc_md(go_to_definition))
            gr.Markdown("---")
            gr.Markdown("### Find Usages")
            with gr.Row():
                with gr.Column():
                    entity_name_usage = gr.Textbox(label="Entity Name", placeholder="Enter entity name...")
                    usage_limit = gr.Slider(1, 50, value=20, step=1, label="Results per Page")
                    usage_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    usage_btn = gr.Button("Find Usages", variant="primary")
                with gr.Column():
                    usage_output = gr.Textbox(label="Usages", lines=15, max_lines=25)
            usage_btn.click(fn=find_usages, inputs=[entity_name_usage, usage_limit, usage_page], outputs=usage_output)
            gr.Markdown(_tool_doc_md(find_usages))

        # Browse nodes by type or by partial name.
        # NOTE(review): `search_limit`/`search_page` rebind the Search-tab names;
        # harmless (handlers captured the original components) but worth renaming.
        with gr.Tab("🔬 Discovery"):
            gr.Markdown("### List Nodes by Type")
            with gr.Row():
                with gr.Column():
                    node_type_input = gr.Dropdown(
                        choices=["file", "directory", "chunk", "function", "class", "method"],
                        label="Node Type"
                    )
                    type_limit = gr.Slider(1, 100, value=20, step=1, label="Max Results")
                    type_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    type_btn = gr.Button("List Nodes", variant="primary")
                with gr.Column():
                    type_output = gr.Textbox(label="Results", lines=20, max_lines=30)
            type_btn.click(fn=list_nodes_by_type, inputs=[node_type_input, type_limit, type_page], outputs=type_output)
            gr.Markdown(_tool_doc_md(list_nodes_by_type))
            gr.Markdown("---")
            gr.Markdown("### Search by Type and Name")
            with gr.Row():
                with gr.Column():
                    search_type = gr.Dropdown(
                        choices=["file", "directory", "chunk", "function", "class", "method"],
                        label="Node Type"
                    )
                    search_name = gr.Textbox(label="Name Contains", placeholder="Enter partial name...")
                    search_limit = gr.Slider(1, 100, value=10, step=1, label="Max Results")
                    search_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    search_partial_allowed = gr.Checkbox(label="Partial Match", value=True)
                    search_type_btn = gr.Button("Search", variant="primary")
                with gr.Column():
                    search_type_output = gr.Textbox(label="Results", lines=20, max_lines=30)
            search_type_btn.click(fn=search_by_type_and_name, inputs=[search_type, search_name, search_limit, search_page, search_partial_allowed], outputs=search_type_output)
            gr.Markdown(_tool_doc_md(search_by_type_and_name))

        # Graph-relationship tools: neighbors, entity links, related chunks, paths.
        with gr.Tab("🔗 Relationships"):
            gr.Markdown("### Get Neighbors")
            with gr.Row():
                with gr.Column():
                    neighbor_node_id = gr.Textbox(label="Node ID", placeholder="Enter node ID...")
                    neighbor_limit = gr.Slider(1, 100, value=20, step=1, label="Max Results")
                    neighbor_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    neighbor_btn = gr.Button("Get Neighbors", variant="primary")
                with gr.Column():
                    neighbor_output = gr.Textbox(label="Neighbors", lines=20, max_lines=30)
            neighbor_btn.click(fn=get_neighbors, inputs=[neighbor_node_id, neighbor_limit, neighbor_page], outputs=neighbor_output)
            gr.Markdown(_tool_doc_md(get_neighbors))
            gr.Markdown("---")
            gr.Markdown("### Entity Relationships")
            with gr.Row():
                with gr.Column():
                    rel_node_id = gr.Textbox(label="Node ID", placeholder="Enter node ID...")
                    rel_btn = gr.Button("Get Relationships", variant="primary")
                with gr.Column():
                    rel_output = gr.Textbox(label="Relationships", lines=20, max_lines=30)
            rel_btn.click(fn=entity_relationships, inputs=rel_node_id, outputs=rel_output)
            gr.Markdown(_tool_doc_md(entity_relationships))
            gr.Markdown("---")
            gr.Markdown("### Get Related Chunks")
            with gr.Row():
                with gr.Column():
                    related_chunk_id = gr.Textbox(label="Chunk ID", placeholder="Enter chunk ID...")
                    relation_type = gr.Dropdown(choices=["" , "calls", "contains", "declares", "uses"], label="Relation Type", value="calls")
                    related_limit = gr.Slider(1, 100, value=20, step=1, label="Results per Page")
                    related_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    related_btn = gr.Button("Get Related Chunks", variant="primary")
                with gr.Column():
                    related_output = gr.Textbox(label="Related Chunks", lines=20, max_lines=30)
            related_btn.click(fn=get_related_chunks, inputs=[related_chunk_id, relation_type, related_limit, related_page], outputs=related_output)
            gr.Markdown(_tool_doc_md(get_related_chunks))
            gr.Markdown("---")
            gr.Markdown("### Find Path Between Nodes")
            with gr.Row():
                with gr.Column():
                    path_source = gr.Textbox(label="Source Node ID", placeholder="Enter source node ID...")
                    path_target = gr.Textbox(label="Target Node ID", placeholder="Enter target node ID...")
                    path_depth = gr.Slider(1, 10, value=5, step=1, label="Max Depth")
                    path_btn = gr.Button("Find Path", variant="primary")
                with gr.Column():
                    path_output = gr.Textbox(label="Path", lines=20, max_lines=30)
            path_btn.click(fn=find_path, inputs=[path_source, path_target, path_depth], outputs=path_output)
            gr.Markdown(_tool_doc_md(find_path))

        # Context tools: chunk context, concept overview, subgraph extraction.
        with gr.Tab("📖 Context"):
            gr.Markdown("### Get Chunk Context")
            with gr.Row():
                with gr.Column():
                    chunk_id_input = gr.Textbox(label="Chunk ID", placeholder="Enter chunk ID...")
                    context_btn = gr.Button("Get Context", variant="primary")
                with gr.Column():
                    context_output = gr.Textbox(label="Context", lines=25, max_lines=40)
            context_btn.click(fn=get_chunk_context, inputs=chunk_id_input, outputs=context_output)
            gr.Markdown(_tool_doc_md(get_chunk_context))
            gr.Markdown("---")
            gr.Markdown("### Concept Overview")
            with gr.Row():
                with gr.Column():
                    concept_input = gr.Textbox(label="Concept", placeholder="e.g., embedding, authentication...")
                    concept_btn = gr.Button("Get Overview", variant="primary")
                with gr.Column():
                    concept_output = gr.Textbox(label="Concept Overview", lines=25, max_lines=40)
            concept_btn.click(fn=get_concept_overview, inputs=concept_input, outputs=concept_output)
            gr.Markdown(_tool_doc_md(get_concept_overview))
            gr.Markdown("---")
            gr.Markdown("### Get Subgraph")
            with gr.Row():
                with gr.Column():
                    subgraph_node = gr.Textbox(label="Center Node ID", placeholder="Enter node ID...")
                    subgraph_depth = gr.Slider(1, 5, value=2, step=1, label="Depth")
                    subgraph_edge_types = gr.Textbox(label="Edge Types (comma-separated, optional)", placeholder="e.g., calls,contains")
                    subgraph_btn = gr.Button("Retrieve Subgraph", variant="primary")
                with gr.Column():
                    subgraph_output = gr.Textbox(label="Subgraph", lines=20, max_lines=30)
            subgraph_btn.click(fn=get_subgraph, inputs=[subgraph_node, subgraph_depth, subgraph_edge_types], outputs=subgraph_output)
            gr.Markdown(_tool_doc_md(get_subgraph))

        # File browsing, import tracing, per-file statistics.
        # NOTE(review): `stats_btn`/`stats_output` rebind the Overview-tab names;
        # harmless (handlers captured the original components) but worth renaming.
        with gr.Tab("📁 Files"):
            gr.Markdown("### List Files in Directory")
            with gr.Row():
                with gr.Column():
                    dir_path = gr.Textbox(label="Directory Path (empty for root)", placeholder="e.g., src/")
                    file_pattern = gr.Textbox(label="Pattern", value="*", placeholder="e.g., *.py")
                    file_recursive = gr.Checkbox(label="Recursive", value=True)
                    file_limit = gr.Slider(10, 100, value=50, step=10, label="Results per Page")
                    file_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    list_files_btn = gr.Button("List Files", variant="primary")
                with gr.Column():
                    list_files_output = gr.Textbox(label="Files", lines=20, max_lines=30)
            list_files_btn.click(fn=list_files_in_directory, inputs=[dir_path, file_pattern, file_recursive, file_limit, file_page], outputs=list_files_output)
            gr.Markdown(_tool_doc_md(list_files_in_directory))
            gr.Markdown("---")
            gr.Markdown("### Find Files Importing")
            with gr.Row():
                with gr.Column():
                    import_module = gr.Textbox(label="Module/Entity Name", placeholder="e.g., torch, numpy...")
                    import_limit = gr.Slider(10, 50, value=30, step=5, label="Results per Page")
                    import_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    find_imports_btn = gr.Button("Find Files", variant="primary")
                with gr.Column():
                    find_imports_output = gr.Textbox(label="Importing Files", lines=20, max_lines=30)
            find_imports_btn.click(fn=find_files_importing, inputs=[import_module, import_limit, import_page], outputs=find_imports_output)
            gr.Markdown(_tool_doc_md(find_files_importing))
            gr.Markdown("---")
            gr.Markdown("### Get File Stats")
            with gr.Row():
                with gr.Column():
                    stats_path = gr.Textbox(label="File Path", placeholder="Enter file path...")
                    stats_btn = gr.Button("Get Stats", variant="primary")
                with gr.Column():
                    stats_output = gr.Textbox(label="Statistics", lines=20, max_lines=30)
            stats_btn.click(fn=get_file_stats, inputs=stats_path, outputs=stats_output)
            gr.Markdown(_tool_doc_md(get_file_stats))

        # Diff two chunks/nodes side by side.
        with gr.Tab("🔍 Analysis"):
            gr.Markdown("### Diff Chunks")
            with gr.Row():
                with gr.Column():
                    diff_node1 = gr.Textbox(label="First Node ID", placeholder="Enter first node ID...")
                    diff_node2 = gr.Textbox(label="Second Node ID", placeholder="Enter second node ID...")
                    diff_btn = gr.Button("Show Diff", variant="primary")
                with gr.Column():
                    diff_output = gr.Textbox(label="Diff Output", lines=25, max_lines=40)
            diff_btn.click(fn=diff_chunks, inputs=[diff_node1, diff_node2], outputs=diff_output)
            gr.Markdown(_tool_doc_md(diff_chunks))

    return demo
def main():
    """Parse CLI arguments, load the knowledge graph, and launch the Gradio MCP server.

    Raises:
        SystemExit: via ``parser.error`` when no dataset is supplied either on
            the command line or through the HF_DATASET environment variable.
    """
    parser = argparse.ArgumentParser(description="Knowledge Graph MCP Server from HuggingFace Dataset")
    # Required argument (may also come from the HF_DATASET env var)
    parser.add_argument("--hf-dataset", type=str, default=os.environ.get("HF_DATASET"),
                        help="HuggingFace dataset repo ID (e.g., 'username/dataset-name')")
    # Optional HuggingFace auth (falls back to HF_TOKEN env var)
    parser.add_argument("--hf-token", type=str, default=os.environ.get("HF_TOKEN"),
                        help="HuggingFace API token for private datasets (or set HF_TOKEN env var)")
    # Server settings
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
    parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
    parser.add_argument("--share", action="store_true", help="Create a public link")
    # Index settings
    parser.add_argument("--no-index", action="store_true", help="Skip indexing nodes")
    parser.add_argument("--code-index-type", type=str, default="keyword-only",
                        choices=["keyword-only", "embedding-only", "hybrid"],
                        help="Type of code index to use")
    parser.add_argument("--code-index-backend", type=str, default="lancedb",
                        choices=["lancedb", "weaviate"],
                        help="Backend for code index")
    args = parser.parse_args()

    # Fail fast with a clear message instead of crashing later with
    # repo_id=None deep inside the dataset loader.
    if not args.hf_dataset:
        parser.error("--hf-dataset is required (or set the HF_DATASET environment variable)")

    # Build code_index_kwargs; embeddings are only needed for non-keyword indexes.
    code_index_kwargs = {
        "index_type": args.code_index_type,
        "backend": args.code_index_backend,
        "use_embed": args.code_index_type != "keyword-only",
    }

    # Initialize knowledge graph (populates the module-level global).
    print("Initializing knowledge graph from HuggingFace dataset...")
    initialize_knowledge_graph(
        hf_dataset=args.hf_dataset,
        hf_token=args.hf_token,
        index_nodes=not args.no_index,
        code_index_kwargs=code_index_kwargs
    )
    print("Knowledge graph initialized!")

    # Create and launch app; mcp_server=True also exposes the tools over MCP.
    demo = create_gradio_app()
    demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        mcp_server=True
    )


if __name__ == "__main__":
    main()