""" Simplified Gradio MCP Server for Knowledge Graphs loaded from HuggingFace datasets. """ import os import sys import argparse import difflib import fnmatch import re from typing import Optional, List import gradio as gr from RepoKnowledgeGraphLib.utils.chunk_utils import ( organize_chunks_by_file_name, join_organized_chunks ) # Optional Langfuse integration try: from langfuse import get_client, observe langfuse = get_client() LANGFUSE_ENABLED = langfuse.auth_check() if LANGFUSE_ENABLED: print("✓ Langfuse client is authenticated and ready!") else: print("⚠️ Langfuse authentication failed. Tracing disabled.") except Exception as e: print(f"⚠️ Langfuse not available: {e}. Tracing disabled.") LANGFUSE_ENABLED = False def observe(*args, **kwargs): def decorator(func): return func return decorator def _sanitize_value(v): if isinstance(v, str): return v.strip() if isinstance(v, dict): return {k: _sanitize_value(val) for k, val in v.items()} if isinstance(v, (list, tuple)): t = type(v) return t(_sanitize_value(x) for x in v) return v def sanitize_inputs(func): """Decorator that trims whitespace from all string args/kwargs before calling func.""" def wrapper(*args, **kwargs): new_args = tuple(_sanitize_value(a) for a in args) new_kwargs = {k: _sanitize_value(v) for k, v in kwargs.items()} return func(*new_args, **new_kwargs) # preserve original attributes try: wrapper.__name__ = func.__name__ wrapper.__doc__ = func.__doc__ except Exception: pass return wrapper # Wrap the existing `observe` decorator (from langfuse or fallback) so that # all observed tools receive sanitized inputs automatically. This avoids # having to manually add `@sanitize_inputs` above every `@observe`. try: _original_observe = observe def _observe_with_sanitize(*o_args, **o_kwargs): def decorator(f): return _original_observe(*o_args, **o_kwargs)(sanitize_inputs(f)) return decorator observe = _observe_with_sanitize except Exception: # If anything goes wrong, keep the existing observe as-is. 
pass # Add parent directory to path sys.path.insert(0, os.path.join(os.path.dirname(__file__), 'RepoKnowledgeGraphLib')) from RepoKnowledgeGraphLib.RepoKnowledgeGraph import RepoKnowledgeGraph # Global knowledge graph instance knowledge_graph = None def initialize_knowledge_graph( hf_dataset: str, hf_token: Optional[str] = None, index_nodes: bool = True, code_index_kwargs: Optional[dict] = None ): """Initialize the knowledge graph from a HuggingFace dataset.""" global knowledge_graph model_service_kwargs = { "embedder_type": "sentence-transformers", "embed_model_name": "Salesforce/SFR-Embedding-Code-400M_R", } print(f"Loading knowledge graph from HuggingFace dataset: {hf_dataset}") knowledge_graph = RepoKnowledgeGraph.from_hf_dataset( repo_id=hf_dataset, index_nodes=index_nodes, model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs, token=hf_token ) # ==================== Tool Functions ==================== @observe(as_type="tool") def get_node_info(node_id: str) -> str: """ Retrieve comprehensive details about any node in the Transformers library knowledge graph. PURPOSE: Use this tool to inspect the full metadata and content of a specific node when you need to understand what a particular code element contains, what entities it declares or calls, and how it fits into the codebase structure. 
    WHEN TO USE:
    - After finding a node ID from search_nodes, list_nodes_by_type, or get_neighbors
    - To see the actual code content of a chunk node
    - To understand what entities (classes, functions, variables) are declared in a file or chunk
    - To examine entity metadata including aliases, declaration locations, and usage locations
    - To get file metadata like language and path information

    NODE TYPES SUPPORTED:
    - 'chunk': Code segments with content, declared/called entities, and file position
    - 'file': Source files with path, language, and entity summaries
    - 'directory': Folder nodes with path information
    - 'entity': Programming constructs (classes, functions, methods, variables) with declaration/usage tracking
    - 'repo': Repository root node

    TYPICAL WORKFLOW:
    1. search_nodes("attention mechanism") -> get node IDs
    2. get_node_info(node_id) -> see full content and metadata
    3. get_neighbors(node_id) or find_usages(entity_name) -> explore relationships

    Args:
        node_id: The unique identifier of the node
                 (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_3' for chunks,
                 or 'BertModel' for entities)

    Returns:
        str: Formatted details including node type, name, description, content (for chunks),
             declared entities, called entities, and type-specific metadata

    Example node_ids:
    - Chunk: 'src/transformers/models/bert/modeling_bert.py::chunk_5'
    - File: 'src/transformers/models/bert/modeling_bert.py'
    - Entity: 'BertModel', 'forward', 'attention_mask'
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        # Node payloads are stored under the 'data' attribute of each graph node.
        node = knowledge_graph.graph.nodes[node_id]['data']
        node_type = getattr(node, 'node_type', 'Unknown')
        node_class = node.__class__.__name__
        node_name = getattr(node, 'name', 'Unknown')
        description = getattr(node, 'description', None)

        result = f"Node Information:\n━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n"
        result += f"Node ID: {node_id}\nClass: {node_class}\nName: {node_name}\nType: {node_type}\n"
        result += f"Description: {description or 'N/A'}\n"

        if node_class == 'EntityNode' or node_type == 'entity':
            # Entity nodes: report declaration/usage chunk lists (capped at 5 each).
            entity_type = getattr(node, 'entity_type', 'Unknown')
            declaring_chunk_ids = getattr(node, 'declaring_chunk_ids', [])
            calling_chunk_ids = getattr(node, 'calling_chunk_ids', [])
            aliases = getattr(node, 'aliases', [])
            result += f"\nEntity Type: {entity_type}\n"
            result += f"Aliases: {', '.join(aliases) if aliases else 'None'}\n"
            result += f"Declared in {len(declaring_chunk_ids)} chunk(s):\n"
            for cid in declaring_chunk_ids[:5]:
                result += f" - {cid}\n"
            if len(declaring_chunk_ids) > 5:
                result += f" ... and {len(declaring_chunk_ids) - 5} more\n"
            result += f"Called in {len(calling_chunk_ids)} chunk(s):\n"
            for cid in calling_chunk_ids[:5]:
                result += f" - {cid}\n"
            if len(calling_chunk_ids) > 5:
                result += f" ... and {len(calling_chunk_ids) - 5} more\n"
            result += f"\nSummary: Entity {node_id} ({node_name}) — {entity_type} declared in {len(declaring_chunk_ids)} chunk(s) and called in {len(calling_chunk_ids)} chunk(s).\n"
        else:
            # Non-entity nodes (chunk/file/directory/repo): list declared and
            # called entities (capped at 10 each).
            declared_entities = getattr(node, 'declared_entities', [])
            called_entities = getattr(node, 'called_entities', [])
            result += f"\nDeclared Entities ({len(declared_entities)}):\n"
            for entity in declared_entities[:10]:
                result += f" - {entity}\n"
            if len(declared_entities) > 10:
                result += f" ... and {len(declared_entities) - 10} more\n"
            result += f"\nCalled Entities ({len(called_entities)}):\n"
            for entity in called_entities[:10]:
                result += f" - {entity}\n"
            if len(called_entities) > 10:
                result += f" ... and {len(called_entities) - 10} more\n"

            # Add content preview for file/chunk nodes
            if node_type in ['file', 'chunk']:
                content = getattr(node, 'content', None)
                result += f"\nContent:\n{content or 'N/A'}\n"
                if hasattr(node, 'path'):
                    result += f"Path: {node.path}\n"
                if hasattr(node, 'language'):
                    result += f"Language: {node.language}\n"
                if node_type == 'chunk' and hasattr(node, 'order_in_file'):
                    result += f"Order in File: {node.order_in_file}\n"
            elif node_type == 'directory':
                if hasattr(node, 'path'):
                    result += f"Path: {node.path}\n"

            # NOTE(review): this summary uses declared_entities/called_entities,
            # which only exist in this branch — it must stay inside the else so
            # entity nodes (which print their own summary above) never reach it.
            result += f"\nSummary: Node {node_id} ({node_name}) — {node_type} with {len(declared_entities)} declared and {len(called_entities)} called entities.\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def get_node_edges(node_id: str) -> str:
    """
    List all graph edges (relationships) connected to a specific node in the knowledge graph.

    PURPOSE:
    Use this tool to understand how a node is connected to other parts of the codebase.
    Reveals the dependency structure and relationships that link code elements together.
    WHEN TO USE:
    - To discover what code calls or depends on a specific function/class
    - To find parent-child relationships (e.g., which file contains a chunk)
    - To trace declaration and usage patterns through the codebase
    - To understand the connectivity of an entity in the dependency graph
    - When you need a raw view of all relationships without filtering

    EDGE TYPES YOU'LL SEE:
    - 'contains': Parent-child (file→chunk, directory→file, repo→directory)
    - 'calls': Entity usage relationships (chunk→entity it calls)
    - 'declares': Entity declaration relationships (chunk→entity it defines)

    DIRECTION MEANINGS:
    - Incoming edges (←): Other nodes pointing TO this node (e.g., "who calls me?")
    - Outgoing edges (→): This node pointing TO others (e.g., "what do I call?")

    COMPARISON WITH get_neighbors:
    - get_node_edges: Shows edge metadata and direction, raw relationship view
    - get_neighbors: Shows neighboring node details, easier for exploration

    Args:
        node_id: The unique identifier of the node to inspect edges for

    Returns:
        str: List of incoming and outgoing edges with source/target node IDs
             and relationship types
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        g = knowledge_graph.graph
        # Directed edge views; 'relation' is the edge label set at graph-build time.
        incoming = [
            {"source": src, "target": tgt, "relation": data.get("relation", "?")}
            for src, tgt, data in g.in_edges(node_id, data=True)
        ]
        outgoing = [
            {"source": src, "target": tgt, "relation": data.get("relation", "?")}
            for src, tgt, data in g.out_edges(node_id, data=True)
        ]

        result = f"""Node Edges for '{node_id}':
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Incoming Edges ({len(incoming)}):
"""
        # Cap the listing at 20 edges per direction to keep output readable.
        for edge in incoming[:20]:
            result += f" ← {edge['source']} [{edge['relation']}]\n"
        if len(incoming) > 20:
            result += f" ... and {len(incoming) - 20} more\n"

        result += f"\nOutgoing Edges ({len(outgoing)}):\n"
        for edge in outgoing[:20]:
            result += f" → {edge['target']} [{edge['relation']}]\n"
        if len(outgoing) > 20:
            result += f" ... and {len(outgoing) - 20} more\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def search_nodes(query: str, limit: int = 10, page: int = 1) -> str:
    """
    Search the Transformers codebase using keyword matching against code content and metadata.

    PURPOSE:
    This is your PRIMARY SEARCH TOOL for exploring the codebase. Use it to find relevant
    code chunks based on natural language queries, function names, class names, comments,
    or any text that might appear in the source code.

    WHEN TO USE:
    - FIRST STEP when investigating any topic in the Transformers library
    - To find implementations of specific features (e.g., "rotary embeddings", "flash attention")
    - To locate code by function/class name when you don't have the exact node ID
    - To discover code related to a concept (e.g., "gradient checkpointing", "tokenization")
    - When you don't know where something is implemented

    SEARCH TIPS:
    - Use specific technical terms: "rope embedding" rather than just "embedding"
    - Include class/function names if known: "BertSelfAttention forward"
    - Try multiple related queries if first results aren't satisfactory
    - Results are ranked by relevance to your query

    TYPICAL WORKFLOW:
    1. search_nodes("attention mask handling") -> find relevant chunks
    2. get_node_info(chunk_id) -> examine the code content
    3. get_chunk_context(chunk_id) -> see surrounding code for fuller picture
    4. go_to_definition(entity_name) -> find where an entity is defined

    Args:
        query: Search terms to match against code content. Can be natural language,
               function names, class names, or code snippets. More specific queries
               yield better results.
        limit: Results per page (default: 10, max recommended: 50). Use smaller
               limits for faster responses.
        page: Page number starting from 1.
              Use pagination to browse through many results.

    Returns:
        str: Ranked list of matching code chunks with IDs and content previews.
             Use the returned IDs with get_node_info or get_chunk_context for full details.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Convert limit to int if it's a string (MCP may pass strings)
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"

        # Convert page to int if it's a string
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        # Fetch more results to support pagination: the index is queried for
        # everything up to the requested page, then sliced locally.
        max_fetch = limit * page
        results = knowledge_graph.code_index.query(query, n_results=max_fetch)
        # Query results follow the nested [[...]] layout (one inner list per query).
        metadatas = results.get("metadatas", [[]])[0]

        if not metadatas:
            return f"No results found for '{query}'."

        # NOTE(review): 'total' counts only the fetched results (capped at
        # limit*page), not everything in the index.
        total = len(metadatas)

        # Pagination
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} results at {limit} per page)"

        start_idx = (page - 1) * limit
        end_idx = start_idx + limit
        page_slice = metadatas[start_idx:end_idx]

        result = f"Search Results for '{query}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        for i, res in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. ID: {res.get('id', 'N/A')}\n"
            content = res.get('content', '')
            if content:
                result += f" Content: {content}\n"
            result += "\n"

        # Pagination hint
        if page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def get_graph_stats() -> str:
    """
    Get a comprehensive statistical overview of the Transformers library knowledge graph.

    PURPOSE:
    Use this tool to understand the scope and structure of the knowledge graph. Provides
    counts and breakdowns of all node types, entity types, and relationship types.

    WHEN TO USE:
    - At the START of an exploration session to understand the codebase scope
    - To learn what types of entities and relationships are available for querying
    - To understand the terminology used in this knowledge graph (chunks, entities, edges)
    - When you need to report on the overall structure of the Transformers library

    WHAT YOU'LL LEARN:
    - Total number of nodes and edges in the graph
    - Breakdown of node types (chunks, files, directories, entities)
    - Entity type distribution (classes, functions, methods, variables, etc.)
    - Edge relationship types (contains, calls, declares)
    - Definitions of key concepts used throughout the tools

    GRAPH TERMINOLOGY:
    - Chunks: Logical code segments (a function body, a class definition, etc.)
    - Entities: Named programming constructs tracked across the codebase
    - Edges: Relationships connecting nodes (contains, calls, declares)

    Returns:
        str: Detailed statistics including node counts by type, entity breakdown,
             edge relation counts, and concept definitions to help you use other
             tools effectively.
""" if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: g = knowledge_graph.graph num_nodes = g.number_of_nodes() num_edges = g.number_of_edges() # Count node types node_types = {} entity_breakdown = {} for _, node_attrs in g.nodes(data=True): node_type = getattr(node_attrs['data'], 'node_type', 'Unknown') node_types[node_type] = node_types.get(node_type, 0) + 1 # For entity nodes, get entity_type breakdown if node_type == 'entity': entity_type = getattr(node_attrs['data'], 'entity_type', 'Unknown') # Fallback: if entity_type is empty, check entities dictionary if not entity_type: node_id = node_attrs['data'].id if hasattr(node_attrs['data'], 'id') else None if node_id and node_id in knowledge_graph.entities: entity_types = knowledge_graph.entities[node_id].get('type', []) entity_type = entity_types[0] if entity_types else 'Unknown' entity_breakdown[entity_type] = entity_breakdown.get(entity_type, 0) + 1 # Count edge relations edge_relations = {} for _, _, attrs in g.edges(data=True): relation = attrs.get('relation', 'Unknown') edge_relations[relation] = edge_relations.get(relation, 0) + 1 # Build result result = f"""Knowledge Graph Statistics: ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 📊 Overview: Total Nodes: {num_nodes:,} Total Edges: {num_edges:,} ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 📦 Node Types: """ # Sort node types by count for ntype, count in sorted(node_types.items(), key=lambda x: x[1], reverse=True): result += f" • {ntype}: {count:,}\n" # If this is entity type, show breakdown if ntype == 'entity' and entity_breakdown: result += f" └─ Entity Breakdown:\n" for etype, ecount in sorted(entity_breakdown.items(), key=lambda x: x[1], reverse=True): percentage = (ecount / count * 100) if count > 0 else 0 result += f" ├─ {etype}: {ecount:,} ({percentage:.1f}%)\n" result += f""" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 🔗 Edge Relations: """ for relation, count in 
sorted(edge_relations.items(), key=lambda x: x[1], reverse=True): result += f" • {relation}: {count:,}\n" # Add explanation section result += f""" ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ ℹ️ Definitions: Chunks: Code segments representing logical portions of files. Each chunk contains a section of code (like a function, class, or code block) along with metadata about what entities it declares and calls. Entities: Programming constructs identified in the code including: - Classes: Class definitions - Functions: Function definitions - Methods: Class method definitions - Variables: Variable declarations - Parameters: Function/method parameters - Function_call/Method_call: Usage references Files: Source code files in the repository Directories: Folder structure containing files Repo: Root repository node Edge Relations: - contains: Parent-child relationships (file contains chunks) - declares: Entity declaration relationships - calls: Entity usage/invocation relationships """ return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def list_nodes_by_type(node_type: str, limit: int = 20, page: int = 1) -> str: """ List all nodes of a specific type in the Transformers knowledge graph with pagination. PURPOSE: Use this tool to browse and discover nodes by their type. Helpful when you want to see what classes, functions, files, or other constructs exist in the codebase. 
    WHEN TO USE:
    - To get a list of all classes in the Transformers library: node_type='class'
    - To see all Python files: node_type='file'
    - To list all functions: node_type='function'
    - To browse all methods: node_type='method'
    - When you need to find node IDs for further exploration

    VALID node_type VALUES:
    For entities (programming constructs):
    - 'class': Class definitions (e.g., BertModel, GPT2LMHeadModel)
    - 'function': Standalone function definitions
    - 'method': Class method definitions
    - 'variable': Variable declarations
    - 'parameter': Function/method parameters

    For structural nodes:
    - 'file': Source code files
    - 'chunk': Code segments within files
    - 'directory': Folder structure nodes
    - 'repo': Repository root (typically one)

    COMPARISON WITH search_by_type_and_name:
    - list_nodes_by_type: Browse ALL nodes of a type (no name filter)
    - search_by_type_and_name: Filter by type AND search by name substring

    Args:
        node_type: The type to filter by. Use lowercase: 'class', 'function',
                   'method', 'file', 'chunk', 'directory'
        limit: Maximum results per page (default: 20). Increase for broader browsing.
        page: Page number starting from 1 for pagination through large result sets.

    Returns:
        str: Alphabetically sorted list of matching nodes with their IDs and types.
             Use IDs with get_node_info for details.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Convert limit/page to int if they're strings (MCP/Gradio may pass strings)
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        g = knowledge_graph.graph
        matching_nodes = []
        # Full scan of the graph; entity nodes match on entity_type (case-insensitive),
        # all other nodes match on node_type exactly.
        for node_id, data in g.nodes(data=True):
            node = data['data']
            current_node_type = getattr(node, 'node_type', None)
            node_name = getattr(node, 'name', 'Unknown')

            # For entity nodes, check entity_type instead of node_type
            if current_node_type == 'entity':
                entity_type = getattr(node, 'entity_type', '')
                # Fallback: if entity_type is empty, check the entities dictionary
                if not entity_type and node_id in knowledge_graph.entities:
                    entity_types = knowledge_graph.entities[node_id].get('type', [])
                    entity_type = entity_types[0] if entity_types else ''
                if entity_type and entity_type.lower() == node_type.lower():
                    matching_nodes.append({
                        "id": node_id,
                        "name": node_name,
                        "type": f"entity ({entity_type})"
                    })
            # For other nodes, check node_type directly
            elif current_node_type == node_type:
                matching_nodes.append({
                    "id": node_id,
                    "name": node_name,
                    "type": current_node_type
                })

        # Sort by name for consistent ordering
        matching_nodes.sort(key=lambda x: x['name'].lower())

        total = len(matching_nodes)
        if total == 0:
            return f"No nodes found of type '{node_type}'."

        # Pagination
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} nodes at {limit} per page)"

        start_idx = (page - 1) * limit
        end_idx = start_idx + limit
        page_slice = matching_nodes[start_idx:end_idx]

        result = f"Nodes of type '{node_type}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        for i, node in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. {node['name']}\n"
            result += f" ID: {node['id']}\n"
            result += f" Type: {node['type']}\n\n"

        # Pagination hint
        if page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def get_neighbors(node_id: str, limit: int = 20, page: int = 1) -> str:
    """
    Get all nodes directly connected to a given node with their relationship information.

    PURPOSE:
    Use this tool to explore the local neighborhood of any node in the knowledge graph.
    Shows what's connected to a node and how, making it easy to navigate the codebase
    structure.

    WHEN TO USE:
    - To explore what a node is connected to (files, chunks, entities)
    - To navigate from one code element to related elements
    - To understand the local structure around a specific node
    - After using get_node_info when you want to explore connected nodes
    - To discover related code without knowing exact names

    WHAT YOU'LL SEE:
    - Neighbor node IDs and names
    - Node types (chunk, file, entity, etc.)
    - Relationship direction (→ outgoing, ← incoming)
    - Relationship type (contains, calls, declares)

    TYPICAL NAVIGATION PATTERNS:
    - From a file: see its chunks and declared entities
    - From a chunk: see entities it declares/calls and its parent file
    - From an entity: see chunks that declare or call it
    - From a directory: see contained files and subdirectories

    COMPARISON WITH get_node_edges:
    - get_neighbors: Shows neighboring NODE details (name, type) - better for exploration
    - get_node_edges: Shows raw EDGE information - better for understanding relationships

    Args:
        node_id: The ID of the node to explore neighbors for
        limit: Maximum neighbors to return per page (default: 20)
        page: Page number for pagination when node has many connections

    Returns:
        str: List of connected nodes with their IDs, names, types, and the
             relationships connecting them
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        if node_id not in knowledge_graph.graph:
            return f"Error: Node '{node_id}' not found in knowledge graph"

        # Convert limit/page to int if they're strings
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"

        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        neighbors = knowledge_graph.get_neighbors(node_id)
        if not neighbors:
            return f"No neighbors found for node '{node_id}'"

        total = len(neighbors)

        # Pagination
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} neighbors at {limit} per page)"

        start_idx = (page - 1) * limit
        end_idx = start_idx + limit
        page_slice = neighbors[start_idx:end_idx]

        result = f"Neighbors of '{node_id}' (Page {page}/{total_pages}, {total} total):\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"

        for i, neighbor in enumerate(page_slice, start=start_idx + 1):
            result += f"{i}. {neighbor.id}\n"
            result += f" Name: {getattr(neighbor, 'name', 'Unknown')}\n"
            result += f" Type: {neighbor.node_type}\n"
            # Determine edge direction relative to the queried node: → means the
            # queried node points at the neighbor, ← means the neighbor points back.
            if knowledge_graph.graph.has_edge(node_id, neighbor.id):
                edge_data = knowledge_graph.graph.get_edge_data(node_id, neighbor.id)
                result += f" → Relation: {edge_data.get('relation', 'Unknown')}\n"
            elif knowledge_graph.graph.has_edge(neighbor.id, node_id):
                edge_data = knowledge_graph.graph.get_edge_data(neighbor.id, node_id)
                result += f" ← Relation: {edge_data.get('relation', 'Unknown')}\n"
            result += "\n"

        # Pagination hint
        if page < total_pages:
            result += f"Use page={page + 1} to see the next page\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def go_to_definition(entity_name: str) -> str:
    """
    Jump to the source code location(s) where an entity is defined/declared.

    PURPOSE:
    Use this tool to find WHERE in the codebase a class, function, method, or variable
    is defined. Returns the actual code content of the definition along with file location.
    WHEN TO USE:
    - To see the implementation of a class like 'BertModel' or 'GPT2Attention'
    - To find where a function is defined when you know its name
    - To examine the source code of any entity found through search or listing
    - When you need to understand HOW something is implemented (not just WHERE it's used)
    - To get the actual code definition for analysis or explanation

    WHAT YOU'LL GET:
    - Entity type (class, function, method, variable)
    - Data type if available
    - List of all locations where the entity is declared (some entities may be
      defined in multiple places)
    - For each location: file path, chunk order, and FULL CODE CONTENT

    TYPICAL WORKFLOW:
    1. search_nodes("attention") -> find entity names
    2. go_to_definition("BertSelfAttention") -> see the class implementation
    3. find_usages("BertSelfAttention") -> see where it's used

    COMPARISON WITH find_usages:
    - go_to_definition: Shows WHERE entity is DEFINED (the implementation)
    - find_usages: Shows WHERE entity is USED/CALLED (the consumers)

    Args:
        entity_name: Exact name of the entity (case-sensitive). Examples: 'BertModel',
                     'forward', 'attention_mask', 'get_extended_attention_mask'

    Returns:
        str: Entity type, file location(s), and complete source code of the definition(s).
             Returns error if entity not found.
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"

    try:
        # Entities are looked up by exact (case-sensitive) name in the registry.
        if entity_name not in knowledge_graph.entities:
            return f"Error: Entity '{entity_name}' not found in knowledge graph"

        entity_info = knowledge_graph.entities[entity_name]
        declaring_chunks = entity_info.get('declaring_chunk_ids', [])

        if not declaring_chunks:
            return f"Entity '{entity_name}' found but no declarations identified."

        result = f"Definition(s) for '{entity_name}':\n"
        result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"
        result += f"Type: {', '.join(entity_info.get('type', ['Unknown']))}\n"
        if entity_info.get('dtype'):
            result += f"Data Type: {entity_info['dtype']}\n"
        result += f"\nDeclared in {len(declaring_chunks)} location(s):\n\n"

        # Show at most the first 5 declaration sites in full; stale chunk ids
        # (no longer in the graph) are silently skipped.
        for i, chunk_id in enumerate(declaring_chunks[:5], 1):
            if chunk_id in knowledge_graph.graph:
                chunk = knowledge_graph.graph.nodes[chunk_id]['data']
                result += f"{i}. Chunk: {chunk_id}\n"
                result += f" File: {chunk.path}\n"
                result += f" Order: {chunk.order_in_file}\n"
                result += f" Content:\n{chunk.content}\n\n"

        if len(declaring_chunks) > 5:
            result += f"... and {len(declaring_chunks) - 5} more locations\n"

        return result
    except Exception as e:
        return f"Error: {str(e)}"


@observe(as_type="tool")
def find_usages(entity_name: str, limit: int = 20, page: int = 1) -> str:
    """
    Find all locations in the codebase where an entity is used or called.

    PURPOSE:
    Use this tool to understand the impact and usage patterns of any entity. Shows every
    place where a class is instantiated, a function is called, or a variable is referenced
    throughout the Transformers library.

    WHEN TO USE:
    - To understand how widely used a class or function is
    - To see usage examples of a particular API or function
    - To assess the impact of changing an entity (who depends on it?)
    - To learn how to use a class/function by seeing real examples
    - To trace data flow through the codebase

    WHAT YOU'LL GET:
    - Total count of usage locations
    - For each usage: file path, chunk position, and full code context showing the usage
    - Paginated results for entities with many usages

    TYPICAL WORKFLOWS:
    Impact Analysis:
    1. go_to_definition("deprecated_function") -> understand what it does
    2. find_usages("deprecated_function") -> see all code that needs updating

    Learning by Example:
    1. list_nodes_by_type('class') -> find interesting classes
    2.
find_usages("BertModel") -> see how it's instantiated and used COMPARISON WITH go_to_definition: - find_usages: WHERE is this entity CALLED/USED (consumers) - go_to_definition: WHERE is this entity DEFINED (implementation) Args: entity_name: Exact name of the entity to find usages for (case-sensitive) limit: Usages per page (default: 20). Many popular classes have 100+ usages. page: Page number for pagination (starts at 1) Returns: str: List of code chunks that use this entity, with file paths and full code content showing the usage in context """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: # Convert limit to int if it's a string (MCP may pass strings) if isinstance(limit, str): try: limit = int(limit) except ValueError: return f"Error: 'limit' must be an integer, got '{limit}'" # Convert page to int if it's a string if isinstance(page, str): try: page = int(page) except ValueError: return f"Error: 'page' must be an integer, got '{page}'" if entity_name not in knowledge_graph.entities: return f"Error: Entity '{entity_name}' not found in knowledge graph" if limit <= 0: return "Error: limit must be a positive integer" if page < 1: return "Error: 'page' must be a positive integer (1 or greater)" entity_info = knowledge_graph.entities[entity_name] calling_chunks = entity_info.get('calling_chunk_ids', []) if not calling_chunks: return f"Entity '{entity_name}' found but no usages identified." total = len(calling_chunks) # Pagination total_pages = (total + limit - 1) // limit if page > total_pages: return f"Error: Page {page} does not exist. 
Total pages: {total_pages} (with {total} usages at {limit} per page)" start_idx = (page - 1) * limit end_idx = start_idx + limit page_slice = calling_chunks[start_idx:end_idx] result = f"Usages of '{entity_name}' (Page {page}/{total_pages}, {total} total):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for i, chunk_id in enumerate(page_slice, start=start_idx + 1): if chunk_id in knowledge_graph.graph: chunk = knowledge_graph.graph.nodes[chunk_id]['data'] result += f"{i}. {chunk.path} (chunk {chunk.order_in_file})\n" result += f" Content:\n{chunk.content}\n\n" # Pagination hint if page < total_pages: result += f"Use page={page + 1} to see the next page\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def get_file_structure(file_path: str) -> str: """ Get a structural overview of a source file showing its chunks and declared entities. PURPOSE: Use this tool to understand the organization of a specific file. Shows what classes, functions, and other entities are defined in the file, plus how the file is divided into chunks. WHEN TO USE: - To get a table of contents for a file before diving into specifics - To see what classes and functions a file defines - To understand how code is organized within a file - To find chunk IDs for further exploration with get_node_info or get_chunk_context - When you know the file path but need to understand its contents WHAT YOU'LL SEE: - File path and detected programming language - Total number of code chunks in the file - List of declared entities (classes, functions, methods, variables) with their types - Ordered list of chunks with their IDs and descriptions HOW TO GET FILE PATHS: - Use list_files_in_directory() to browse files - Use search_nodes() and look at file paths in results - Use list_nodes_by_type('file') to get file node IDs (which are the paths) TYPICAL WORKFLOW: 1. list_files_in_directory('src/transformers/models/bert') -> find files 2. 
get_file_structure('src/transformers/models/bert/modeling_bert.py') -> see structure 3. get_node_info(chunk_id) -> examine specific code chunks Args: file_path: The full path to the file (e.g., 'src/transformers/models/bert/modeling_bert.py'). Must match exactly as stored in the knowledge graph. Returns: str: File overview including language, chunk count, declared entities list, and chunk descriptions """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if file_path not in knowledge_graph.graph: return f"Error: File '{file_path}' not found in knowledge graph" file_node = knowledge_graph.graph.nodes[file_path]['data'] chunks = knowledge_graph.get_chunks_of_file(file_path) result = f"File Structure: {file_node.name}\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" result += f"Path: {file_path}\n" result += f"Language: {getattr(file_node, 'language', 'Unknown')}\n" result += f"Total Chunks: {len(chunks)}\n\n" if hasattr(file_node, 'declared_entities') and file_node.declared_entities: result += f"Declared Entities ({len(file_node.declared_entities)}):\n" for entity in file_node.declared_entities[:15]: if isinstance(entity, dict): result += f" - {entity.get('name', '?')} ({entity.get('type', '?')})\n" else: result += f" - {entity}\n" if len(file_node.declared_entities) > 15: result += f" ... and {len(file_node.declared_entities) - 15} more\n" result += f"\nChunks:\n" for chunk in chunks[:10]: result += f" [{chunk.order_in_file}] {chunk.id}\n" if chunk.description: desc = chunk.description[:80] + "..." if len(chunk.description) > 80 else chunk.description result += f" {desc}\n" if len(chunks) > 10: result += f" ... and {len(chunks) - 10} more chunks\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def get_related_chunks(chunk_id: str, relation_type: str = "calls", limit: int = 20, page: int = 1) -> str: """ Find code chunks connected to a given chunk through a specific relationship type. 
PURPOSE: Use this tool to trace code dependencies by following relationship edges from a chunk. Helps understand what code a chunk depends on or what depends on it. WHEN TO USE: - To find what entities/code a chunk calls or uses (relation_type='calls') - To trace dependencies from a specific piece of code - To explore the call graph emanating from a chunk - When you have a chunk ID and want to see connected code RELATIONSHIP TYPES: - 'calls': Entities/chunks that this chunk calls or references (most common) - 'contains': Child nodes contained by this node (for files/directories) - 'declares': Entities declared by this chunk - 'all' or '': Get all outgoing relationships regardless of type TYPICAL WORKFLOW: 1. search_nodes("BertAttention forward") -> find a chunk 2. get_related_chunks(chunk_id, 'calls') -> see what it calls 3. get_node_info(related_chunk_id) -> examine called code COMPARISON WITH OTHER TOOLS: - get_neighbors: All connected nodes (any direction, any type) - get_related_chunks: Outgoing edges only, filtered by relationship type - entity_relationships: Focused on entity nodes and their relationships Args: chunk_id: The ID of the chunk to explore from (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_5') relation_type: Filter by relationship type: 'calls', 'contains', 'declares', or 'all' for everything (default: 'calls') limit: Maximum results per page (default: 20) page: Page number for pagination Returns: str: List of related chunks with their IDs, file paths, and entity names involved in the relationship """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if chunk_id not in knowledge_graph.graph: return f"Error: Chunk '{chunk_id}' not found in knowledge graph" # Convert limit/page to int if they're strings if isinstance(limit, str): try: limit = int(limit) except ValueError: return f"Error: 'limit' must be an integer, got '{limit}'" if isinstance(page, str): try: page = int(page) except ValueError: return 
f"Error: 'page' must be an integer, got '{page}'" if limit <= 0: return "Error: limit must be a positive integer" if page < 1: return "Error: 'page' must be a positive integer (1 or greater)" related = [] if relation_type == "" or relation_type == "all": # Get all outgoing edges regardless of relation type for _, target, attrs in knowledge_graph.graph.out_edges(chunk_id, data=True): target_node = knowledge_graph.graph.nodes[target]['data'] related.append({ "id": target, "file_path": getattr(target_node, 'path', 'Unknown'), "entity_name": attrs.get('entity_name') }) else: for _, target, attrs in knowledge_graph.graph.out_edges(chunk_id, data=True): if attrs.get('relation') == relation_type: target_node = knowledge_graph.graph.nodes[target]['data'] related.append({ "id": target, "file_path": getattr(target_node, 'path', 'Unknown'), "entity_name": attrs.get('entity_name') }) if not related: return f"No chunks found with '{relation_type}' relationship from '{chunk_id}'" total = len(related) # Pagination total_pages = (total + limit - 1) // limit if page > total_pages: return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} results at {limit} per page)" start_idx = (page - 1) * limit end_idx = start_idx + limit page_slice = related[start_idx:end_idx] result = f"Chunks related to '{chunk_id}' via '{relation_type}' (Page {page}/{total_pages}, {total} total):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for i, chunk in enumerate(page_slice, start=start_idx + 1): result += f"{i}. 
{chunk['id']}\n" result += f" File: {chunk['file_path']}\n" if chunk['entity_name']: result += f" Entity: {chunk['entity_name']}\n" result += "\n" # Pagination hint if page < total_pages: result += f"Use page={page + 1} to see the next page\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def list_all_entities( limit: int = 50, page: int = 1, entity_type: Optional[str] = None, declared_in_repo: Optional[bool] = None, called_in_repo: Optional[bool] = None ) -> str: """ Browse all programming entities (classes, functions, methods, variables) tracked in the knowledge graph. PURPOSE: Use this tool to explore the full inventory of code entities in the Transformers library. Supports filtering by type and usage patterns, making it powerful for targeted exploration. WHEN TO USE: - To browse all classes, functions, or methods in the codebase - To find entities that are defined but never used (dead code analysis) - To find external entities that are called but not defined in the repo - To get an overview of entity distribution in the codebase - When you need entity names for use with go_to_definition or find_usages FILTERING OPTIONS: By entity_type: - 'class': Class definitions (BertModel, GPT2Config, etc.) 
- 'function': Standalone functions - 'method': Class methods - 'variable': Variable declarations - 'parameter': Function/method parameters - None: All entity types By declaration status (declared_in_repo): - True: Only entities DEFINED in this repo (has source code) - False: Only external entities (imported from other packages) - None: All entities By usage status (called_in_repo): - True: Only entities that ARE USED somewhere in the code - False: Only entities that are NEVER USED (potential dead code) - None: All entities USEFUL FILTER COMBINATIONS: - All classes: entity_type='class' - Defined classes: entity_type='class', declared_in_repo=True - Unused functions: entity_type='function', called_in_repo=False - External dependencies: declared_in_repo=False, called_in_repo=True Args: limit: Entities per page (default: 50). Use larger values for comprehensive listings. page: Page number starting from 1 for pagination entity_type: Filter by type: 'class', 'function', 'method', 'variable', 'parameter', or None for all declared_in_repo: True=defined in repo, False=external only, None=all called_in_repo: True=has usages, False=never used, None=all Returns: str: List of entities with their types, declaration count, and usage count. Use entity names with go_to_definition or find_usages. 
""" if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: # Convert limit to int if it's a string (MCP may pass strings) if isinstance(limit, str): try: limit = int(limit) except ValueError: return f"Error: 'limit' must be an integer, got '{limit}'" # Convert page to int if it's a string (MCP may pass strings) if isinstance(page, str): try: page = int(page) except ValueError: return f"Error: 'page' must be an integer, got '{page}'" if page < 1: return "Error: 'page' must be a positive integer (1 or greater)" # Handle entity_type - empty string should be treated as None if entity_type == "" or entity_type == "null": entity_type = None # Handle declared_in_repo - convert string to bool if needed if isinstance(declared_in_repo, str): if declared_in_repo.lower() in ("true", "1", "yes"): declared_in_repo = True elif declared_in_repo.lower() in ("false", "0", "no"): declared_in_repo = False elif declared_in_repo.lower() in ("none", "null", "all", ""): declared_in_repo = None # Handle called_in_repo - convert string to bool if needed if isinstance(called_in_repo, str): if called_in_repo.lower() in ("true", "1", "yes"): called_in_repo = True elif called_in_repo.lower() in ("false", "0", "no"): called_in_repo = False elif called_in_repo.lower() in ("none", "null", "all", ""): called_in_repo = None if not knowledge_graph.entities: return "No entities found in the knowledge graph." 
# Filter entities based on criteria filtered_entities = {} for entity_name, info in knowledge_graph.entities.items(): # Filter by entity type if specified if entity_type is not None: entity_types = [t.lower() if t else '' for t in info.get('type', [])] if entity_type.lower() not in entity_types: continue # Filter by declared_in_repo if specified if declared_in_repo is not None: has_declaration = len(info.get('declaring_chunk_ids', [])) > 0 if declared_in_repo and not has_declaration: continue if not declared_in_repo and has_declaration: continue # Filter by called_in_repo (usages) if specified if called_in_repo is not None: has_calls = len(info.get('calling_chunk_ids', [])) > 0 if called_in_repo and not has_calls: continue if not called_in_repo and has_calls: continue filtered_entities[entity_name] = info # Build the response with filtered entities if not filtered_entities: filter_desc = [] if entity_type: filter_desc.append(f"type={entity_type}") if declared_in_repo is not None: filter_desc.append(f"declared_in_repo={declared_in_repo}") if called_in_repo is not None: filter_desc.append(f"called_in_repo={called_in_repo}") filter_text = f" (filtered by {', '.join(filter_desc)})" if filter_desc else "" return f"No entities found{filter_text}." # Calculate pagination total_entities = len(filtered_entities) total_pages = (total_entities + limit - 1) // limit # Ceiling division if page > total_pages: return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total_entities} entities at {limit} per page)" start_idx = (page - 1) * limit end_idx = start_idx + limit # Get the paginated slice of entities entity_items = list(filtered_entities.items()) paginated_items = entity_items[start_idx:end_idx] result = f"All Entities (Page {page}/{total_pages}, {total_entities} total):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for i, (entity_name, info) in enumerate(paginated_items, start=start_idx + 1): result += f"{i}. 
{entity_name}\n" result += f" Types: {', '.join(info.get('type', ['Unknown']))}\n" result += f" Declarations: {len(info.get('declaring_chunk_ids', []))}\n" result += f" Usages: {len(info.get('calling_chunk_ids', []))}\n\n" # Add pagination info result += f"━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n" result += f"Showing {start_idx + 1}-{min(end_idx, total_entities)} of {total_entities} entities\n" result += f"Page {page} of {total_pages}\n" if page < total_pages: result += f"Use page={page + 1} to see the next page\n" # Add filter information if entity_type: result += f"\n(Filtered by type={entity_type})\n" if declared_in_repo is not None: result += f"(Filtered by declared_in_repo={declared_in_repo})\n" if called_in_repo is not None: result += f"(Filtered by called_in_repo={called_in_repo})\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def diff_chunks(node_id_1: str, node_id_2: str) -> str: """ Compare two code chunks and show their differences in unified diff format. PURPOSE: Use this tool to compare two pieces of code side-by-side. Shows exactly what's different between them using standard unified diff format (like git diff). WHEN TO USE: - To compare similar implementations (e.g., two attention mechanisms) - To understand differences between related classes or functions - To analyze variations in code patterns across the codebase - To compare two versions or implementations of similar functionality - When you suspect code duplication and want to see exact differences DIFF FORMAT: - Lines starting with '-' are only in the first chunk - Lines starting with '+' are only in the second chunk - Lines without prefix are common to both - @@ markers show line number context TYPICAL WORKFLOW: 1. search_nodes("attention") -> find attention implementations 2. Get chunk IDs from two different attention classes 3. 
diff_chunks(chunk_id_1, chunk_id_2) -> compare implementations COMPARISON IDEAS: - BertAttention vs GPT2Attention - Different forward() implementations - Similar utility functions in different modules Args: node_id_1: ID of the first chunk/node to compare node_id_2: ID of the second chunk/node to compare Returns: str: Unified diff output showing line-by-line differences. Returns 'No differences found' if chunks are identical. """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if node_id_1 not in knowledge_graph.graph: return f"Error: Node '{node_id_1}' not found in knowledge graph" if node_id_2 not in knowledge_graph.graph: return f"Error: Node '{node_id_2}' not found in knowledge graph" g = knowledge_graph.graph content1 = getattr(g.nodes[node_id_1]['data'], 'content', None) content2 = getattr(g.nodes[node_id_2]['data'], 'content', None) if not content1 or not content2: return "Error: One or both nodes have no content." diff = list(difflib.unified_diff( content1.splitlines(), content2.splitlines(), fromfile=node_id_1, tofile=node_id_2, lineterm="" )) if not diff: return "No differences found between the two chunks." return "\n".join(diff) except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def print_tree(root_id: str = "root", max_depth: int = 3) -> str: """ Display a hierarchical tree view of the repository structure starting from any node. PURPOSE: Use this tool to visualize the structure of the codebase. Shows parent-child relationships in a familiar tree format, helping you understand how files and directories are organized. 
WHEN TO USE: - To explore the directory structure of the Transformers repository - To see what's inside a specific directory (use directory as root_id) - To understand the file organization for a component - To get an overview of the codebase hierarchy - When you need to understand where files are located TREE VISUALIZATION: - Each level shows node name and type (repo, directory, file, chunk) - Indentation represents depth in the hierarchy - Children are limited to prevent overwhelming output TIPS: - Start with max_depth=2 for a high-level overview - Increase max_depth to see more detail (but output gets larger) - Use a directory path as root_id to focus on a specific area - Use list_files_in_directory for more detailed file listings TYPICAL USAGE: - print_tree('root', max_depth=2) -> see top-level structure - print_tree('src/transformers/models', max_depth=2) -> see model organization - print_tree('src/transformers/models/bert', max_depth=3) -> see bert module structure Args: root_id: Starting node ID. Use 'root' for repository root, or a directory/file path to start from a specific location. max_depth: How many levels deep to show (default: 3). Higher values show more detail but larger output. 
Returns: str: ASCII tree visualization showing the hierarchical structure with node names and types """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: # Convert max_depth to int if it's a string (MCP may pass strings) if isinstance(max_depth, str): try: max_depth = int(max_depth) except ValueError: return f"Error: 'max_depth' must be an integer, got '{max_depth}'" g = knowledge_graph.graph if root_id not in g: # Try to find a suitable root roots = [n for n, d in g.nodes(data=True) if getattr(d['data'], 'node_type', None) in ('repo', 'directory', 'file')] if roots: root_id = roots[0] else: return f"Error: Node '{root_id}' not found and no suitable root found" result = f"Tree View (starting from '{root_id}', max depth: {max_depth}):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" def format_node(node_id, depth): if depth > max_depth: return "" node = g.nodes[node_id]['data'] name = getattr(node, 'name', node_id) node_type = getattr(node, 'node_type', '?') line = " " * depth + f"- {name} ({node_type})\n" children = [t for s, t in g.out_edges(node_id)] for child in children[:20]: # Limit children to prevent huge output line += format_node(child, depth + 1) if len(children) > 20: line += " " * (depth + 1) + f"... and {len(children) - 20} more\n" return line result += format_node(root_id, 0) return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def entity_relationships(node_id: str) -> str: """ Display all incoming and outgoing relationships for any node, with relationship types. PURPOSE: Use this tool to get a complete picture of how a node connects to the rest of the knowledge graph. Shows both what points TO this node and what this node points TO. 
WHEN TO USE: - To understand all dependencies of an entity - To see what declares or calls a specific entity - To trace the full relationship network around any node - When you need more detail than get_neighbors provides about relationship types - For entity-centric analysis (understanding a class or function's connections) WHAT YOU'LL SEE: - Incoming relationships: Other nodes that have edges pointing TO this node (e.g., chunks that CALL this function, files that CONTAIN this chunk) - Outgoing relationships: This node's edges pointing TO other nodes (e.g., entities this chunk CALLS, chunks this file CONTAINS) - Relationship types for each edge (calls, declares, contains) COMPARISON WITH SIMILAR TOOLS: - get_node_edges: Same information but different formatting - get_neighbors: Shows neighbor node details, not edge details - get_related_chunks: Filtered by relationship type, chunks only TYPICAL WORKFLOW: 1. go_to_definition("BertModel") -> find entity 2. entity_relationships("BertModel") -> see what calls/uses BertModel Args: node_id: The ID of any node (entity, chunk, file, directory) Returns: str: Complete list of incoming and outgoing relationships with source/target IDs and relationship types """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if node_id not in knowledge_graph.graph: return f"Error: Node '{node_id}' not found in knowledge graph" g = knowledge_graph.graph result = f"Relationships for '{node_id}':\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" incoming = list(g.in_edges(node_id, data=True)) outgoing = list(g.out_edges(node_id, data=True)) if incoming: result += f"Incoming Relationships ({len(incoming)}):\n" for source, target, data in incoming[:20]: result += f" ← {source} [{data.get('relation', '?')}]\n" if len(incoming) > 20: result += f" ... 
and {len(incoming) - 20} more\n" result += "\n" if outgoing: result += f"Outgoing Relationships ({len(outgoing)}):\n" for source, target, data in outgoing[:20]: result += f" → {target} [{data.get('relation', '?')}]\n" if len(outgoing) > 20: result += f" ... and {len(outgoing) - 20} more\n" if not incoming and not outgoing: result += "No relationships found.\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def search_by_type_and_name(node_type: str, name_query: str, limit: int = 10, page: int = 1, partial_allowed: bool = True) -> str: """ Search for nodes by combining type filtering with name pattern matching. PURPOSE: Use this tool for precise, targeted searches when you know the type of node you're looking for and have a partial name. More efficient than list_nodes_by_type when you have name hints. WHEN TO USE: - To find all classes containing 'Attention': search_by_type_and_name('class', 'Attention') - To find functions with 'forward' in name: search_by_type_and_name('function', 'forward') - To find files named 'config': search_by_type_and_name('file', 'config') - When you know the type AND have a partial name to search for - For pattern-based discovery of related components SEARCH BEHAVIOR: - Case-insensitive matching - partial_allowed=True (default): Fuzzy matching, finds 'BertEmbeddings' when searching 'Embed' - partial_allowed=False: Requires exact substring match - Results sorted by match quality (exact matches first, then substring, then fuzzy) VALID node_type VALUES: For entities: 'class', 'function', 'method', 'variable', 'parameter' For structural: 'file', 'chunk', 'directory' SEARCH EXAMPLES: - All Attention classes: search_by_type_and_name('class', 'Attention') - All Embedding classes: search_by_type_and_name('class', 'Embedding') - Config files: search_by_type_and_name('file', 'config') - Forward methods: search_by_type_and_name('method', 'forward') - Test files: search_by_type_and_name('file', 'test_') 
COMPARISON WITH SIMILAR TOOLS: - search_nodes: Full-text search in code content (doesn't filter by type) - list_nodes_by_type: Lists all of a type (no name filter) - search_by_type_and_name: Combines type filter + name search (best of both) Args: node_type: Type to filter by: 'class', 'function', 'method', 'file', 'chunk', 'directory', etc. name_query: Name pattern to search for (case-insensitive). Can be partial. limit: Results per page (default: 10) page: Page number for pagination partial_allowed: Enable fuzzy matching (default: True). Set False for stricter matching. Returns: str: Matching nodes sorted by relevance, with IDs and types. Use IDs with get_node_info for details. """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: # Convert limit/page to int if they're strings (MCP/Gradio may pass strings) if isinstance(limit, str): try: limit = int(limit) except ValueError: return f"Error: 'limit' must be an integer, got '{limit}'" if isinstance(page, str): try: page = int(page) except ValueError: return f"Error: 'page' must be an integer, got '{page}'" # Convert partial_allowed to bool if it's a string if isinstance(partial_allowed, str): partial_allowed = partial_allowed.lower() in ('true', '1', 'yes') if limit <= 0: return "Error: limit must be a positive integer" if page < 1: return "Error: 'page' must be a positive integer (1 or greater)" g = knowledge_graph.graph matches = [] query_lower = name_query.lower() # Build regex pattern for partial_allowed matching # This will match names containing all characters of the query in order if partial_allowed: # Create pattern that matches query as substring or with characters spread out # e.g., "Embed" matches "Embedding", "BertEmbeddings", "EmbedLayer" partial_pattern = '.*'.join(re.escape(c) for c in query_lower) partial_regex = re.compile(partial_pattern, re.IGNORECASE) for nid, n in g.nodes(data=True): node = n['data'] node_name = getattr(node, 'name', '') if not node_name: continue 
# Check if name matches the query name_matches = False if partial_allowed: # Partial match: substring match OR regex pattern match if query_lower in node_name.lower() or partial_regex.search(node_name): name_matches = True else: # Exact substring match if query_lower in node_name.lower(): name_matches = True if not name_matches: continue # Check type based on node_type current_node_type = getattr(node, 'node_type', None) # For entity nodes, check entity_type instead of node_type if current_node_type == 'entity': entity_type = getattr(node, 'entity_type', '') # Fallback: if entity_type is empty, check the entities dictionary # This handles cases where EntityNode was created before the fix if not entity_type and nid in knowledge_graph.entities: entity_types = knowledge_graph.entities[nid].get('type', []) entity_type = entity_types[0] if entity_types else '' if entity_type and entity_type.lower() == node_type.lower(): # Calculate match score for sorting (exact matches first) score = 0 if query_lower == node_name.lower() else (1 if query_lower in node_name.lower() else 2) matches.append({ "id": nid, "name": node_name, "type": f"entity ({entity_type})", "score": score }) # For other nodes, check node_type directly elif current_node_type == node_type: score = 0 if query_lower == node_name.lower() else (1 if query_lower in node_name.lower() else 2) matches.append({ "id": nid, "name": node_name, "type": current_node_type, "score": score }) # Sort by match score (best matches first) matches.sort(key=lambda x: (x['score'], x['name'].lower())) total = len(matches) if total == 0: return f"No matches for type '{node_type}' and name containing '{name_query}'." # Pagination total_pages = (total + limit - 1) // limit if page > total_pages: return f"Error: Page {page} does not exist. 
Total pages: {total_pages} (with {total} results at {limit} per page)" start_idx = (page - 1) * limit end_idx = start_idx + limit page_slice = matches[start_idx:end_idx] result = f"Matches for type '{node_type}' and name '{name_query}' (Page {page}/{total_pages}, {total} total):\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for i, match in enumerate(page_slice, start=start_idx + 1): result += f"{i}. {match['name']}\n" result += f" ID: {match['id']}\n" result += f" Type: {match['type']}\n\n" if page < total_pages: result += f"Use page={page + 1} to see the next page\n" return result except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def get_chunk_context(node_id: str) -> str: """ Get expanded code context by retrieving a chunk along with its previous and next chunks. PURPOSE: Use this tool when you need to see MORE CODE CONTEXT around a specific chunk. Chunks are logical code segments, but sometimes you need to see surrounding code to fully understand the implementation. WHEN TO USE: - After search_nodes or get_node_info when you need more surrounding context - When a chunk shows a partial function/class and you need the complete picture - To understand code flow across chunk boundaries - To see imports or setup code that precedes a chunk - To see what code follows after a chunk WHAT YOU'LL GET: - The previous chunk's content (if it exists) - The target chunk's content - The next chunk's content (if it exists) - All organized by file and joined together seamlessly CONTEXT EXPANSION: - Shows up to 3 consecutive chunks (prev + current + next) - Useful for understanding function bodies that span chunks - Helps see class context when looking at individual methods TYPICAL WORKFLOW: 1. search_nodes("attention forward") -> find relevant chunk 2. get_node_info(chunk_id) -> see chunk content 3. 
get_chunk_context(chunk_id) -> see surrounding code for fuller understanding COMPARISON WITH get_node_info: - get_node_info: Single chunk content + full metadata - get_chunk_context: Expanded code view (prev + current + next chunks), less metadata Args: node_id: The chunk ID to get context for (e.g., 'src/transformers/models/bert/modeling_bert.py::chunk_5') Returns: str: Combined content of previous, current, and next chunks organized by file. Provides seamless code view. """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: if node_id not in knowledge_graph.graph: return f"Error: Node '{node_id}' not found in knowledge graph" g = knowledge_graph.graph current_chunk = g.nodes[node_id]['data'] previous_chunk = knowledge_graph.get_previous_chunk(node_id) next_chunk = knowledge_graph.get_next_chunk(node_id) # Collect all chunks (previous, current, next) chunks = [] if previous_chunk: chunks.append(previous_chunk) chunks.append(current_chunk) if next_chunk: chunks.append(next_chunk) # Organize and join chunks organized = organize_chunks_by_file_name(chunks) full_content = join_organized_chunks(organized) return full_content except Exception as e: return f"Error: {str(e)}" @observe(as_type="tool") def get_file_stats(path: str) -> str: """ Get detailed statistics and metrics for a specific file or directory. PURPOSE: Use this tool to get quantitative metrics about a file including line counts, entity counts, and chunk counts. Useful for understanding file complexity. 
WHEN TO USE: - To assess the size and complexity of a file - To see summary counts of entities declared and called - To understand how a file is chunked - For code metrics and analysis tasks - When deciding which files to explore further METRICS PROVIDED: - Line count (total lines in the file) - Declared entities count with a sample list - Called entities count with a sample list - Number of chunks the file is divided into COMPARISON WITH get_file_structure: - get_file_stats: Quantitative metrics (counts, numbers) - get_file_structure: Qualitative overview (entity names, chunk IDs) TYPICAL USAGE: - get_file_stats('src/transformers/models/bert/modeling_bert.py') -> see metrics - Use this to identify large/complex files before diving in Args: path: The file path to analyze. Must match the path as stored in the knowledge graph. Returns: str: Statistics including line count, declared entities, called entities, and chunk count """ if knowledge_graph is None: return "Error: Knowledge graph not initialized" try: g = knowledge_graph.graph nodes = [n for n, d in g.nodes(data=True) if getattr(d['data'], 'path', None) == path] if not nodes: return f"No nodes found for path '{path}'." result = f"Statistics for '{path}':\n" result += "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n" for node_id in nodes: node = g.nodes[node_id]['data'] content = getattr(node, 'content', '') declared = getattr(node, 'declared_entities', []) called = getattr(node, 'called_entities', []) chunks = [t for s, t in g.out_edges(node_id) if getattr(g.nodes[t]['data'], 'node_type', None) == 'chunk'] result += f"Node: {node_id} ({getattr(node, 'node_type', '?')})\n" result += f" Lines: {len(content.splitlines()) if content else 0}\n" result += f" Declared entities: {len(declared)}\n" if declared: for entity in declared[:10]: if isinstance(entity, dict): result += f" - {entity.get('name', '?')} ({entity.get('type', '?')})\n" else: result += f" - {entity}\n" if len(declared) > 10: result += f" ... 
@observe(as_type="tool")
def find_path(source_id: str, target_id: str, max_depth: int = 5) -> str:
    """
    Find the shortest path between two nodes in the knowledge graph.

    PURPOSE: Use this tool to discover how two code elements are connected through the graph.
    Reveals the chain of relationships linking two seemingly unrelated pieces of code.

    WHEN TO USE:
    - To understand how two classes/functions are related
    - To trace dependency chains between components
    - To discover indirect connections between code elements
    - To verify if two nodes are connected at all
    - For understanding code architecture and coupling

    WHAT YOU'LL GET:
    - Path length (number of hops)
    - Ordered list of nodes from source to target
    - Visual representation of the path

    LIMITATIONS:
    - max_depth limits search to avoid long computations
    - If no path found within max_depth, nodes may still be connected via longer path
    - Very distant nodes may require increasing max_depth

    EXAMPLE QUERIES:
    - How is BertModel connected to GPT2Model?
    - What's the path from a utility function to a model class?
    - How many hops between two files?

    TYPICAL WORKFLOW:
    1. Identify two node IDs of interest
    2. find_path(source, target) -> discover connection
    3. get_node_info for nodes in the path to understand the relationship

    Args:
        source_id: Starting node ID (any node type)
        target_id: Destination node ID (any node type)
        max_depth: Maximum path length to search (default: 5). Increase for distant nodes.

    Returns:
        str: Path from source to target showing each node in sequence, or message if no path found
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit numeric parameters as strings; coerce first.
        if isinstance(max_depth, str):
            try:
                max_depth = int(max_depth)
            except ValueError:
                return f"Error: 'max_depth' must be an integer, got '{max_depth}'"

        path_result = knowledge_graph.find_path(source_id, target_id, max_depth)
        if "error" in path_result:
            return f"Error: {path_result['error']}"
        if not path_result.get("path"):
            return f"No path found from '{source_id}' to '{target_id}' within depth {max_depth}"

        # Assemble the report as a list of fragments, joined once at the end.
        hops = path_result['path']
        pieces = [
            f"Path from '{source_id}' to '{target_id}':\n",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n",
            f"Length: {path_result['length']}\n\n",
        ]
        last_index = len(hops) - 1
        for position, hop in enumerate(hops):
            pieces.append(f"{position}. {hop}\n")
            if position != last_index:
                pieces.append(" ↓\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_subgraph(node_id: str, depth: int = 2, edge_types: Optional[str] = None) -> str:
    """
    Extract a local subgraph around a node up to a specified depth.

    PURPOSE: Use this tool to get a bounded view of the graph neighborhood around any node.
    Shows all nodes reachable within a certain number of hops, optionally filtered by edge type.

    WHEN TO USE:
    - To understand the local network around a class or function
    - To extract a bounded region of the knowledge graph for analysis
    - To see all nodes within N hops of a target node
    - To analyze the dependency neighborhood of a component
    - When get_neighbors isn't enough and you need multi-hop exploration

    DEPTH EXPLANATION:
    - depth=1: Only immediate neighbors (same as get_neighbors)
    - depth=2: Neighbors and their neighbors (2 hops)
    - depth=3+: Larger neighborhood (exponentially more nodes)

    EDGE TYPE FILTERING:
    - Pass comma-separated edge types to filter: 'calls,declares'
    - Common types: 'calls', 'contains', 'declares'
    - Leave empty or None for all edge types

    OUTPUT:
    - Node count and edge count in the subgraph
    - List of all node IDs in the extracted subgraph
    - Filtered by edge types if specified

    TYPICAL WORKFLOW:
    1. Find a central node of interest
    2. get_subgraph(node_id, depth=2) -> see local neighborhood
    3. Use node IDs from result with get_node_info for details

    COMPARISON WITH get_neighbors:
    - get_neighbors: Single hop, shows node details
    - get_subgraph: Multi-hop, shows subgraph structure and counts

    Args:
        node_id: Central node to build subgraph around
        depth: Radius in hops from central node (default: 2). Higher = larger subgraph.
        edge_types: Optional comma-separated filter: 'calls,contains,declares' or None for all

    Returns:
        str: Subgraph summary with node/edge counts and list of included node IDs
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit numeric parameters as strings; coerce first.
        if isinstance(depth, str):
            try:
                depth = int(depth)
            except ValueError:
                return f"Error: 'depth' must be an integer, got '{depth}'"

        selected_types = edge_types.split(",") if edge_types else None
        extraction = knowledge_graph.get_subgraph(node_id, depth, selected_types)
        if "error" in extraction:
            return f"Error: {extraction['error']}"

        pieces = [
            f"Subgraph around '{node_id}' (depth: {depth}):\n",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n",
            f"Nodes: {extraction['node_count']}\n",
            f"Edges: {extraction['edge_count']}\n",
        ]
        if selected_types:
            pieces.append(f"Filtered by edge types: {', '.join(selected_types)}\n")
        pieces.append("\nNodes in subgraph:\n")
        member_ids = extraction['nodes']
        for member in member_ids[:30]:
            pieces.append(f" - {member}\n")
        overflow = len(member_ids) - 30
        if overflow > 0:
            pieces.append(f" ... and {overflow} more\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def list_files_in_directory(directory_path: str = "", pattern: str = "*", recursive: bool = True, limit: int = 50, page: int = 1) -> str:
    """
    Browse and list files in the repository with flexible filtering options.

    PURPOSE: Use this tool to explore the file structure of the Transformers library.
    Supports directory scoping, glob patterns, and recursive/non-recursive modes.

    WHEN TO USE:
    - To see what files exist in a directory
    - To find files by pattern (e.g., all Python files, all test files)
    - To explore the repository structure directory by directory
    - To find specific file types in specific locations
    - When you need file paths for use with other tools

    FILTERING OPTIONS:
    directory_path:
    - Empty string '': Search all files in the repository
    - 'src/transformers/models': Only files under this directory
    - 'src/transformers/models/bert': Focus on a specific model
    pattern (glob patterns):
    - '*': All files (default)
    - '*.py': Python files only
    - 'test_*.py': Test files
    - '*config*': Files with 'config' in name
    - 'modeling_*.py': Modeling files
    recursive:
    - True (default): Include files in subdirectories
    - False: Only files directly in the specified directory

    COMMON USE CASES:
    - All files: list_files_in_directory()
    - Bert model files: list_files_in_directory('src/transformers/models/bert')
    - All Python files: list_files_in_directory(pattern='*.py')
    - Test files only: list_files_in_directory(pattern='test_*.py')
    - Config files: list_files_in_directory(pattern='*config*')

    COMPARISON WITH print_tree:
    - print_tree: Visual hierarchy, includes directories
    - list_files_in_directory: Flat file list with details, better for finding specific files

    Args:
        directory_path: Directory to search in. Empty string for entire repository.
        pattern: Glob pattern for filename filtering (default: '*' matches all)
        recursive: Search subdirectories (default: True)
        limit: Files per page (default: 50)
        page: Page number for pagination

    Returns:
        str: List of matching files with paths, languages, and entity counts
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit parameters as strings; coerce each one.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"
        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"
        if isinstance(recursive, str):
            recursive = recursive.lower() in ('true', '1', 'yes')

        use_pattern = bool(pattern) and pattern != '*'
        scoped = directory_path.rstrip('/') if directory_path else ''

        def _in_scope(candidate_path):
            # Directory filter: prefix match when recursive, exact parent otherwise.
            if not directory_path:
                return True
            if recursive:
                return candidate_path.startswith(scoped + '/') or candidate_path == directory_path
            parent = '/'.join(candidate_path.rsplit('/', 1)[:-1]) if '/' in candidate_path else ''
            return parent == scoped

        def _name_matches(candidate_path, candidate_name):
            # Glob filter applied to the full path, the bare name, and a **/ variant.
            if not use_pattern:
                return True
            return (fnmatch.fnmatch(candidate_path, pattern)
                    or fnmatch.fnmatch(candidate_name, pattern)
                    or fnmatch.fnmatch(candidate_path, f'**/{pattern}'))

        graph = knowledge_graph.graph
        matching_files = []
        for node_key, payload in graph.nodes(data=True):
            record = payload['data']
            if getattr(record, 'node_type', None) != 'file':
                continue
            file_path = getattr(record, 'path', node_key)
            file_name = getattr(record, 'name', '')
            if not _in_scope(file_path):
                continue
            if not _name_matches(file_path, file_name):
                continue
            matching_files.append({
                'path': file_path,
                'name': file_name,
                'language': getattr(record, 'language', 'Unknown'),
                'entity_count': len(getattr(record, 'declared_entities', []))
            })

        # Deterministic ordering for stable pagination.
        matching_files.sort(key=lambda x: x['path'])

        if not matching_files:
            filter_desc = f" in '{directory_path}'" if directory_path else ""
            pattern_desc = f" matching '{pattern}'" if use_pattern else ""
            return f"No files found{filter_desc}{pattern_desc}."

        total = len(matching_files)
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} files at {limit} per page)"

        start_idx = (page - 1) * limit
        page_slice = matching_files[start_idx:start_idx + limit]

        header = "Files"
        if directory_path:
            header += f" in '{directory_path}'"
        if use_pattern:
            header += f" matching '{pattern}'"
        header += f" (Page {page}/{total_pages}, {total} total):\n"

        pieces = [header, "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n"]
        for ordinal, entry in enumerate(page_slice, start=start_idx + 1):
            pieces.append(f"{ordinal}. {entry['path']}\n")
            pieces.append(f" Language: {entry['language']}, Entities: {entry['entity_count']}\n\n")
        if page < total_pages:
            pieces.append(f"Use page={page + 1} to see the next page\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def find_files_importing(module_or_entity: str, limit: int = 30, page: int = 1) -> str:
    """
    Find all files that import or use a specific module, class, or function.

    PURPOSE: Use this tool to trace import dependencies and understand which parts
    of the codebase depend on a particular module or entity.

    WHEN TO USE:
    - To find all files that import a specific module (e.g., 'torch', 'numpy')
    - To trace dependencies on a class or function
    - To understand the impact scope of a module
    - To find usage patterns of external libraries
    - For dependency analysis and impact assessment

    SEARCH BEHAVIOR:
    - Searches through 'called_entities' metadata
    - Also scans code chunks for import statement patterns
    - Matches import, from...import, require, use patterns
    - Case-insensitive matching

    WHAT YOU'LL GET:
    - List of files that import/use the specified module or entity
    - Match type (called_entity or import_statement)
    - Matched entity names when applicable

    EXAMPLE QUERIES:
    - find_files_importing('torch') -> files using PyTorch
    - find_files_importing('numpy') -> files using NumPy
    - find_files_importing('BertModel') -> files using BertModel
    - find_files_importing('attention') -> files related to attention

    LIMITATIONS:
    - May not catch all dynamic imports
    - Pattern matching may have false positives/negatives
    - For comprehensive search, combine with search_nodes

    Args:
        module_or_entity: Name of the module, class, or function to search for (case-insensitive)
        limit: Maximum results per page (default: 30)
        page: Page number for pagination

    Returns:
        str: List of files that import or use the specified module/entity, with match details
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit numeric parameters as strings; coerce first.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"
        if isinstance(page, str):
            try:
                page = int(page)
            except ValueError:
                return f"Error: 'page' must be an integer, got '{page}'"
        if limit <= 0:
            return "Error: limit must be a positive integer"
        if page < 1:
            return "Error: 'page' must be a positive integer (1 or greater)"

        graph = knowledge_graph.graph
        needle = module_or_entity.lower()
        escaped = re.escape(module_or_entity)
        # Compile the import-statement patterns once instead of per chunk.
        import_regexes = [
            re.compile(rf'import\s+.*{escaped}', re.IGNORECASE),
            re.compile(rf'from\s+.*{escaped}.*\s+import', re.IGNORECASE),
            re.compile(rf'require\s*\(\s*["\'].*{escaped}', re.IGNORECASE),
            re.compile(rf'use\s+.*{escaped}', re.IGNORECASE),
        ]
        can_read_chunks = hasattr(knowledge_graph, 'get_chunks_of_file')
        importing_files = []

        for node_key, payload in graph.nodes(data=True):
            record = payload['data']
            if getattr(record, 'node_type', None) != 'file':
                continue
            file_path = getattr(record, 'path', node_key)
            display_name = getattr(record, 'name', '')

            # First pass: look for the needle among the file's called entities.
            lowered = [
                entity.get('name', '').lower() if isinstance(entity, dict) else str(entity).lower()
                for entity in getattr(record, 'called_entities', [])
            ]
            hits = [s for s in lowered if needle in s]
            if hits:
                importing_files.append({
                    'path': file_path,
                    'name': display_name,
                    'matched_entities': hits[:5],
                    'match_type': 'called_entity'
                })
                continue

            # Second pass: scan the leading chunks (imports usually live there).
            chunks = knowledge_graph.get_chunks_of_file(file_path) if can_read_chunks else []
            for chunk in chunks[:3]:
                body = getattr(chunk, 'content', '')
                for regex in import_regexes:
                    if regex.search(body):
                        if not any(f['path'] == file_path for f in importing_files):
                            importing_files.append({
                                'path': file_path,
                                'name': display_name,
                                'matched_entities': [],
                                'match_type': 'import_statement'
                            })
                        break

        importing_files.sort(key=lambda x: x['path'])

        if not importing_files:
            return f"No files found importing '{module_or_entity}'.\n\nTip: Try searching for the module name in code content using search_nodes."

        total = len(importing_files)
        total_pages = (total + limit - 1) // limit
        if page > total_pages:
            return f"Error: Page {page} does not exist. Total pages: {total_pages} (with {total} files at {limit} per page)"

        start_idx = (page - 1) * limit
        page_slice = importing_files[start_idx:start_idx + limit]

        pieces = [
            f"Files importing '{module_or_entity}' (Page {page}/{total_pages}, {total} total):\n",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n",
        ]
        for ordinal, entry in enumerate(page_slice, start=start_idx + 1):
            pieces.append(f"{ordinal}. {entry['path']}\n")
            pieces.append(f" Match type: {entry['match_type']}\n")
            if entry['matched_entities']:
                pieces.append(f" Matched: {', '.join(entry['matched_entities'][:3])}\n")
            pieces.append("\n")
        if page < total_pages:
            pieces.append(f"Use page={page + 1} to see the next page\n")
        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
@observe(as_type="tool")
def get_concept_overview(concept: str, limit: int = 15) -> str:
    """
    Get a high-level overview of how a concept is implemented across the Transformers codebase.

    PURPOSE: Use this tool for broad exploration of a concept or feature. Aggregates related
    classes, functions, files, and code snippets into a single comprehensive view.
    Ideal for initial investigation of a topic.

    WHEN TO USE:
    - FIRST STEP when exploring a new concept (before detailed searches)
    - To understand how a feature is implemented across the codebase
    - To discover all components related to a concept
    - To get a bird's-eye view before diving into specifics
    - When you're not sure where to start investigating

    SEARCH STRATEGY: This tool combines multiple search approaches:
    - Searches entity names (classes, functions, methods) containing the concept
    - Searches file names and paths
    - Searches chunk content and descriptions
    - Aggregates results into categorized sections

    CONCEPT EXAMPLES:
    - 'attention' -> attention mechanisms across all models
    - 'embedding' -> embedding layers and utilities
    - 'tokenizer' -> tokenization components
    - 'generation' -> text generation utilities
    - 'config' -> configuration classes
    - 'cache' -> caching mechanisms
    - 'rope' -> rotary position embeddings
    - 'flash' -> flash attention implementations

    OUTPUT STRUCTURE:
    - Related Classes: Class definitions matching the concept
    - Related Functions/Methods: Functions matching the concept
    - Related Files: Files with concept in path/name
    - Code Snippets: Relevant code chunks

    TYPICAL WORKFLOW:
    1. get_concept_overview('attention') -> see all attention-related components
    2. Identify specific classes/functions of interest
    3. go_to_definition or search_nodes for detailed exploration

    Args:
        concept: The concept to explore (e.g., 'attention', 'embedding', 'generation', 'tokenizer')
        limit: Maximum items per category (default: 15)

    Returns:
        str: Categorized overview with related classes, functions, files, and code snippets
    """
    if knowledge_graph is None:
        return "Error: Knowledge graph not initialized"
    try:
        # MCP clients may transmit numeric parameters as strings; coerce first.
        if isinstance(limit, str):
            try:
                limit = int(limit)
            except ValueError:
                return f"Error: 'limit' must be an integer, got '{limit}'"

        graph = knowledge_graph.graph
        needle = concept.lower()
        chunk_cap = limit // 2

        related_classes, related_functions = [], []
        related_files, related_chunks = [], []

        # Single pass over the graph, bucketing matches by node type.
        for node_key, payload in graph.nodes(data=True):
            record = payload['data']
            node_type = getattr(record, 'node_type', None)
            node_name = getattr(record, 'name', '')
            name_match = needle in node_name.lower()

            if node_type == 'entity':
                entity_type = getattr(record, 'entity_type', '')
                if not name_match:
                    continue
                declaring = getattr(record, 'declaring_chunk_ids', [])
                origin = declaring[0] if declaring else 'Unknown'
                kind = entity_type.lower()
                if kind == 'class' and len(related_classes) < limit:
                    related_classes.append({'name': node_name, 'id': node_key, 'file': origin})
                elif kind in ('function', 'method') and len(related_functions) < limit:
                    related_functions.append({'name': node_name, 'id': node_key, 'type': entity_type, 'file': origin})
            elif node_type == 'file' and len(related_files) < limit:
                file_path = getattr(record, 'path', '')
                if needle in file_path.lower() or name_match:
                    declared = getattr(record, 'declared_entities', [])
                    related_files.append({'path': file_path, 'name': node_name, 'entity_count': len(declared)})
            elif node_type == 'chunk' and len(related_chunks) < chunk_cap:
                content = getattr(record, 'content', '')
                description = getattr(record, 'description', '')
                if needle in content.lower() or needle in (description or '').lower():
                    related_chunks.append({'id': node_key, 'file': getattr(record, 'path', ''), 'content': content})

        total = len(related_classes) + len(related_functions) + len(related_files) + len(related_chunks)

        pieces = [
            f"Concept Overview: '{concept}'\n",
            "━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\n\n",
            f"Found {total} related items across the codebase.\n\n",
        ]

        if related_classes:
            pieces.append(f"📦 Related Classes ({len(related_classes)}):\n")
            for cls in related_classes[:10]:
                pieces.append(f" • {cls['name']}\n")
                pieces.append(f" File: {cls['file']}\n")
            if len(related_classes) > 10:
                pieces.append(f" ... and {len(related_classes) - 10} more\n")
            pieces.append("\n")

        if related_functions:
            pieces.append(f"⚡ Related Functions/Methods ({len(related_functions)}):\n")
            for func in related_functions[:10]:
                pieces.append(f" • {func['name']} ({func['type']})\n")
                pieces.append(f" File: {func['file']}\n")
            if len(related_functions) > 10:
                pieces.append(f" ... and {len(related_functions) - 10} more\n")
            pieces.append("\n")

        if related_files:
            pieces.append(f"📄 Related Files ({len(related_files)}):\n")
            for entry in related_files[:10]:
                pieces.append(f" • {entry['path']}\n")
                pieces.append(f" Entities: {entry['entity_count']}\n")
            if len(related_files) > 10:
                pieces.append(f" ... and {len(related_files) - 10} more\n")
            pieces.append("\n")

        if related_chunks:
            pieces.append(f"📝 Code Snippets ({len(related_chunks)}):\n")
            for chunk in related_chunks[:5]:
                pieces.append(f" • {chunk['id']}\n")
                pieces.append(f" Content:\n{chunk['content']}\n\n")
            if len(related_chunks) > 5:
                pieces.append(f" ... and {len(related_chunks) - 5} more\n")

        if total == 0:
            pieces.append("No direct matches found.\n\n")
            pieces.append("Suggestions:\n")
            pieces.append(f" • Try searching with: search_nodes('{concept}')\n")
            pieces.append(f" • Try partial name: search_by_type_and_name('class', '{concept[:4]}')\n")
            pieces.append(f" • Check entity list: list_all_entities(entity_type='class')\n")

        return "".join(pieces)
    except Exception as e:
        return f"Error: {str(e)}"
def _get_header_explorer() -> str:
    # Static banner rendered at the top of the Gradio app via gr.HTML().
    # NOTE(review): the markup below appears to have lost its HTML tags in
    # transit — confirm against the deployed Space before editing the text.
    html = """
Code Knowledge Graph Explorer — 🤗 Transformers Library
Explore, query, and understand the structure of the Hugging Face Transformers codebase.
"""
    return html
# ==================== Gradio App ====================
def create_gradio_app():
    """Create and configure the Gradio interface.

    Builds one tab per tool family and wires each button to the corresponding
    module-level tool function. Returns the gr.Blocks app (launched by main()).
    """
    with gr.Blocks(title="", theme=gr.themes.Soft()) as demo:
        gr.HTML(_get_header_explorer())

        # Helper to render tool docstrings in the UI
        def _tool_doc_md(func):
            doc = (func.__doc__ or "No description available.").strip()
            # Render as a fenced code block for readability
            return f"**Description:**\n\n```\n{doc}\n```"

        gr.Markdown("""
    Understanding large codebases is essential for software engineers. This Space presents a Code Knowledge Graph MCP Server built around the Hugging Face Transformers library (4,000+ files, 400k+ lines of code). It enables LLM-based coding agents to analyze code structure, follow dependencies, and spot potential improvements. Developed initially for EPITA coding courses, these capabilities make it easier to review, navigate, and understand complex projects such as the Transformers library.
    """)

        # Whole-graph statistics.
        with gr.Tab("📊 Graph Overview"):
            stats_btn = gr.Button("Get Graph Statistics", variant="primary")
            stats_output = gr.Textbox(label="Statistics", lines=20, max_lines=30)
            stats_btn.click(fn=get_graph_stats, outputs=stats_output)
            gr.Markdown(_tool_doc_md(get_graph_stats))

        # Free-text search over the graph.
        with gr.Tab("🔎 Search"):
            with gr.Row():
                with gr.Column():
                    search_query = gr.Textbox(label="Search Query", placeholder="Enter search query...")
                    search_limit = gr.Slider(1, 50, value=10, step=1, label="Results per Page")
                    search_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    search_btn = gr.Button("Search", variant="primary")
                with gr.Column():
                    search_output = gr.Textbox(label="Search Results", lines=20, max_lines=30)
            search_btn.click(fn=search_nodes, inputs=[search_query, search_limit, search_page], outputs=search_output)
            gr.Markdown(_tool_doc_md(search_nodes))

        # Inspect a single node (metadata or edges) — both buttons share one output box.
        with gr.Tab("📝 Node Info"):
            with gr.Row():
                with gr.Column():
                    node_id_input = gr.Textbox(label="Node ID", placeholder="Enter node ID...")
                    node_info_btn = gr.Button("Get Node Info", variant="primary")
                    node_edges_btn = gr.Button("Get Node Edges", variant="secondary")
                with gr.Column():
                    node_output = gr.Textbox(label="Node Information", lines=20, max_lines=30)
            node_info_btn.click(fn=get_node_info, inputs=node_id_input, outputs=node_output)
            gr.Markdown("#Get Node Info:" + _tool_doc_md(get_node_info))
            node_edges_btn.click(fn=get_node_edges, inputs=node_id_input, outputs=node_output)
            gr.Markdown("#Get Node Edges:" + _tool_doc_md(get_node_edges))

        # Repository tree and per-file structure views.
        with gr.Tab("🏗️ Structure"):
            gr.Markdown("### Repository Tree")
            with gr.Row():
                with gr.Column():
                    tree_root = gr.Textbox(label="Root Node ID", value="root", placeholder="root")
                    tree_depth = gr.Slider(1, 10, value=3, step=1, label="Max Depth")
                    tree_btn = gr.Button("Show Tree", variant="primary")
                with gr.Column():
                    tree_output = gr.Textbox(label="Tree View", lines=20, max_lines=40)
            tree_btn.click(fn=print_tree, inputs=[tree_root, tree_depth], outputs=tree_output)
            gr.Markdown(_tool_doc_md(print_tree))
            gr.Markdown("---")
            gr.Markdown("### File Structure")
            with gr.Row():
                with gr.Column():
                    file_path_input = gr.Textbox(label="File Path", placeholder="Enter file path...")
                    file_structure_btn = gr.Button("Get File Structure", variant="primary")
                with gr.Column():
                    file_structure_output = gr.Textbox(label="File Structure", lines=20, max_lines=30)
            file_structure_btn.click(fn=get_file_structure, inputs=file_path_input, outputs=file_structure_output)
            gr.Markdown(_tool_doc_md(get_file_structure))

        # Entity listing, definitions, and usages.
        with gr.Tab("🎯 Entities"):
            gr.Markdown("### List All Entities")
            with gr.Row():
                with gr.Column():
                    entity_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    entity_limit = gr.Slider(10, 100, value=50, step=10, label="Per Page")
                    entity_type_filter = gr.Dropdown(
                        choices=["", "class", "function", "method", "variable", "parameter"],
                        label="Filter by Type (optional)",
                        value=""
                    )
                    declared_in_repo = gr.Dropdown(
                        choices=["", "true", "false"],
                        label="Declared in Repo (optional)",
                        value=""
                    )
                    called_in_repo = gr.Dropdown(
                        choices=["", "true", "false"],
                        label="Called in Repo (optional)",
                        value=""
                    )
                    list_entities_btn = gr.Button("List Entities", variant="primary")
                with gr.Column():
                    list_entities_output = gr.Textbox(label="Entities", lines=20, max_lines=30)
            list_entities_btn.click(
                fn=list_all_entities,
                inputs=[entity_limit, entity_page, entity_type_filter, declared_in_repo, called_in_repo],
                outputs=list_entities_output,
            )
            gr.Markdown(_tool_doc_md(list_all_entities))
            gr.Markdown("---")
            gr.Markdown("### Go to Definition")
            with gr.Row():
                with gr.Column():
                    entity_name_def = gr.Textbox(label="Entity Name", placeholder="Enter entity name...")
                    def_btn = gr.Button("Go to Definition", variant="primary")
                with gr.Column():
                    def_output = gr.Textbox(label="Definition", lines=15, max_lines=25)
            def_btn.click(fn=go_to_definition, inputs=entity_name_def, outputs=def_output)
            gr.Markdown(_tool_doc_md(go_to_definition))
            gr.Markdown("---")
            gr.Markdown("### Find Usages")
            with gr.Row():
                with gr.Column():
                    entity_name_usage = gr.Textbox(label="Entity Name", placeholder="Enter entity name...")
                    usage_limit = gr.Slider(1, 50, value=20, step=1, label="Results per Page")
                    usage_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    usage_btn = gr.Button("Find Usages", variant="primary")
                with gr.Column():
                    usage_output = gr.Textbox(label="Usages", lines=15, max_lines=25)
            usage_btn.click(fn=find_usages, inputs=[entity_name_usage, usage_limit, usage_page], outputs=usage_output)
            gr.Markdown(_tool_doc_md(find_usages))

        # Browse nodes by type or by partial name.
        # NOTE(review): `search_limit`/`search_page` rebind the Search-tab names;
        # harmless (handlers captured the original components) but worth renaming.
        with gr.Tab("🔬 Discovery"):
            gr.Markdown("### List Nodes by Type")
            with gr.Row():
                with gr.Column():
                    node_type_input = gr.Dropdown(
                        choices=["file", "directory", "chunk", "function", "class", "method"],
                        label="Node Type"
                    )
                    type_limit = gr.Slider(1, 100, value=20, step=1, label="Max Results")
                    type_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    type_btn = gr.Button("List Nodes", variant="primary")
                with gr.Column():
                    type_output = gr.Textbox(label="Results", lines=20, max_lines=30)
            type_btn.click(fn=list_nodes_by_type, inputs=[node_type_input, type_limit, type_page], outputs=type_output)
            gr.Markdown(_tool_doc_md(list_nodes_by_type))
            gr.Markdown("---")
            gr.Markdown("### Search by Type and Name")
            with gr.Row():
                with gr.Column():
                    search_type = gr.Dropdown(
                        choices=["file", "directory", "chunk", "function", "class", "method"],
                        label="Node Type"
                    )
                    search_name = gr.Textbox(label="Name Contains", placeholder="Enter partial name...")
                    search_limit = gr.Slider(1, 100, value=10, step=1, label="Max Results")
                    search_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    search_partial_allowed = gr.Checkbox(label="Partial Match", value=True)
                    search_type_btn = gr.Button("Search", variant="primary")
                with gr.Column():
                    search_type_output = gr.Textbox(label="Results", lines=20, max_lines=30)
            search_type_btn.click(fn=search_by_type_and_name, inputs=[search_type, search_name, search_limit, search_page, search_partial_allowed], outputs=search_type_output)
            gr.Markdown(_tool_doc_md(search_by_type_and_name))

        # Graph-relationship tools: neighbors, entity links, related chunks, paths.
        with gr.Tab("🔗 Relationships"):
            gr.Markdown("### Get Neighbors")
            with gr.Row():
                with gr.Column():
                    neighbor_node_id = gr.Textbox(label="Node ID", placeholder="Enter node ID...")
                    neighbor_limit = gr.Slider(1, 100, value=20, step=1, label="Max Results")
                    neighbor_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    neighbor_btn = gr.Button("Get Neighbors", variant="primary")
                with gr.Column():
                    neighbor_output = gr.Textbox(label="Neighbors", lines=20, max_lines=30)
            neighbor_btn.click(fn=get_neighbors, inputs=[neighbor_node_id, neighbor_limit, neighbor_page], outputs=neighbor_output)
            gr.Markdown(_tool_doc_md(get_neighbors))
            gr.Markdown("---")
            gr.Markdown("### Entity Relationships")
            with gr.Row():
                with gr.Column():
                    rel_node_id = gr.Textbox(label="Node ID", placeholder="Enter node ID...")
                    rel_btn = gr.Button("Get Relationships", variant="primary")
                with gr.Column():
                    rel_output = gr.Textbox(label="Relationships", lines=20, max_lines=30)
            rel_btn.click(fn=entity_relationships, inputs=rel_node_id, outputs=rel_output)
            gr.Markdown(_tool_doc_md(entity_relationships))
            gr.Markdown("---")
            gr.Markdown("### Get Related Chunks")
            with gr.Row():
                with gr.Column():
                    related_chunk_id = gr.Textbox(label="Chunk ID", placeholder="Enter chunk ID...")
                    relation_type = gr.Dropdown(choices=["" , "calls", "contains", "declares", "uses"], label="Relation Type", value="calls")
                    related_limit = gr.Slider(1, 100, value=20, step=1, label="Results per Page")
                    related_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    related_btn = gr.Button("Get Related Chunks", variant="primary")
                with gr.Column():
                    related_output = gr.Textbox(label="Related Chunks", lines=20, max_lines=30)
            related_btn.click(fn=get_related_chunks, inputs=[related_chunk_id, relation_type, related_limit, related_page], outputs=related_output)
            gr.Markdown(_tool_doc_md(get_related_chunks))
            gr.Markdown("---")
            gr.Markdown("### Find Path Between Nodes")
            with gr.Row():
                with gr.Column():
                    path_source = gr.Textbox(label="Source Node ID", placeholder="Enter source node ID...")
                    path_target = gr.Textbox(label="Target Node ID", placeholder="Enter target node ID...")
                    path_depth = gr.Slider(1, 10, value=5, step=1, label="Max Depth")
                    path_btn = gr.Button("Find Path", variant="primary")
                with gr.Column():
                    path_output = gr.Textbox(label="Path", lines=20, max_lines=30)
            path_btn.click(fn=find_path, inputs=[path_source, path_target, path_depth], outputs=path_output)
            gr.Markdown(_tool_doc_md(find_path))

        # Context tools: chunk context, concept overview, subgraph extraction.
        with gr.Tab("📖 Context"):
            gr.Markdown("### Get Chunk Context")
            with gr.Row():
                with gr.Column():
                    chunk_id_input = gr.Textbox(label="Chunk ID", placeholder="Enter chunk ID...")
                    context_btn = gr.Button("Get Context", variant="primary")
                with gr.Column():
                    context_output = gr.Textbox(label="Context", lines=25, max_lines=40)
            context_btn.click(fn=get_chunk_context, inputs=chunk_id_input, outputs=context_output)
            gr.Markdown(_tool_doc_md(get_chunk_context))
            gr.Markdown("---")
            gr.Markdown("### Concept Overview")
            with gr.Row():
                with gr.Column():
                    concept_input = gr.Textbox(label="Concept", placeholder="e.g., embedding, authentication...")
                    concept_btn = gr.Button("Get Overview", variant="primary")
                with gr.Column():
                    concept_output = gr.Textbox(label="Concept Overview", lines=25, max_lines=40)
            concept_btn.click(fn=get_concept_overview, inputs=concept_input, outputs=concept_output)
            gr.Markdown(_tool_doc_md(get_concept_overview))
            gr.Markdown("---")
            gr.Markdown("### Get Subgraph")
            with gr.Row():
                with gr.Column():
                    subgraph_node = gr.Textbox(label="Center Node ID", placeholder="Enter node ID...")
                    subgraph_depth = gr.Slider(1, 5, value=2, step=1, label="Depth")
                    subgraph_edge_types = gr.Textbox(label="Edge Types (comma-separated, optional)", placeholder="e.g., calls,contains")
                    subgraph_btn = gr.Button("Retrieve Subgraph", variant="primary")
                with gr.Column():
                    subgraph_output = gr.Textbox(label="Subgraph", lines=20, max_lines=30)
            subgraph_btn.click(fn=get_subgraph, inputs=[subgraph_node, subgraph_depth, subgraph_edge_types], outputs=subgraph_output)
            gr.Markdown(_tool_doc_md(get_subgraph))

        # File browsing, import tracing, per-file statistics.
        # NOTE(review): `stats_btn`/`stats_output` rebind the Overview-tab names;
        # harmless (handlers captured the original components) but worth renaming.
        with gr.Tab("📁 Files"):
            gr.Markdown("### List Files in Directory")
            with gr.Row():
                with gr.Column():
                    dir_path = gr.Textbox(label="Directory Path (empty for root)", placeholder="e.g., src/")
                    file_pattern = gr.Textbox(label="Pattern", value="*", placeholder="e.g., *.py")
                    file_recursive = gr.Checkbox(label="Recursive", value=True)
                    file_limit = gr.Slider(10, 100, value=50, step=10, label="Results per Page")
                    file_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    list_files_btn = gr.Button("List Files", variant="primary")
                with gr.Column():
                    list_files_output = gr.Textbox(label="Files", lines=20, max_lines=30)
            list_files_btn.click(fn=list_files_in_directory, inputs=[dir_path, file_pattern, file_recursive, file_limit, file_page], outputs=list_files_output)
            gr.Markdown(_tool_doc_md(list_files_in_directory))
            gr.Markdown("---")
            gr.Markdown("### Find Files Importing")
            with gr.Row():
                with gr.Column():
                    import_module = gr.Textbox(label="Module/Entity Name", placeholder="e.g., torch, numpy...")
                    import_limit = gr.Slider(10, 50, value=30, step=5, label="Results per Page")
                    import_page = gr.Slider(1, 100, value=1, step=1, label="Page")
                    find_imports_btn = gr.Button("Find Files", variant="primary")
                with gr.Column():
                    find_imports_output = gr.Textbox(label="Importing Files", lines=20, max_lines=30)
            find_imports_btn.click(fn=find_files_importing, inputs=[import_module, import_limit, import_page], outputs=find_imports_output)
            gr.Markdown(_tool_doc_md(find_files_importing))
            gr.Markdown("---")
            gr.Markdown("### Get File Stats")
            with gr.Row():
                with gr.Column():
                    stats_path = gr.Textbox(label="File Path", placeholder="Enter file path...")
                    stats_btn = gr.Button("Get Stats", variant="primary")
                with gr.Column():
                    stats_output = gr.Textbox(label="Statistics", lines=20, max_lines=30)
            stats_btn.click(fn=get_file_stats, inputs=stats_path, outputs=stats_output)
            gr.Markdown(_tool_doc_md(get_file_stats))

        # Diff two chunks/nodes side by side.
        with gr.Tab("🔍 Analysis"):
            gr.Markdown("### Diff Chunks")
            with gr.Row():
                with gr.Column():
                    diff_node1 = gr.Textbox(label="First Node ID", placeholder="Enter first node ID...")
                    diff_node2 = gr.Textbox(label="Second Node ID", placeholder="Enter second node ID...")
                    diff_btn = gr.Button("Show Diff", variant="primary")
                with gr.Column():
                    diff_output = gr.Textbox(label="Diff Output", lines=25, max_lines=40)
            diff_btn.click(fn=diff_chunks, inputs=[diff_node1, diff_node2], outputs=diff_output)
            gr.Markdown(_tool_doc_md(diff_chunks))

    return demo
def main():
    """Parse CLI arguments, load the knowledge graph, and launch the Gradio MCP server.

    Raises:
        SystemExit: via ``parser.error`` when no dataset is supplied either on
            the command line or through the HF_DATASET environment variable.
    """
    parser = argparse.ArgumentParser(description="Knowledge Graph MCP Server from HuggingFace Dataset")
    # Required argument (may also come from the HF_DATASET env var)
    parser.add_argument("--hf-dataset", type=str, default=os.environ.get("HF_DATASET"),
                        help="HuggingFace dataset repo ID (e.g., 'username/dataset-name')")
    # Optional HuggingFace auth (falls back to HF_TOKEN env var)
    parser.add_argument("--hf-token", type=str, default=os.environ.get("HF_TOKEN"),
                        help="HuggingFace API token for private datasets (or set HF_TOKEN env var)")
    # Server settings
    parser.add_argument("--host", type=str, default="0.0.0.0", help="Host to bind to")
    parser.add_argument("--port", type=int, default=7860, help="Port to bind to")
    parser.add_argument("--share", action="store_true", help="Create a public link")
    # Index settings
    parser.add_argument("--no-index", action="store_true", help="Skip indexing nodes")
    parser.add_argument("--code-index-type", type=str, default="keyword-only",
                        choices=["keyword-only", "embedding-only", "hybrid"],
                        help="Type of code index to use")
    parser.add_argument("--code-index-backend", type=str, default="lancedb",
                        choices=["lancedb", "weaviate"],
                        help="Backend for code index")
    args = parser.parse_args()

    # Fail fast with a clear message instead of crashing later with
    # repo_id=None deep inside the dataset loader.
    if not args.hf_dataset:
        parser.error("--hf-dataset is required (or set the HF_DATASET environment variable)")

    # Build code_index_kwargs; embeddings are only needed for non-keyword indexes.
    code_index_kwargs = {
        "index_type": args.code_index_type,
        "backend": args.code_index_backend,
        "use_embed": args.code_index_type != "keyword-only",
    }

    # Initialize knowledge graph (populates the module-level global).
    print("Initializing knowledge graph from HuggingFace dataset...")
    initialize_knowledge_graph(
        hf_dataset=args.hf_dataset,
        hf_token=args.hf_token,
        index_nodes=not args.no_index,
        code_index_kwargs=code_index_kwargs
    )
    print("Knowledge graph initialized!")

    # Create and launch app; mcp_server=True also exposes the tools over MCP.
    demo = create_gradio_app()
    demo.launch(
        server_name=args.host,
        server_port=args.port,
        share=args.share,
        mcp_server=True
    )


if __name__ == "__main__":
    main()