Spaces:

MCP-1st-Birthday
/

code-knowledge-graph-explorer-transformers-library

Sleeping

code-knowledge-graph-explorer-transformers-library

File size: 74,405 Bytes

3ec78dd

import networkx as nx
import json
import os
import asyncio
import nest_asyncio
import tqdm
# from pathlib import Path
import os.path
import tempfile
import subprocess
from typing import List, Optional, Dict
import logging
import urllib.parse

from .ModelService import create_model_service
from .Node import Node, DirectoryNode, FileNode, ChunkNode, EntityNode
from .CodeParser import CodeParser
from .EntityExtractor import HybridEntityExtractor
from .CodeIndex import CodeIndex
from .utils.logger_utils import setup_logger
from .utils.parsing_utils import read_directory_files_recursively, get_language_from_filename
from .utils.path_utils import prepare_input_path, build_entity_alias_map, resolve_entity_call
from .EntityChunkMapper import EntityChunkMapper

LOGGER_NAME = 'REPO_KNOWLEDGE_GRAPH_LOGGER'

MODEL_SERVICE_TYPES = ['openai', 'sentence-transformers']


# A RepoKnowledgeGraph is a weighted DAG based on a tree-structure with added edges
class RepoKnowledgeGraph:
    """
    RepoKnowledgeGraph builds a knowledge graph of a code repository.
    It parses source files, extracts code entities and relationships, and organizes them
    into a directed acyclic graph (DAG) with additional semantic edges.

    Use `from_path()` or `load_graph_from_file()` to create instances.
    """

    def __init__(self):
        """
        Private constructor. Use from_path() or load_graph_from_file() instead.
        """
        raise RuntimeError(
            "Cannot instantiate RepoKnowledgeGraph directly. "
            "Use RepoKnowledgeGraph.from_path() or RepoKnowledgeGraph.load_graph_from_file() instead."
        )

    def _initialize(self, model_service_kwargs: dict, code_index_kwargs: Optional[dict] = None):
        """Internal initialization method."""
        setup_logger(LOGGER_NAME)
        self.logger = logging.getLogger(LOGGER_NAME)
        self.logger.info('Initializing RepoKnowledgeGraph instance.')
        self.code_parser = CodeParser()
        
        # Determine if we should skip loading the embedder based on index_type
        index_type = (code_index_kwargs or {}).get('index_type', 'hybrid')
        skip_embedder = index_type == 'keyword-only'
        if skip_embedder:
            self.logger.info('Using keyword-only index, skipping embedder initialization')
        
        self.model_service = create_model_service(skip_embedder=skip_embedder, **model_service_kwargs)
        self.entities = {}
        self.graph = nx.DiGraph()
        self.knowledge_graph = nx.DiGraph()
        self.code_index = None
        self.entity_extractor = HybridEntityExtractor()

    def __iter__(self):
        # Yield only the 'data' attribute from each node
        return (node_data['data'] for _, node_data in self.graph.nodes(data=True))

    def __getitem__(self, node_id):
        return self.graph.nodes[node_id]['data']
    

    @classmethod
    def from_path(cls, path: str, skip_dirs: Optional[list] = None, index_nodes: bool = True, describe_nodes=False,
                  extract_entities: bool = False, model_service_kwargs: Optional[dict] = None, code_index_kwargs: Optional[dict] = None):
        if skip_dirs is None:
            skip_dirs = []
        if model_service_kwargs is None:
            model_service_kwargs = {}
        """
        Alternative constructor to build a RepoKnowledgeGraph from a path, with options to skip directories
        and control entity extraction and node description.

        Args:
            path (str): Path to the root of the code repository.
            skip_dirs (list): List of directory names to skip.
            index_nodes (bool): Whether to build a code index.
            describe_nodes (bool): Whether to generate descriptions for code chunks.
            extract_entities (bool): Whether to extract entities from code.

        Returns:
            RepoKnowledgeGraph: The constructed knowledge graph.
        """
        instance = cls.__new__(cls)  # Create instance without calling __init__
        instance._initialize(model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)

        instance.logger.info(f"Preparing to build knowledge graph from path: {path}")

        prepared_path = prepare_input_path(path)
        instance.logger.debug(f"Prepared input path: {prepared_path}")

        # Handle running event loop (e.g., in Jupyter)
        try:
            loop = asyncio.get_running_loop()
        except RuntimeError:
            loop = None

        if loop and loop.is_running():
            instance.logger.debug("Detected running event loop, applying nest_asyncio.")
            nest_asyncio.apply()
            task = instance._initial_parse_path_async(prepared_path, skip_dirs=skip_dirs, index_nodes=index_nodes,
                                                      describe_nodes=describe_nodes, extract_entities=extract_entities)
            loop.run_until_complete(task)
        else:
            instance.logger.debug("No running event loop, using asyncio.run.")
            asyncio.run(instance._initial_parse_path_async(prepared_path, skip_dirs=skip_dirs, index_nodes=index_nodes,
                                                           describe_nodes=describe_nodes,
                                                           extract_entities=extract_entities))

        instance.logger.info("Parsing files and building initial nodes...")
        instance.logger.info("Initial parse and node creation complete. Building relationships between nodes...")
        instance._build_relationships()

        if index_nodes:
            instance.logger.info("Building code index for all nodes in the graph...")
            instance.code_index = CodeIndex(list(instance), model_service=instance.model_service, **(code_index_kwargs or {}))

        instance.logger.info("Knowledge graph construction from path completed successfully.")
        return instance

    @classmethod
    def from_repo(
            cls,
            repo_url: str,
            skip_dirs: Optional[list] = None,
            index_nodes: bool = True,
            describe_nodes: bool = False,
            extract_entities: bool = False,
            model_service_kwargs: Optional[dict] = None,
            code_index_kwargs: Optional[dict]=None,
            github_token: Optional[str] = None,
            allow_unauthenticated_clone: bool = True,
    ):
        """
        Alternative constructor to build a RepoKnowledgeGraph from a remote git repository URL.

        Args:
            repo_url (str): Git repository URL (SSH or HTTPS).
            skip_dirs (list): List of directory names to skip.
            index_nodes (bool): Whether to build a code index.
            describe_nodes (bool): Whether to generate descriptions for code chunks.
            extract_entities (bool): Whether to extract entities from code.
            github_token (str, optional): Personal access token to access private GitHub repos.
                If not provided, the method will look for the `GITHUB_OAUTH_TOKEN` environment variable.
            allow_unauthenticated_clone (bool): If True, attempt to clone without a token when none is provided.
                If False, raise an error when no token is available.

        Returns:
            RepoKnowledgeGraph: The constructed knowledge graph.
        """
        if skip_dirs is None:
            skip_dirs = []
        if model_service_kwargs is None:
            model_service_kwargs = {}

        instance = cls.__new__(cls)
        instance._initialize(model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)

        instance.logger.info(f"Starting knowledge graph build from remote repository: {repo_url}")

        # Determine token
        token = github_token or os.environ.get('GITHUB_OAUTH_TOKEN')

        with tempfile.TemporaryDirectory() as tmpdirname:
            clone_url = repo_url
            try:
                if repo_url.startswith('git@'):
                    # Convert git@github.com:owner/repo.git -> https://github.com/owner/repo.git
                    clone_url = repo_url.replace(':', '/').split('git@')[-1]
                    clone_url = f'https://{clone_url}'

                if token and clone_url.startswith('https://'):
                    encoded_token = urllib.parse.quote(token, safe='')
                    clone_url = clone_url.replace('https://', f'https://{encoded_token}@')
                elif not token and not allow_unauthenticated_clone:
                    raise ValueError(
                        "GitHub token not provided and unauthenticated clone is disabled. "
                        "Set allow_unauthenticated_clone=True or provide a token."
                    )

                instance.logger.debug(f"Running git clone: {clone_url} -> {tmpdirname}")
                subprocess.run(['git', 'clone', clone_url, tmpdirname], check=True)

            except Exception as e:
                instance.logger.error(f"Failed to clone repository {repo_url} using URL {clone_url}: {e}")
                raise

            instance.logger.info(f"Repository successfully cloned to: {tmpdirname}")

            return cls.from_path(
                tmpdirname,
                skip_dirs=skip_dirs,
                index_nodes=index_nodes,
                describe_nodes=describe_nodes,
                extract_entities=extract_entities,
                model_service_kwargs=model_service_kwargs,
                code_index_kwargs=code_index_kwargs
            )

    async def _initial_parse_path_async(self, path: str, skip_dirs: list, index_nodes=True, describe_nodes=True,
                                        extract_entities: bool = True):
        self.logger.info(f"Beginning async parsing of repository at path: {path}")
        """
        Orchestrates the parsing and graph construction process:
        1. Reads files and splits into chunks.
        2. Extracts entities and relationships.
        3. Builds chunk, file, directory, and root nodes.
        4. Aggregates entity information.

        Args:
            path (str): Root path to parse.
            skip_dirs (list): Directories to skip.
            index_nodes (bool): Whether to build code index.
            describe_nodes (bool): Whether to generate descriptions.
            extract_entities (bool): Whether to extract entities.
        """

        # --- Pass 1: Create ChunkNodes ---
        level1_node_contents = read_directory_files_recursively(
            path, skip_dirs=skip_dirs,
            skip_pattern=r"(?:\.log$|\.json$|(?:^|/)(?:\.git|\.idea|__pycache__|\.cache)(?:/|$)|(?:^|/)(?:changelog|ChangeLog)(?:\.[a-z0-9]+)?$|\.cache$)"
        )
        self.logger.debug(f"Found {len(level1_node_contents)} files to process.")
        self.logger.info("Chunk nodes creation step started.")
        chunk_info = await self._create_chunk_nodes(
            level1_node_contents, extract_entities, describe_nodes, index_nodes, root_path=path
        )
        self.logger.info("Chunk nodes creation step finished.")
        self.logger.info("File nodes creation step started.")
        file_info = self._create_file_nodes(
            chunk_info, level1_node_contents
        )
        self.logger.info("File nodes creation step finished.")
        self.logger.info("Directory nodes creation step started.")
        dir_agg = self._create_directory_nodes(
            file_info
        )
        self.logger.info("Directory nodes creation step finished.")
        self.logger.info("Aggregating all nodes to root node.")
        self._aggregate_to_root(dir_agg)
        self.logger.info("Async parse and node aggregation fully complete.")

    async def _create_chunk_nodes(self, level1_node_contents, extract_entities, describe_nodes, index_nodes, root_path=None):
        self.logger.info(f"Starting chunk node creation for {len(level1_node_contents)} files.")
        accepted_extensions = {'.py', '.c', '.cpp', '.h', '.hpp', '.java', '.js', '.ts', '.jsx', '.tsx', '.rs', '.html'}
        chunk_info = {}
        entity_mapper = EntityChunkMapper()
        total_chunks = 0

        # Use tqdm for progress bar over files
        for file_path in tqdm.tqdm(level1_node_contents, desc="Processing files for chunk nodes"):
            self.logger.debug(f"Processing file for chunk nodes: {file_path}")
            full_path = os.path.normpath(file_path)
            parts = full_path.split(os.sep)
            _, ext = os.path.splitext(file_path)
            is_code_file = ext.lower() in accepted_extensions

            self.logger.debug(f"Parsing file: {file_path}")

            # Parse file into chunks
            parsed_content = self.code_parser.parse(file_name=file_path, file_content=level1_node_contents[file_path])
            self.logger.debug(f"Parsed {len(parsed_content)} chunks from file: {file_path}")
            total_chunks += len(parsed_content)

            # Entity extraction logging
            if extract_entities and is_code_file:
                self.logger.debug(f"Extracting entities from code file: {file_path}")
                try:
                    # Construct full path for entity extraction (needed for C/C++ include resolution)
                    extraction_file_path = os.path.join(root_path, file_path) if root_path else file_path
                    
                    file_declared_entities, file_called_entities = self.entity_extractor.extract_entities(
                        code=level1_node_contents[file_path], file_name=extraction_file_path)
                    self.logger.debug(f"Extracted {len(file_declared_entities)} declared and {len(file_called_entities)} called entities from file: {file_path}")

                    chunk_declared_map, chunk_called_map = entity_mapper.map_entities_to_chunks(
                        file_declared_entities, file_called_entities, parsed_content, file_name=file_path)
                    self.logger.debug(f"Mapped entities to {len(parsed_content)} chunks for file: {file_path}")
                except Exception as e:
                    self.logger.error(f"Error extracting entities from {file_path}: {e}")
                    file_declared_entities, file_called_entities = [], []
                    chunk_declared_map = {i: [] for i in range(len(parsed_content))}
                    chunk_called_map = {i: [] for i in range(len(parsed_content))}
            else:
                self.logger.debug(f"Skipping entity extraction for non-code file: {file_path}")
                file_declared_entities, file_called_entities = [], []
                chunk_declared_map = {i: [] for i in range(len(parsed_content))}
                chunk_called_map = {i: [] for i in range(len(parsed_content))}

            chunk_tasks = []
            for i, chunk in enumerate(parsed_content):
                chunk_id = f'{file_path}_{i}'
                self.logger.debug(f"Scheduling processing for chunk {chunk_id} of file {file_path}")

                async def process_chunk(i=i, chunk=chunk, chunk_id=chunk_id):
                    self.logger.debug(f"Creating chunk node: {chunk_id}")
                    declared_entities = chunk_declared_map.get(i, [])
                    called_entities = chunk_called_map.get(i, [])

                    # FIRST PASS: Register all declared entities with aliases
                    # Build temporary alias map for checking existing entities
                    temp_alias_map = build_entity_alias_map(self.entities)

                    for entity in declared_entities:
                        name = entity.get("name")
                        if not name:
                            continue

                        # Check if this entity already exists under any of its aliases
                        entity_aliases = entity.get("aliases", [])
                        canonical_name = None

                        # First check if the name itself already exists or is an alias
                        if name in temp_alias_map:
                            canonical_name = temp_alias_map[name]
                            self.logger.debug(f"Entity '{name}' already exists as '{canonical_name}'")
                        else:
                            # Check if any of the entity's aliases match existing entities
                            for alias in entity_aliases:
                                if alias in temp_alias_map:
                                    canonical_name = temp_alias_map[alias]
                                    self.logger.debug(f"Entity '{name}' matches existing entity '{canonical_name}' via alias '{alias}'")
                                    break

                        # If we found a match, use the canonical name; otherwise use the entity name
                        if canonical_name:
                            entity_key = canonical_name
                        else:
                            entity_key = name
                            self.logger.debug(f"Registering new declared entity '{name}' in chunk {chunk_id}")
                            self.entities[entity_key] = {
                                "declaring_chunk_ids": [],
                                "calling_chunk_ids": [],
                                "type": [],
                                "dtype": None,
                                "aliases": []
                            }
                            # Update temp alias map with new entity
                            temp_alias_map[entity_key] = entity_key

                        if chunk_id not in self.entities[entity_key]["declaring_chunk_ids"]:
                            self.entities[entity_key]["declaring_chunk_ids"].append(chunk_id)
                        entity_type = entity.get("type")
                        if entity_type and entity_type not in self.entities[entity_key]["type"]:
                            self.entities[entity_key]["type"].append(entity_type)
                        dtype = entity.get("dtype")
                        if dtype:
                            self.entities[entity_key]["dtype"] = dtype
                        # Store aliases (add new ones, avoiding duplicates)
                        for alias in [name] + entity_aliases:
                            if alias and alias not in self.entities[entity_key]["aliases"]:
                                self.entities[entity_key]["aliases"].append(alias)
                                temp_alias_map[alias] = entity_key  # Update temp map
                        self.logger.debug(f"Declared entity '{name}' registered as '{entity_key}' in chunk {chunk_id} with aliases: {self.entities[entity_key]['aliases']}")


                    # Logging for node creation
                    if describe_nodes:
                        self.logger.info(f"Generating description for chunk {chunk_id}")
                        try:
                            description = await self.model_service.query_async(
                                f'Summarize this {get_language_from_filename(file_path)} code chunk in a few sentences: {chunk}')
                        except Exception as e:
                            self.logger.error(f"Error generating description for chunk {chunk_id}: {e}")
                            description = ''
                    else:
                        self.logger.debug(f"No description requested for chunk {chunk_id}")
                        description = ''

                    chunk_node = ChunkNode(
                        id=chunk_id,
                        name=chunk_id,
                        path=file_path,
                        content=chunk,
                        order_in_file=i,
                        called_entities=called_entities,
                        declared_entities=declared_entities,
                        language=get_language_from_filename(file_path),
                        description=description,
                    )
                    self.logger.debug(f"Chunk node created: {chunk_id}")

                    # NOTE: Embeddings are now deferred to CodeIndex for efficient batch processing
                    # This avoids the slow one-at-a-time embedding during chunk creation
                    chunk_node.embedding = None
                    return (chunk_id, chunk_node, declared_entities, called_entities)

                chunk_tasks.append(process_chunk())

            chunk_results = await asyncio.gather(*chunk_tasks)
            self.logger.debug(f"Finished processing {len(chunk_results)} chunks for file {file_path}.")
            chunk_info[file_path] = {
                'chunk_results': chunk_results,
                'file_declared_entities': file_declared_entities,
                'file_called_entities': file_called_entities
            }

        # Log summary
        self.logger.info(f"Created {total_chunks} chunk nodes from {len(level1_node_contents)} files")

        # SECOND PASS: Now that all declared entities are registered, resolve called entities
        self.logger.info("Starting second pass: resolving called entities using alias map...")
        alias_map = build_entity_alias_map(self.entities)
        self.logger.info(f"Built alias map with {len(alias_map)} entries for resolution")

        resolved_count = 0
        for file_path, file_data in tqdm.tqdm(chunk_info.items(), desc="Resolving called entities"):
            chunk_results = file_data['chunk_results']
            for chunk_id, chunk_node, declared_entities, called_entities in chunk_results:
                for called_name in called_entities:
                    # Skip empty or whitespace-only names
                    if not called_name or not called_name.strip():
                        continue

                    # Try to resolve this called entity to an existing declared entity using aliases
                    resolved_name = resolve_entity_call(called_name, alias_map)

                    # Use the resolved name if found, otherwise check if called_name is already an alias
                    if resolved_name:
                        entity_key = resolved_name
                    elif called_name in alias_map:
                        # The called_name itself is an alias of an existing entity
                        entity_key = alias_map[called_name]
                    else:
                        # No match found, use the original called name
                        entity_key = called_name

                    if entity_key not in self.entities:
                        self.logger.debug(f"Registering new called entity '{entity_key}' (called as '{called_name}') in chunk {chunk_id}")
                        self.entities[entity_key] = {
                            "declaring_chunk_ids": [],
                            "calling_chunk_ids": [],
                            "type": [],
                            "dtype": None,
                            "aliases": []
                        }
                        # Add called_name as an alias if it's different from entity_key
                        if called_name != entity_key:
                            self.entities[entity_key]["aliases"].append(called_name)
                            alias_map[called_name] = entity_key  # Update alias map

                    if chunk_id not in self.entities[entity_key]["calling_chunk_ids"]:
                        self.entities[entity_key]["calling_chunk_ids"].append(chunk_id)

                    if resolved_name and resolved_name != called_name:
                        resolved_count += 1
                        self.logger.debug(f"Called entity '{called_name}' resolved to '{entity_key}' in chunk {chunk_id}")

        self.logger.info(f"Resolved {resolved_count} entity calls to existing declarations via aliases")
        self.logger.info("All chunk nodes have been created for all files.")
        return chunk_info

    def _create_file_nodes(self, chunk_info, level1_node_contents):
        self.logger.info("Starting file node creation.")
        """
        For each file, aggregate chunk information and create FileNode objects.
        This method remains mostly the same.
        """

        def merge_entities(target, source):
            # Merge entity lists, avoiding duplicates by (name, type)
            existing = set((e.get('name'), e.get('type')) for e in target)
            for e in source:
                k = (e.get('name'), e.get('type'))
                if k not in existing:
                    target.append(e)
                    existing.add(k)

        def merge_called_entities(target, source):
            # Merge called entity lists, avoiding duplicates
            existing = set(target)
            for e in source:
                if e not in existing:
                    target.append(e)
                    existing.add(e)

        file_info = {}
        for file_path, file_data in tqdm.tqdm(chunk_info.items(), desc="Creating file nodes"):
            self.logger.info(f"Creating file node for: {file_path}")
            parts = os.path.normpath(file_path).split(os.sep)

            # Extract file-level entities and chunk results from the stored data
            chunk_results = file_data['chunk_results']
            file_declared_entities = list(file_data['file_declared_entities'])  # Use file-level entities directly
            file_called_entities = list(file_data['file_called_entities'])      # Use file-level entities directly
            chunk_ids = []

            for chunk_id, chunk_node, declared_entities, called_entities in chunk_results:
                self.logger.info(f"Adding chunk node {chunk_id} to graph for file {file_path}")
                self.graph.add_node(chunk_id, data=chunk_node, level=2)
                chunk_ids.append(chunk_id)
                # Note: We're using file-level entities for the FileNode, so we don't need to merge from chunks
                # The chunks already have their entities set correctly

            file_node = FileNode(
                id=file_path,
                name=parts[-1],
                path=file_path,
                node_type='file',
                content=level1_node_contents[file_path],
                declared_entities=file_declared_entities,
                called_entities=file_called_entities,
                language=get_language_from_filename(file_path),
            )

            self.logger.debug(f"Adding file node {file_path} to graph.")
            self.graph.add_node(file_path, data=file_node, level=1)
            for chunk_id in chunk_ids:
                self.graph.add_edge(file_path, chunk_id, relation='contains')

            file_info[file_path] = {
                'declared_entities': file_declared_entities,
                'called_entities': file_called_entities,
                'chunk_ids': chunk_ids,
                'parts': parts,
            }
            self.logger.info(f"File node {file_path} added to graph with {len(chunk_ids)} chunks.")

        self.logger.info("All file nodes have been created.")
        return file_info

    def _create_directory_nodes(self, file_info):
        self.logger.info("Starting directory node creation.")
        """
        For each directory, aggregate file information and create DirectoryNode objects.

        Args:
            file_info (dict): Mapping file_path -> file info dict.

        Returns:
            dict: Mapping dir_path -> aggregated entity info.
        """

        def merge_entities(target, source):
            # Merge entity lists, avoiding duplicates by (name, type)
            existing = set((e.get('name'), e.get('type')) for e in target)
            for e in source:
                k = (e.get('name'), e.get('type'))
                if k not in existing:
                    target.append(e)
                    existing.add(k)

        def merge_called_entities(target, source):
            # Merge called entity lists, avoiding duplicates
            existing = set(target)
            for e in source:
                if e not in existing:
                    target.append(e)
                    existing.add(e)

        dir_agg = {}
        for file_path, info in tqdm.tqdm(file_info.items(), desc="Creating directory nodes"):
            self.logger.info(f"Processing directory nodes for file: {file_path}")
            parts = os.path.normpath(file_path).split(os.sep)
            file_declared_entities = info['declared_entities']
            file_called_entities = info['called_entities']
            current_parent = 'root'
            path_accum = ''
            for part in parts[:-1]:  # Skip file itself
                path_accum = os.path.join(path_accum, part) if path_accum else part
                if path_accum not in self.graph:
                    self.logger.info(f"Adding new directory node: {path_accum}")
                    dir_node = DirectoryNode(id=path_accum, name=part, path=path_accum)
                    self.graph.add_node(path_accum, data=dir_node, level=1)
                    self.graph.add_edge(current_parent, path_accum, relation='contains')
                if path_accum not in dir_agg:
                    dir_agg[path_accum] = {'declared_entities': [], 'called_entities': []}
                merge_entities(dir_agg[path_accum]['declared_entities'], file_declared_entities)
                merge_called_entities(dir_agg[path_accum]['called_entities'], file_called_entities)
                current_parent = path_accum
            # Connect file to its parent directory
            self.graph.add_edge(current_parent, file_path, relation='contains')
        self.logger.info("All directory nodes created.")
        return dir_agg

    def _aggregate_to_root(self, dir_agg):
        self.logger.info("Aggregating directory information to root node.")
        """
        Aggregate all directory entity information to the root node.

        Args:
            dir_agg (dict): Mapping dir_path -> aggregated entity info.
        """

        def merge_entities(target, source):
            # Merge entity lists, avoiding duplicates by (name, type)
            existing = set((e.get('name'), e.get('type')) for e in target)
            for e in source:
                k = (e.get('name'), e.get('type'))
                if k not in existing:
                    target.append(e)
                    existing.add(k)

        def merge_called_entities(target, source):
            # Merge called entity lists, avoiding duplicates
            existing = set(target)
            for e in source:
                if e not in existing:
                    target.append(e)
                    existing.add(e)

        root_node = Node(id='root', name='root', node_type='root')
        self.graph.add_node('root', data=root_node, level=0)
        root_declared_entities = []
        root_called_entities = []
        for dir_path, agg in tqdm.tqdm(dir_agg.items(), desc="Aggregating to root"):
            node = self.graph.nodes[dir_path]['data']
            if not hasattr(node, 'declared_entities'):
                node.declared_entities = []
            if not hasattr(node, 'called_entities'):
                node.called_entities = []
            merge_entities(node.declared_entities, agg['declared_entities'])
            merge_called_entities(node.called_entities, agg['called_entities'])
            merge_entities(root_declared_entities, agg['declared_entities'])
            merge_called_entities(root_called_entities, agg['called_entities'])
        if not hasattr(root_node, 'declared_entities'):
            root_node.declared_entities = []
        if not hasattr(root_node, 'called_entities'):
            root_node.called_entities = []
        merge_entities(root_node.declared_entities, root_declared_entities)
        merge_called_entities(root_node.called_entities, root_called_entities)
        self.logger.info("Aggregation to root node complete.")

    def _build_relationships(self):
        self.logger.info("Building relationships between chunk nodes based on entities.")
        """
        Build relationships between chunk nodes and entity nodes based on self.entities.
        For each entity in self.entities:
        1. Create an EntityNode with entity_name as the id
        2. Create edges from declaring chunks to entity node (declares relationship)
        3. Create edges from entity node to calling chunks (called_by relationship)
        4. Resolve called entity names using aliases for better matching
        """
        from .Node import EntityNode
        edges_created = 0
        entity_nodes_created = 0
        
        # Build alias map for quick lookups
        self.logger.info("Building entity alias map for call resolution...")
        alias_map = build_entity_alias_map(self.entities)
        self.logger.info(f"Built alias map with {len(alias_map)} entries")

        # First pass: Create all entity nodes
        for entity_name, info in tqdm.tqdm(self.entities.items(), desc="Creating entity nodes"):
            # Entity type is stored as a list in 'type' key, get first type or empty string
            entity_types = info.get('type', [])
            entity_type = entity_types[0] if entity_types else ''
            declaring_chunks = info.get('declaring_chunk_ids', [])
            calling_chunks = info.get('calling_chunk_ids', [])
            aliases = info.get('aliases', [])

            # Create EntityNode with entity_name as id
            entity_node = EntityNode(
                id=entity_name,
                name=entity_name,
                entity_type=entity_type,
                declaring_chunk_ids=declaring_chunks,
                calling_chunk_ids=calling_chunks,
                aliases=aliases
            )
            
            # Add entity node to graph
            self.graph.add_node(entity_name, data=entity_node, level=3)
            entity_nodes_created += 1
            
            # Log aliases for debugging
            if aliases:
                self.logger.debug(f"Created EntityNode '{entity_name}' with aliases: {aliases}")

            # Create edges from declaring chunks to entity node
            for declarer_id in declaring_chunks:
                if declarer_id in self.graph:
                    self.graph.add_edge(declarer_id, entity_name, relation='declares')
                    edges_created += 1
            
            # Create edges from entity node to calling chunks
            for caller_id in calling_chunks:
                if caller_id in self.graph and caller_id not in declaring_chunks:
                    self.graph.add_edge(entity_name, caller_id, relation='called_by')
                    edges_created += 1

        # Second pass: Resolve unmatched entity calls using alias matching
        self.logger.info("Resolving entity calls using alias matching...")
        resolved_calls = 0

        for entity_name, info in tqdm.tqdm(self.entities.items(), desc="Resolving entity calls"):
            # Skip entities that already have declarations (they were matched directly)
            if info.get('declaring_chunk_ids'):
                continue

            # Try to resolve this called entity to a declared entity using aliases
            resolved_name = resolve_entity_call(entity_name, alias_map)

            if resolved_name and resolved_name != entity_name:
                # Found a match! Update the calling_chunk_ids of the resolved entity
                calling_chunks = info.get('calling_chunk_ids', [])

                if resolved_name in self.entities:
                    for caller_id in calling_chunks:
                        if caller_id in self.graph:
                            # Add edge from resolved entity to calling chunk
                            if not self.graph.has_edge(resolved_name, caller_id):
                                self.graph.add_edge(resolved_name, caller_id, relation='called_by')
                                edges_created += 1
                                resolved_calls += 1
                                self.logger.debug(f"Resolved call: '{entity_name}' -> '{resolved_name}' in chunk {caller_id}")

        self.logger.info(f"_build_relationships: Created {entity_nodes_created} entity nodes, "
                        f"{edges_created} edges, and resolved {resolved_calls} entity calls using aliases.")

    def get_entity_by_alias(self, alias: str) -> Optional[str]:
        """
        Get the canonical entity name for a given alias.

        Args:
            alias: An alias of an entity (e.g., 'MyClass' or 'module.MyClass')

        Returns:
            Canonical entity name if found, None otherwise
        """
        alias_map = build_entity_alias_map(self.entities)
        return alias_map.get(alias)

    def resolve_entity_references(self) -> Dict[str, List[str]]:
        """
        Resolve all entity references in the knowledge graph using aliases.
        Returns a mapping of unresolved entity calls to their potential matches.

        Returns:
            Dictionary mapping called entity names to list of potential canonical matches
        """
        alias_map = build_entity_alias_map(self.entities)
        resolutions = {}

        for entity_name, info in self.entities.items():
            # Only look at entities that are called but not declared
            if not info.get('declaring_chunk_ids') and info.get('calling_chunk_ids'):
                resolved = resolve_entity_call(entity_name, alias_map)
                if resolved:
                    resolutions[entity_name] = resolved

        return resolutions

    def print_tree(self, max_depth=None, start_node_id='root', level=0, prefix=""):
        """
        Print the repository tree structure using the graph with 'contains' edges.

        Args:
            max_depth (int, optional): Maximum depth to print. None = unlimited.
            start_node_id (str): ID of the node to start from. Default is 'root'.
            level (int): Internal use only (used for recursion).
            prefix (str): Internal use only (used for formatting output).
        """
        if max_depth is not None and level > max_depth:
            self.logger.debug(f"Max depth {max_depth} reached at node {start_node_id}.")
            return

        if start_node_id not in self.graph:
            self.logger.warning(f"Start node '{start_node_id}' not found in graph.")
            return

        try:
            node_data = self[start_node_id]
        except KeyError as e:
            self.logger.error(f"KeyError when accessing node {start_node_id}: {e}")
            self.logger.error(f"Available node attributes: {list(self.graph.nodes[start_node_id].keys())}")
            # Use a fallback approach if 'data' is missing
            if 'data' not in self.graph.nodes[start_node_id]:
                self.logger.warning(f"Node {start_node_id} has no 'data' attribute, using node itself")
                # Create a fallback node if 'data' is missing
                if start_node_id == 'root':
                    # Create a default root node
                    node_data = Node(id='root', name='root', node_type='root')
                    # Update the graph node with the fallback data
                    self.graph.nodes[start_node_id]['data'] = node_data
                else:
                    # Try to infer node type from ID or structure
                    name = start_node_id.split('/')[-1] if '/' in start_node_id else start_node_id
                    if '_' in start_node_id and start_node_id.split('_')[-1].isdigit():
                        # Looks like a chunk ID
                        node_data = ChunkNode(id=start_node_id, name=name, node_type='chunk')
                    elif '.' in name:
                        # Looks like a file
                        node_data = FileNode(id=start_node_id, name=name, node_type='file', path=start_node_id)
                    else:
                        # Fallback to directory or generic node
                        node_data = DirectoryNode(id=start_node_id, name=name, node_type='directory',
                                                  path=start_node_id)
                    # Update the graph node with the fallback data
                    self.graph.nodes[start_node_id]['data'] = node_data
            return

        # Choose icon based on node type
        if node_data.node_type == 'file':
            node_symbol = "📄"
        elif node_data.node_type == 'chunk':
            node_symbol = "📝"
        elif node_data.node_type == 'root':
            node_symbol = "📁"
        elif node_data.node_type == 'directory':
            node_symbol = "📂"
        else:
            node_symbol = "📦"

        if level == 0:
            print(f"{node_symbol} {node_data.name} ({node_data.node_type})")
        else:
            print(f"{prefix}└── {node_symbol} {node_data.name} ({node_data.node_type})")

        # Get children via 'contains' edges
        children = [
            child for child in self.graph.successors(start_node_id)
            if self.graph.edges[start_node_id, child].get('relation') == 'contains'
        ]

        child_count = len(children)
        for i, child_id in enumerate(children):
            is_last = i == child_count - 1
            new_prefix = prefix + ("    " if is_last else "│   ")
            self.print_tree(max_depth, start_node_id=child_id, level=level + 1, prefix=new_prefix)

    def to_dict(self):
        self.logger.info("Serializing graph to dictionary.")
        from .Node import EntityNode
        graph_data = {
            'nodes': [],
            'edges': []
        }

        for node_id, node_attrs in tqdm.tqdm(self.graph.nodes(data=True), desc="Serializing nodes"):
            if 'data' not in node_attrs:
                self.logger.warning(f"Node {node_id} has no 'data' attribute, skipping in serialization")
                continue

            node = node_attrs['data']
            node_dict = {
                'id': node.id or node_id,
                'class': node.__class__.__name__,
                'data': {
                    'id': node.id or node_id,
                    'name': node.name,
                    'node_type': node.node_type,
                    'description': getattr(node, 'description', ''),
                    'declared_entities': list(getattr(node, 'declared_entities', [])),
                    'called_entities': list(getattr(node, 'called_entities', [])),
                }
            }

            # FileNode-specific
            if isinstance(node, FileNode):
                node_dict['data']['path'] = node.path
                node_dict['data']['content'] = node.content
                node_dict['data']['language'] = getattr(node, 'language', '')

            # ChunkNode-specific
            if isinstance(node, ChunkNode):
                node_dict['data']['order_in_file'] = getattr(node, 'order_in_file', 0)
                node_dict['data']['embedding'] = getattr(node, 'embedding', None)
            
            # EntityNode-specific
            if isinstance(node, EntityNode):
                node_dict['data']['entity_type'] = getattr(node, 'entity_type', '')
                node_dict['data']['declaring_chunk_ids'] = list(getattr(node, 'declaring_chunk_ids', []))
                node_dict['data']['calling_chunk_ids'] = list(getattr(node, 'calling_chunk_ids', []))
                node_dict['data']['aliases'] = list(getattr(node, 'aliases', []))

            graph_data['nodes'].append(node_dict)

        for u, v, attrs in tqdm.tqdm(self.graph.edges(data=True), desc="Serializing edges"):
            edge_data = {
                'source': u,
                'target': v,
                'relation': attrs.get('relation', '')
            }
            if 'entities' in attrs:
                edge_data['entities'] = list(attrs['entities'])
            graph_data['edges'].append(edge_data)

        self.logger.info("Serialization complete.")
        return graph_data

    @classmethod
    def from_dict(cls, data_dict, index_nodes: bool = True, use_embed: bool = True,
                  model_service_kwargs: Optional[dict] = None, code_index_kwargs: Optional[dict] = None):
        # ...existing code...
        instance = cls.__new__(cls)  # bypass __init__
        instance._initialize(model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)

        instance.logger.info("Deserializing graph from dictionary.")

        
        node_classes = {
            'Node': Node,
            'FileNode': FileNode,
            'ChunkNode': ChunkNode,
            'DirectoryNode': DirectoryNode,
            'EntityNode': EntityNode,
        }

        # Create a root node if not present in the data
        root_found = any(node_data['id'] == 'root' for node_data in data_dict['nodes'])
        if not root_found:
            instance.logger.warning("Root node not found in the data, creating one")
            root_node = Node(id='root', name='root', node_type='root')
            instance.graph.add_node('root', data=root_node, level=0)

        # --- Rebuild nodes ---
        for node_data in tqdm.tqdm(data_dict['nodes'], desc="Rebuilding nodes"):
            cls_name = node_data['class']
            node_cls = node_classes.get(cls_name, Node)
            kwargs = node_data['data']

            # Ensure ID is properly set
            if not kwargs.get('id'):
                kwargs['id'] = node_data['id']

            # Always use lists for declared_entities and called_entities
            kwargs['declared_entities'] = list(kwargs.get('declared_entities', []))
            kwargs['called_entities'] = list(kwargs.get('called_entities', []))

            # FileNode-specific
            if node_cls in (FileNode, ChunkNode):
                kwargs.setdefault('path', '')
                kwargs.setdefault('content', '')
                kwargs.setdefault('language', '')
            if node_cls == ChunkNode:
                kwargs.setdefault('order_in_file', 0)
                kwargs.setdefault('embedding', [])
            # EntityNode-specific
            if node_cls == EntityNode:
                kwargs.setdefault('entity_type', '')
                kwargs.setdefault('declaring_chunk_ids', [])
                kwargs.setdefault('calling_chunk_ids', [])
                kwargs.setdefault('aliases', [])

            node_instance = node_cls(**kwargs)
            instance.graph.add_node(node_data['id'], data=node_instance, level=instance._infer_level(node_instance))

        # --- Rebuild edges ---
        for edge in tqdm.tqdm(data_dict['edges'], desc="Rebuilding edges"):
            source = edge['source']
            target = edge['target']
            if source in instance.graph and target in instance.graph:
                edge_kwargs = {'relation': edge.get('relation', '')}
                if 'entities' in edge:
                    edge_kwargs['entities'] = list(edge['entities'])
                instance.graph.add_edge(source, target, **edge_kwargs)
            else:
                instance.logger.warning(f"Cannot add edge {source} -> {target}, nodes don't exist")

        # --- Rebuild instance.entities ---
        instance.entities = {}
        for node_id, node_attrs in tqdm.tqdm(instance.graph.nodes(data=True), desc="Rebuilding entities"):
            node = node_attrs['data']
            declared_entities = getattr(node, 'declared_entities', [])
            called_entities = getattr(node, 'called_entities', [])
            for entity in declared_entities:
                if isinstance(entity, dict):
                    name = entity.get('name')
                else:
                    name = entity
                if not name:
                    continue
                if name not in instance.entities:
                    instance.entities[name] = {
                        "declaring_chunk_ids": [],
                        "calling_chunk_ids": [],
                        "type": [],
                        "dtype": None
                    }
                # Only add node_id if it is a ChunkNode
                if node_id not in instance.entities[name]["declaring_chunk_ids"]:
                    if node_id in instance.graph and isinstance(instance.graph.nodes[node_id]["data"], ChunkNode):
                        instance.entities[name]["declaring_chunk_ids"].append(node_id)
                if isinstance(entity, dict):
                    entity_type = entity.get("type")
                    if entity_type and entity_type not in instance.entities[name]["type"]:
                        instance.entities[name]["type"].append(entity_type)
                    dtype = entity.get("dtype")
                    if dtype:
                        instance.entities[name]["dtype"] = dtype
            for called_name in called_entities:
                if not called_name:
                    continue
                if called_name not in instance.entities:
                    instance.entities[called_name] = {
                        "declaring_chunk_ids": [],
                        "calling_chunk_ids": [],
                        "type": [],
                        "dtype": None
                    }
                if node_id not in instance.entities[called_name]["calling_chunk_ids"]:
                    if node_id in instance.graph and isinstance(instance.graph.nodes[node_id]["data"], ChunkNode):
                        instance.entities[called_name]["calling_chunk_ids"].append(node_id)

        if index_nodes:
            instance.logger.info("Building code index after deserialization.")
            # Merge use_embed with code_index_kwargs, avoiding duplicate keyword arguments
            code_idx_kwargs = code_index_kwargs or {}
            if 'use_embed' not in code_idx_kwargs:
                code_idx_kwargs['use_embed'] = use_embed
            instance.code_index = CodeIndex(list(instance), model_service=instance.model_service, **code_idx_kwargs)

        instance.logger.info("Deserialization complete.")
        return instance

    def _infer_level(self, node):
        """Infer the level of a node based on its type"""
        if node.node_type == 'root':
            return 0
        elif node.node_type in ('file', 'directory'):
            return 1
        elif node.node_type == 'chunk':
            return 2
        return 1  # Default level

    def save_graph_to_file(self, filepath: str):
        self.logger.info(f"Saving graph to file: {filepath}")
        with open(filepath, 'w') as f:
            json.dump(self.to_dict(), f, indent=2)
        self.logger.info("Graph saved successfully.")

    @classmethod
    def load_graph_from_file(cls, filepath: str, index_nodes=True, use_embed: bool = True,
                             model_service_kwargs: Optional[dict] = None, code_index_kwargs: Optional[dict] = None):
        if model_service_kwargs is None:
            model_service_kwargs = {}
        with open(filepath, 'r') as f:
            data = json.load(f)
        logging.getLogger(LOGGER_NAME).info(f"Loaded graph data from file: {filepath}")
        return cls.from_dict(data, use_embed=use_embed, index_nodes=index_nodes,
                             model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)

    def to_hf_dataset(
        self,
        repo_id: str,
        save_embeddings: bool = True,
        private: bool = False,
        token: Optional[str] = None,
        commit_message: Optional[str] = None,
    ):
        """
        Save the knowledge graph to a HuggingFace dataset on the Hub.

        The graph is serialized into two splits:
        - 'nodes': Contains all node data
        - 'edges': Contains all edge relationships

        Args:
            repo_id (str): The HuggingFace dataset repository ID (e.g., 'username/dataset-name')
            save_embeddings (bool): If True, saves embedding vectors for chunk nodes. 
                                   If False, embeddings are excluded to reduce dataset size.
            private (bool): Whether the dataset should be private. Defaults to False.
            token (str, optional): HuggingFace API token. If not provided, uses the token
                                  from huggingface_hub login or HF_TOKEN environment variable.
            commit_message (str, optional): Custom commit message for the upload.

        Returns:
            str: URL of the uploaded dataset
        """
        try:
            from datasets import Dataset, DatasetDict
            from huggingface_hub import HfApi
        except ImportError:
            raise ImportError(
                "huggingface_hub and datasets are required for HuggingFace integration. "
                "Install them with: pip install huggingface_hub datasets"
            )

        self.logger.info(f"Preparing to save knowledge graph to HuggingFace dataset: {repo_id}")
        self.logger.info(f"save_embeddings={save_embeddings}")

        # Serialize nodes
        nodes_data = []
        for node_id, node_attrs in tqdm.tqdm(self.graph.nodes(data=True), desc="Serializing nodes for HF dataset"):
            if 'data' not in node_attrs:
                self.logger.warning(f"Node {node_id} has no 'data' attribute, skipping")
                continue

            node = node_attrs['data']
            node_record = {
                'node_id': node.id or node_id,
                'node_class': node.__class__.__name__,
                'name': node.name,
                'node_type': node.node_type,
                'description': getattr(node, 'description', '') or '',
                'declared_entities': json.dumps(list(getattr(node, 'declared_entities', []))),
                'called_entities': json.dumps(list(getattr(node, 'called_entities', []))),
            }

            # FileNode-specific fields
            if isinstance(node, FileNode):
                node_record['path'] = node.path
                node_record['content'] = node.content
                node_record['language'] = getattr(node, 'language', '')
            else:
                node_record['path'] = ''
                node_record['content'] = ''
                node_record['language'] = ''

            # ChunkNode-specific fields
            if isinstance(node, ChunkNode):
                node_record['order_in_file'] = getattr(node, 'order_in_file', 0)
                if save_embeddings:
                    embedding = getattr(node, 'embedding', None)
                    node_record['embedding'] = json.dumps(embedding if embedding is not None else [])
                else:
                    node_record['embedding'] = json.dumps([])
            else:
                node_record['order_in_file'] = -1
                node_record['embedding'] = json.dumps([])

            # EntityNode-specific fields
            if isinstance(node, EntityNode):
                node_record['entity_type'] = getattr(node, 'entity_type', '')
                node_record['declaring_chunk_ids'] = json.dumps(list(getattr(node, 'declaring_chunk_ids', [])))
                node_record['calling_chunk_ids'] = json.dumps(list(getattr(node, 'calling_chunk_ids', [])))
                node_record['aliases'] = json.dumps(list(getattr(node, 'aliases', [])))
            else:
                node_record['entity_type'] = ''
                node_record['declaring_chunk_ids'] = json.dumps([])
                node_record['calling_chunk_ids'] = json.dumps([])
                node_record['aliases'] = json.dumps([])

            nodes_data.append(node_record)

        # Serialize edges
        edges_data = []
        for source, target, attrs in tqdm.tqdm(self.graph.edges(data=True), desc="Serializing edges for HF dataset"):
            edge_record = {
                'source': source,
                'target': target,
                'relation': attrs.get('relation', ''),
                'entities': json.dumps(list(attrs.get('entities', []))) if 'entities' in attrs else json.dumps([])
            }
            edges_data.append(edge_record)

        # Create datasets
        nodes_dataset = Dataset.from_list(nodes_data)
        edges_dataset = Dataset.from_list(edges_data)

        self.logger.info(f"Created dataset with {len(nodes_data)} nodes and {len(edges_data)} edges")

        # Push to Hub - nodes and edges are pushed separately as different configs
        # because they have different schemas
        if commit_message is None:
            base_commit_message = f"Upload knowledge graph ({len(nodes_data)} nodes, {len(edges_data)} edges)"
            if not save_embeddings:
                base_commit_message += " [embeddings excluded]"
        else:
            base_commit_message = commit_message

        self.logger.info(f"Pushing nodes dataset to HuggingFace Hub: {repo_id}")
        nodes_dataset.push_to_hub(
            repo_id=repo_id,
            config_name="nodes",
            private=private,
            token=token,
            commit_message=f"{base_commit_message} - nodes"
        )

        self.logger.info(f"Pushing edges dataset to HuggingFace Hub: {repo_id}")
        edges_dataset.push_to_hub(
            repo_id=repo_id,
            config_name="edges",
            private=private,
            token=token,
            commit_message=f"{base_commit_message} - edges"
        )

        url = f"https://huggingface.co/datasets/{repo_id}"
        self.logger.info(f"Dataset successfully uploaded to: {url}")
        return url

    @classmethod
    def from_hf_dataset(
        cls,
        repo_id: str,
        index_nodes: bool = True,
        use_embed: bool = True,
        model_service_kwargs: Optional[dict] = None,
        code_index_kwargs: Optional[dict] = None,
        token: Optional[str] = None,
        revision: Optional[str] = None,
    ):
        """
        Load a knowledge graph from a HuggingFace dataset on the Hub.

        Args:
            repo_id (str): The HuggingFace dataset repository ID (e.g., 'username/dataset-name')
            index_nodes (bool): Whether to build a code index after loading. Defaults to True.
            use_embed (bool): Whether to use existing embeddings from the dataset. Defaults to True.
            model_service_kwargs (dict, optional): Arguments for the model service.
            code_index_kwargs (dict, optional): Arguments for the code index.
            token (str, optional): HuggingFace API token for private datasets.
            revision (str, optional): Git revision (branch, tag, or commit) to load from.

        Returns:
            RepoKnowledgeGraph: The loaded knowledge graph instance.
        """
        try:
            from datasets import load_dataset
        except ImportError:
            raise ImportError(
                "datasets library is required for HuggingFace integration. "
                "Install it with: pip install datasets"
            )

        if model_service_kwargs is None:
            model_service_kwargs = {}

        logger = logging.getLogger(LOGGER_NAME)
        logger.info(f"Loading knowledge graph from HuggingFace dataset: {repo_id}")

        # Load dataset from Hub - nodes and edges are stored as separate configs
        logger.info("Loading nodes config...")
        nodes_dataset = load_dataset(repo_id, name="nodes", token=token, revision=revision)
        logger.info("Loading edges config...")
        edges_dataset = load_dataset(repo_id, name="edges", token=token, revision=revision)

        # Get the train split (default split when pushing with config_name)
        nodes_data = nodes_dataset['train']
        edges_data = edges_dataset['train']

        logger.info(f"Loaded {len(nodes_data)} nodes and {len(edges_data)} edges from dataset")

        # Convert to the dict format expected by from_dict
        graph_data = {
            'nodes': [],
            'edges': []
        }

        # Reconstruct nodes
        for record in tqdm.tqdm(nodes_data, desc="Reconstructing nodes from HF dataset"):
            node_dict = {
                'id': record['node_id'],
                'class': record['node_class'],
                'data': {
                    'id': record['node_id'],
                    'name': record['name'],
                    'node_type': record['node_type'],
                    'description': record['description'],
                    'declared_entities': json.loads(record['declared_entities']),
                    'called_entities': json.loads(record['called_entities']),
                }
            }

            # FileNode-specific fields
            if record['node_class'] in ('FileNode', 'ChunkNode'):
                node_dict['data']['path'] = record['path']
                node_dict['data']['content'] = record['content']
                node_dict['data']['language'] = record['language']

            # ChunkNode-specific fields
            if record['node_class'] == 'ChunkNode':
                node_dict['data']['order_in_file'] = record['order_in_file']
                embedding = json.loads(record['embedding'])
                # Only use embedding if use_embed is True and embedding is non-empty
                if use_embed and embedding:
                    node_dict['data']['embedding'] = embedding
                else:
                    node_dict['data']['embedding'] = []

            # EntityNode-specific fields
            if record['node_class'] == 'EntityNode':
                node_dict['data']['entity_type'] = record['entity_type']
                node_dict['data']['declaring_chunk_ids'] = json.loads(record['declaring_chunk_ids'])
                node_dict['data']['calling_chunk_ids'] = json.loads(record['calling_chunk_ids'])
                node_dict['data']['aliases'] = json.loads(record['aliases'])

            graph_data['nodes'].append(node_dict)

        # Reconstruct edges
        for record in tqdm.tqdm(edges_data, desc="Reconstructing edges from HF dataset"):
            edge_dict = {
                'source': record['source'],
                'target': record['target'],
                'relation': record['relation'],
            }
            entities = json.loads(record['entities'])
            if entities:
                edge_dict['entities'] = entities

            graph_data['edges'].append(edge_dict)

        logger.info("Dataset reconstruction complete, building graph...")

        # Use from_dict to build the graph
        return cls.from_dict(
            graph_data,
            index_nodes=index_nodes,
            use_embed=use_embed,
            model_service_kwargs=model_service_kwargs,
            code_index_kwargs=code_index_kwargs
        )

    def get_neighbors(self, node_id):
        self.logger.debug(f"Getting neighbors for node: {node_id}")
        # Return all nodes that are directly connected to node_id (successors and predecessors) for any edge type
        neighbors = set()
        for n in self.graph.successors(node_id):
            neighbors.add(n)
        for n in self.graph.predecessors(node_id):
            neighbors.add(n)
        # Also include nodes connected by any edge (not just 'contains')
        for u, v in self.graph.edges(node_id):
            if u == node_id:
                neighbors.add(v)
            else:
                neighbors.add(u)
        for u, v in self.graph.in_edges(node_id):
            if v == node_id:
                neighbors.add(u)
            else:
                neighbors.add(v)
        return [self.graph.nodes[n]['data'] for n in neighbors if 'data' in self.graph.nodes[n]]

    def get_previous_chunk(self, node_id: str) -> ChunkNode:
        self.logger.debug(f"Getting previous chunk for node: {node_id}")
        node = self[node_id]
        # Check if node is of type ChunkNode
        if not isinstance(node, ChunkNode):
            raise Exception(f'Cannot get previous chunk on node of type {type(node)}')

        if node.order_in_file == 0:
            self.logger.warning(f'Cannot get previous chunk for first node')
            return None

        file_path = node.path
        previous_chunk_id = f'{file_path}_{node.order_in_file - 1}'

        if previous_chunk_id not in self.graph:
            raise Exception(f'Previous chunk {previous_chunk_id} not found in graph')

        previous_chunk = self[previous_chunk_id]
        return previous_chunk

    def get_next_chunk(self, node_id: str) -> ChunkNode:
        self.logger.debug(f"Getting next chunk for node: {node_id}")
        node = self[node_id]
        # Check if node is of type ChunkNode
        if not isinstance(node, ChunkNode):
            raise Exception(f'Cannot get previous chunk on node of type {type(node)}')

        file_path = node.path
        next_chunk_id = f'{file_path}_{node.order_in_file + 1}'

        if next_chunk_id not in self.graph:
            self.logger.warning(f'Next chunk {next_chunk_id} not found in graph, it might be the last chunk')
            return None
        previous_chunk = self[next_chunk_id]
        return previous_chunk

    def get_all_chunks(self) -> List[ChunkNode]:
        self.logger.debug("Getting all chunk nodes.")
        chunk_nodes = []
        for node in self:
            if isinstance(node, ChunkNode):
                chunk_nodes.append(node)
        return chunk_nodes

    def get_all_files(self) -> List[FileNode]:
        self.logger.debug("Getting all file nodes.")
        """
        Get all FileNodes in the knowledge graph.

        Returns:
            List[FileNode]: A list of FileNodes in the graph.
        """
        file_nodes = []
        for node in self.graph.nodes(data=True):
            node_data = node[1]['data']
            # Check for exact FileNode type, not ChunkNode (which inherits from FileNode)
            if isinstance(node_data, FileNode) and node_data.node_type == 'file':
                file_nodes.append(node_data)
        return file_nodes

    def get_chunks_of_file(self, file_node_id: str) -> List[ChunkNode]:
        self.logger.debug(f"Getting chunks for file node: {file_node_id}")
        """
        Get all ChunkNodes associated with a specific FileNode.

        Args:
            file_node (FileNode): The file node to get chunks for.

        Returns:
            List[ChunkNode]: A list of ChunkNodes associated with the file.
        """
        chunk_nodes = []
        for node in self.graph.neighbors(file_node_id):
            # Only include ChunkNodes that are connected by a 'contains' edge
            edge_data = self.graph.get_edge_data(file_node_id, node)
            node_data = self.graph.nodes[node]['data']
            if (
                    isinstance(node_data, ChunkNode)
                    and node_data.node_type == 'chunk'
                    and edge_data is not None
                    and edge_data.get('relation') == 'contains'
            ):
                chunk_nodes.append(node_data)
        return chunk_nodes

    def find_path(self, source_id: str, target_id: str, max_depth: int = 5) -> dict:
        """
        Find the shortest path between two nodes in the knowledge graph.

        Args:
            source_id (str): The ID of the source node.
            target_id (str): The ID of the target node.
            max_depth (int): Maximum depth to search for a path. Defaults to 5.

        Returns:
            dict: A dictionary containing path information or error message.
        """
        self.logger.debug(f"Finding path from {source_id} to {target_id} with max_depth={max_depth}")
        g = self.graph

        if source_id not in g:
            return {"error": f"Source node '{source_id}' not found."}
        if target_id not in g:
            return {"error": f"Target node '{target_id}' not found."}

        try:
            path = nx.shortest_path(g, source=source_id, target=target_id)

            if len(path) - 1 > max_depth:
                return {
                    "source_id": source_id,
                    "target_id": target_id,
                    "path": [],
                    "length": len(path) - 1,
                    "text": f"Path exists but exceeds max_depth of {max_depth} (actual length: {len(path) - 1})"
                }

            # Build detailed path information
            path_details = []
            for i, node_id in enumerate(path):
                node = g.nodes[node_id]['data']
                node_info = {
                    "node_id": node_id,
                    "name": getattr(node, 'name', 'Unknown'),
                    "type": getattr(node, 'node_type', 'Unknown'),
                    "step": i
                }

                # Add edge information for all but the last node
                if i < len(path) - 1:
                    next_node_id = path[i + 1]
                    edge_data = g.get_edge_data(node_id, next_node_id)
                    node_info["edge_to_next"] = edge_data.get('relation', 'Unknown') if edge_data else 'Unknown'

                path_details.append(node_info)

            # Format text output
            text = f"Path from '{source_id}' to '{target_id}' (length: {len(path) - 1}):\n\n"
            for i, node_info in enumerate(path_details):
                text += f"{i}. {node_info['name']} ({node_info['type']})\n"
                text += f"   Node ID: {node_info['node_id']}\n"
                if 'edge_to_next' in node_info:
                    text += f"   --[{node_info['edge_to_next']}]--> \n"

            return {
                "source_id": source_id,
                "target_id": target_id,
                "path": path_details,
                "length": len(path) - 1,
                "text": text
            }

        except nx.NetworkXNoPath:
            return {
                "source_id": source_id,
                "target_id": target_id,
                "path": [],
                "length": -1,
                "text": f"No path found between '{source_id}' and '{target_id}'"
            }
        except Exception as e:
            self.logger.error(f"Error finding path: {str(e)}")
            return {"error": f"Error finding path: {str(e)}"}

    def get_subgraph(self, node_id: str, depth: int = 2, edge_types: Optional[List[str]] = None) -> dict:
        """
        Extract a subgraph around a node up to a specified depth.

        Args:
            node_id (str): The ID of the central node.
            depth (int): The depth/radius of the subgraph to extract. Defaults to 2.
            edge_types (Optional[List[str]]): Optional list of edge types to include (e.g., ['calls', 'contains']).

        Returns:
            dict: A dictionary containing subgraph information or error message.
        """
        self.logger.debug(f"Getting subgraph for node {node_id} with depth={depth}, edge_types={edge_types}")
        g = self.graph

        if node_id not in g:
            return {"error": f"Node '{node_id}' not found."}

        # Collect nodes within specified depth
        nodes_at_depth = {node_id}
        all_nodes = {node_id}

        for d in range(depth):
            next_level = set()
            for n in nodes_at_depth:
                # Get all neighbors (both incoming and outgoing)
                for neighbor in g.successors(n):
                    if edge_types is None:
                        next_level.add(neighbor)
                    else:
                        edge_data = g.get_edge_data(n, neighbor)
                        if edge_data and edge_data.get('relation') in edge_types:
                            next_level.add(neighbor)

                for neighbor in g.predecessors(n):
                    if edge_types is None:
                        next_level.add(neighbor)
                    else:
                        edge_data = g.get_edge_data(neighbor, n)
                        if edge_data and edge_data.get('relation') in edge_types:
                            next_level.add(neighbor)

            nodes_at_depth = next_level - all_nodes
            all_nodes.update(next_level)

        # Extract subgraph
        subgraph = g.subgraph(all_nodes).copy()

        # Build node details
        nodes = []
        for n in subgraph.nodes():
            node = subgraph.nodes[n]['data']
            nodes.append({
                "node_id": n,
                "name": getattr(node, 'name', 'Unknown'),
                "type": getattr(node, 'node_type', 'Unknown')
            })

        # Build edge details
        edges = []
        for source, target, data in subgraph.edges(data=True):
            edges.append({
                "source": source,
                "target": target,
                "relation": data.get('relation', 'Unknown')
            })

        # Format text output
        text = f"Subgraph around '{node_id}' (depth: {depth}):\n"
        if edge_types:
            text += f"Edge types filter: {', '.join(edge_types)}\n"
        text += f"\nNodes: {len(nodes)}\n"
        text += f"Edges: {len(edges)}\n\n"

        # Group nodes by type
        nodes_by_type = {}
        for node in nodes:
            node_type = node['type']
            if node_type not in nodes_by_type:
                nodes_by_type[node_type] = []
            nodes_by_type[node_type].append(node)

        for node_type, type_nodes in nodes_by_type.items():
            text += f"{node_type} ({len(type_nodes)}):\n"
            for node in type_nodes[:5]:
                text += f"  - {node['name']} ({node['node_id']})\n"
            if len(type_nodes) > 5:
                text += f"  ... and {len(type_nodes) - 5} more\n"
            text += "\n"

        # Show edge statistics
        edge_by_relation = {}
        for edge in edges:
            relation = edge['relation']
            edge_by_relation[relation] = edge_by_relation.get(relation, 0) + 1

        if edge_by_relation:
            text += "Edge types:\n"
            for relation, count in edge_by_relation.items():
                text += f"  - {relation}: {count}\n"

        return {
            "center_node_id": node_id,
            "depth": depth,
            "edge_types_filter": edge_types,
            "node_count": len(nodes),
            "edge_count": len(edges),
            "nodes": nodes,
            "edges": edges,
            "nodes_by_type": nodes_by_type,
            "edge_by_relation": edge_by_relation,
            "text": text
        }