lailaelkoussy committed on
Commit
a100cc5
·
verified ·
1 Parent(s): b1ddffc

upload repoknowledgegraphlib

RepoKnowledgeGraphLib/CodeIndex.py ADDED
@@ -0,0 +1,571 @@
import logging
from tqdm import tqdm
import uuid
from typing import Literal
from abc import ABC, abstractmethod
import os
import numpy as np
import weaviate
from weaviate.classes.config import Configure, Property, DataType
from weaviate.classes.query import MetadataQuery

# LanceDB is optional: the import must live inside the try block, otherwise
# LANCEDB_AVAILABLE is always True and a missing package raises at import time.
try:
    import lancedb
    LANCEDB_AVAILABLE = True
except ImportError:
    LANCEDB_AVAILABLE = False

from .utils.logger_utils import setup_logger

LOGGER_NAME = 'CODE_INDEX_LOGGER'
STOP_AFTER_ATTEMPT = int(os.getenv("STOP_AFTER_ATTEMPT", 5))
WAIT_BETWEEN_RETRIES = int(os.getenv("WAIT_BETWEEN_RETRIES", 2))
MODEL_ID = os.getenv("MODEL_ID")
MAX_TOKENS = int(os.getenv('MAX_TOKENS', 2048))
TEMPERATURE = float(os.getenv('TEMPERATURE', 0.2))
TOP_P = float(os.getenv('TOP_P', 0.95))
FREQUENCY_PENALTY = 0
PRESENCE_PENALTY = 0
STOP = None
EMBEDDING_MODEL_URL = os.getenv('EMBEDDING_MODEL_URL')
EMBEDDING_MODEL_API_KEY = os.getenv('EMBEDDING_MODEL_API_KEY', "no_need")
EMBEDDING_NUMBER_DIMENSIONS = int(os.getenv('EMBEDDING_NUMBER_DIMENSIONS', 1024))

WEAVIATE_HOST = os.getenv('WEAVIATE_HOST', "localhost")
WEAVIATE_PORT = int(os.getenv('WEAVIATE_PORT', 8080))
WEAVIATE_GRPC_PORT = int(os.getenv('WEAVIATE_GRPC_PORT', 50051))
ALPHA_SEARCH_VALUE = float(os.getenv('ALPHA_SEARCH_VALUE', 0.8))
LANCEDB_PATH = os.getenv('LANCEDB_PATH', './local_code_index_db')


class BaseCodeIndex(ABC):
    """Abstract base class for code indexing implementations"""

    def __init__(self, nodes: list, model_service,
                 index_type: Literal['embedding-only', 'keyword-only', 'hybrid'] = 'hybrid',
                 embedding_batch_size: int = 64, use_embed: bool = True):
        setup_logger(LOGGER_NAME)
        self.logger = logging.getLogger(LOGGER_NAME)
        self.model_service = model_service
        self.index_type = index_type
        # Use a larger batch size by default for better throughput
        self.embedding_batch_size = int(os.getenv('EMBEDDING_BATCH_SIZE', embedding_batch_size))
        self.use_embed = use_embed
        self.logger.info(f"CodeIndex initialized with batch_size={self.embedding_batch_size}, index_type={index_type}")

    @abstractmethod
    def query(self, query: str, n_results: int = 10) -> dict:
        """Query the index and return results"""
        pass

    @abstractmethod
    def __del__(self):
        """Clean up resources"""
        pass


class WeaviateCodeIndex(BaseCodeIndex):
    """Weaviate-based code index implementation"""

    def __init__(self, nodes: list, model_service,
                 index_type: Literal['embedding-only', 'keyword-only', 'hybrid'] = 'hybrid',
                 embedding_batch_size: int = 20, use_embed: bool = True,
                 host: str = None, port: int = None, grpc_port: int = None):
        super().__init__(nodes, model_service, index_type, embedding_batch_size, use_embed)

        # Use provided parameters or fall back to environment variables
        weaviate_host = host or WEAVIATE_HOST
        weaviate_port = port or WEAVIATE_PORT
        weaviate_grpc_port = grpc_port or WEAVIATE_GRPC_PORT

        # Connect to Weaviate
        self.weaviate_client = weaviate.connect_to_local(
            host=weaviate_host,
            port=weaviate_port,
            grpc_port=weaviate_grpc_port
        )

        # Create a unique collection name
        self.collection_name = f"CodeChunks_{str(uuid.uuid4()).replace('-', '_')}"

        # Create the collection schema using the v4 API. Vectors are
        # self-provided via the modern vector_config option.
        self.collection = self.weaviate_client.collections.create(
            name=self.collection_name,
            properties=[
                Property(name="node_id", data_type=DataType.TEXT),
                Property(name="name", data_type=DataType.TEXT),
                Property(name="content", data_type=DataType.TEXT),
                Property(name="description", data_type=DataType.TEXT),
                Property(name="path", data_type=DataType.TEXT),
                Property(name="language", data_type=DataType.TEXT),
                Property(name="node_type", data_type=DataType.TEXT),
                Property(name="order_in_file", data_type=DataType.INT),
                Property(name="declared_entities", data_type=DataType.TEXT),
                Property(name="called_entities", data_type=DataType.TEXT),
            ],
            vector_config=Configure.Vectors.self_provided(),
        )

        chunk_nodes = [node for node in nodes if node.node_type == 'chunk']
        self.logger.info(f"Weaviate indexing {len(chunk_nodes)} chunk nodes with batch_size={self.embedding_batch_size}")

        # Pre-generate embeddings in batches for better performance
        if self.index_type != 'keyword-only':
            # Identify nodes that need embeddings (also handle ndarray embeddings,
            # matching the LanceDB path below)
            nodes_needing_embeddings = [
                node for node in chunk_nodes
                if node.embedding is None
                or (isinstance(node.embedding, (list, np.ndarray)) and len(node.embedding) == 0)
                or not use_embed
            ]

            if nodes_needing_embeddings:
                total_batches = (len(nodes_needing_embeddings) + self.embedding_batch_size - 1) // self.embedding_batch_size
                self.logger.info(f'Batch embedding {len(nodes_needing_embeddings)} nodes in {total_batches} batches')

                # Process in batches
                for i in tqdm(range(0, len(nodes_needing_embeddings), self.embedding_batch_size),
                              desc="Batch embedding nodes"):
                    batch_nodes = nodes_needing_embeddings[i:i + self.embedding_batch_size]
                    texts_to_embed = [node.get_field_to_embed() for node in batch_nodes]

                    # Batch embed all texts
                    embeddings = self.model_service.embed_chunk_code_batch(texts_to_embed)

                    # Assign embeddings back to nodes
                    for node, embedding in zip(batch_nodes, embeddings):
                        node.embedding = embedding

                    # Log progress every 10 batches
                    batch_num = i // self.embedding_batch_size + 1
                    if batch_num % 10 == 0:
                        self.logger.info(f"Completed batch {batch_num}/{total_batches}")

                self.logger.info(f"Embedding complete: processed {len(nodes_needing_embeddings)} nodes")
            else:
                self.logger.info(f"Using existing embeddings for all {len(chunk_nodes)} nodes")

        # Batch insert data into Weaviate
        with self.collection.batch.dynamic() as batch:
            for node in tqdm(chunk_nodes, desc="Indexing nodes"):
                self.logger.debug(f'Indexing node: {node.id}')

                # Use the pre-computed embedding
                embedding = None
                if self.index_type != 'keyword-only':
                    embedding = node.embedding

                # Prepare properties
                properties = {
                    "node_id": node.id,
                    "name": node.name,
                    "content": node.content,
                    "description": node.description or "",
                    "path": node.path,
                    "language": node.language,
                    "node_type": node.node_type,
                    "order_in_file": node.order_in_file,
                    "declared_entities": str(node.declared_entities),
                    "called_entities": str(node.called_entities),
                }

                # Add the object with or without a vector based on index_type
                if self.index_type == 'keyword-only':
                    # No vector needed for keyword-only search
                    batch.add_object(properties=properties)
                else:
                    # Add with a vector for embedding-only and hybrid modes
                    batch.add_object(
                        properties=properties,
                        vector=embedding
                    )

    def query(self, query: str, n_results: int = 10) -> dict:
        """
        Perform search based on index_type:
        - 'embedding-only': pure vector search
        - 'keyword-only': pure keyword search (BM25)
        - 'hybrid': hybrid search combining both (alpha controls weighting)

        Weaviate's hybrid search uses:
        - alpha=0: pure keyword search (BM25)
        - alpha=1: pure vector search
        - alpha=0.5-0.8: balanced hybrid search (recommended)
        """
        try:
            # Execute the search based on index_type
            if self.index_type == 'keyword-only':
                # Pure BM25 keyword search
                response = self.collection.query.bm25(
                    query=query,
                    limit=n_results,
                    return_metadata=MetadataQuery(score=True)
                )
            elif self.index_type == 'embedding-only':
                # Pure vector search
                embedding = self.model_service.embed_query(query)
                response = self.collection.query.near_vector(
                    near_vector=embedding,
                    limit=n_results,
                    return_metadata=MetadataQuery(distance=True)
                )
            else:  # 'hybrid'
                # Hybrid search combining keyword and vector
                embedding = self.model_service.embed_query(query)
                response = self.collection.query.hybrid(
                    query=query,
                    vector=embedding,
                    limit=n_results,
                    alpha=ALPHA_SEARCH_VALUE,
                    return_metadata=MetadataQuery(distance=True, score=True)
                )

            # Convert to a ChromaDB-like format for compatibility
            results = {
                'ids': [[]],
                'distances': [[]],
                'metadatas': [[]],
                'documents': [[]]
            }

            for obj in response.objects:
                results['ids'][0].append(obj.properties['node_id'])
                results['distances'][0].append(obj.metadata.distance if obj.metadata.distance is not None else 0.0)
                results['metadatas'][0].append({
                    'id': obj.properties['node_id'],
                    'name': obj.properties['name'],
                    'content': obj.properties['content'],
                    'description': obj.properties['description'],
                    'path': obj.properties['path'],
                    'language': obj.properties['language'],
                    'node_type': obj.properties['node_type'],
                    'order_in_file': str(obj.properties['order_in_file']),
                    'declared_entities': obj.properties['declared_entities'],
                    'called_entities': obj.properties['called_entities'],
                })
                results['documents'][0].append(obj.properties['content'])

            return results

        except Exception as e:
            self.logger.error(f'Failed to query: {e}', exc_info=True)
            raise e

    def __del__(self):
        """Clean up the Weaviate connection"""
        if hasattr(self, 'weaviate_client'):
            try:
                self.weaviate_client.close()
            except Exception:
                pass


class LanceDBCodeIndex(BaseCodeIndex):
    """LanceDB-based code index implementation"""

    def __init__(self, nodes: list, model_service,
                 index_type: Literal['embedding-only', 'keyword-only', 'hybrid'] = 'hybrid',
                 embedding_batch_size: int = 20, use_embed: bool = True, db_path: str = None):
        super().__init__(nodes, model_service, index_type, embedding_batch_size, use_embed)

        if not LANCEDB_AVAILABLE:
            raise ImportError("LanceDB is not available. Please install it with: pip install lancedb")

        # Embedded DB
        self.db_path = db_path or LANCEDB_PATH
        self.db = lancedb.connect(self.db_path)
        self.table_name = f"code_chunks_{uuid.uuid4().hex}"
        self.table = None

        chunk_nodes = [node for node in nodes if node.node_type == "chunk"]
        self.logger.info(f"LanceDB indexing {len(chunk_nodes)} chunk nodes with batch_size={self.embedding_batch_size}")

        # -----------------------------------------------------------
        # Create embeddings IF using vector search
        # -----------------------------------------------------------
        if self.index_type != "keyword-only":
            # Find nodes that need embeddings.
            # use_embed=True means we should USE existing embeddings if available;
            # use_embed=False means we should regenerate all embeddings.
            nodes_needing_embeddings = []
            for node in chunk_nodes:
                needs_embedding = False
                if not use_embed:
                    # Regenerate all embeddings
                    needs_embedding = True
                elif node.embedding is None:
                    needs_embedding = True
                elif isinstance(node.embedding, (list, np.ndarray)) and len(node.embedding) == 0:
                    needs_embedding = True

                if needs_embedding:
                    nodes_needing_embeddings.append(node)

            if nodes_needing_embeddings:
                total_batches = (len(nodes_needing_embeddings) + self.embedding_batch_size - 1) // self.embedding_batch_size
                self.logger.info(f"Embedding {len(nodes_needing_embeddings)} chunks in {total_batches} batches (batch_size={self.embedding_batch_size})...")

                for i in tqdm(range(0, len(nodes_needing_embeddings), self.embedding_batch_size),
                              desc="Batch embedding nodes"):
                    batch = nodes_needing_embeddings[i:i + self.embedding_batch_size]
                    texts = [n.get_field_to_embed() for n in batch]
                    batch_embeds = self.model_service.embed_chunk_code_batch(texts)

                    for n, emb in zip(batch, batch_embeds):
                        n.embedding = np.array(emb, dtype=np.float32)

                    # Log progress every 10 batches
                    batch_num = i // self.embedding_batch_size + 1
                    if batch_num % 10 == 0:
                        self.logger.info(f"Completed batch {batch_num}/{total_batches}")

                self.logger.info(f"Embedding complete: processed {len(nodes_needing_embeddings)} chunks")
            else:
                self.logger.info(f"Using existing embeddings for all {len(chunk_nodes)} chunks")

        # -----------------------------------------------------------
        # Prepare rows (only include the vector column when allowed)
        # -----------------------------------------------------------
        rows = []
        for node in chunk_nodes:
            row = {
                "node_id": node.id,
                "name": node.name,
                "content": node.content,
                "description": node.description or "",
                "path": node.path,
                "language": node.language,
                "node_type": node.node_type,
                "order_in_file": node.order_in_file,
                "declared_entities": str(node.declared_entities),
                "called_entities": str(node.called_entities),
            }

            # Add embeddings only for hybrid/embedding-only
            if self.index_type != "keyword-only":
                row["vector"] = node.embedding

            rows.append(row)

        # Create the table
        self.table = self.db.create_table(self.table_name, data=rows)
        self.logger.info(f"Created LanceDB table: {self.table_name}")

        # Create a full-text search index for keyword and hybrid search.
        # LanceDB requires an INVERTED index for full-text search.
        self._create_fts_indexes()

    def _create_fts_indexes(self):
        """
        Create full-text search indexes on text columns.

        LanceDB 0.25.x uses create_fts_index() with use_tantivy=True to support
        multiple columns. Requires the tantivy package: pip install tantivy
        """
        fts_columns = ["content", "name", "description"]

        try:
            # use_tantivy=True is required to pass multiple field names as a list
            self.table.create_fts_index(fts_columns, replace=True, use_tantivy=True)
            self.logger.info(f"Created FTS index (Tantivy) on columns: {fts_columns}")
        except Exception as e:
            self.logger.warning(f"Failed to create FTS index: {e}")
            self.logger.warning(
                "Full-text search will fall back to scanning. "
                "Ensure tantivy is installed: pip install tantivy"
            )

    def query(self, query: str, n_results: int = 10) -> dict:
        """
        Perform search based on index_type:
        - 'embedding-only': pure vector search
        - 'keyword-only': full-text search using LanceDB's native FTS
        - 'hybrid': combines vector similarity and full-text search with reranking
        """
        try:
            # ---------------------- KEYWORD ONLY ----------------------
            if self.index_type == "keyword-only":
                # Use LanceDB full-text search (requires an FTS index on the table)
                try:
                    # Try full-text search first
                    df = self.table.search(query, query_type="fts").limit(n_results).to_pandas()
                except Exception as fts_error:
                    self.logger.warning(f"FTS search failed, falling back to scan: {fts_error}")
                    # Fallback: scan all rows and filter in Python
                    all_df = self.table.to_pandas()
                    query_lower = query.lower()
                    # Split the query into words for more flexible matching
                    query_words = query_lower.split()

                    def matches_query(row):
                        text = f"{row.get('content', '')} {row.get('name', '')} {row.get('description', '')}".lower()
                        # Match if any query word is found
                        return any(word in text for word in query_words)

                    mask = all_df.apply(matches_query, axis=1)
                    df = all_df[mask].head(n_results)
                    # Add a dummy distance column
                    df = df.copy()
                    df['_distance'] = 0.0

            # ---------------------- VECTOR ONLY -----------------------
            elif self.index_type == "embedding-only":
                emb = np.array(self.model_service.embed_query(query), dtype=np.float32)
                df = self.table.search(
                    emb,
                    vector_column_name="vector"
                ).limit(n_results).to_pandas()

            # ---------------------- HYBRID ----------------------------
            else:
                # For hybrid search, run a vector search and optionally boost results
                # that also match keywords. This is more flexible than requiring both.
                emb = np.array(self.model_service.embed_query(query), dtype=np.float32)

                # Fetch more results from the vector search to allow for reranking
                vector_limit = min(n_results * 3, 100)  # 3x results for reranking
                df = self.table.search(
                    emb,
                    vector_column_name="vector"
                ).limit(vector_limit).to_pandas()

                if not df.empty:
                    # Rerank results based on keyword matches
                    query_lower = query.lower()
                    query_words = query_lower.split()

                    def compute_keyword_score(row):
                        """Compute a keyword match score (higher is better)"""
                        text = f"{row.get('content', '')} {row.get('name', '')} {row.get('description', '')}".lower()
                        score = 0
                        # An exact phrase match gets the highest score
                        if query_lower in text:
                            score += 10
                        # Word matches
                        for word in query_words:
                            if word in text:
                                score += 1
                                # Bonus for a word in the name (more relevant)
                                if word in str(row.get('name', '')).lower():
                                    score += 2
                        return score

                    # Add keyword scores
                    df = df.copy()
                    df['_keyword_score'] = df.apply(compute_keyword_score, axis=1)

                    # Normalize distance to a similarity score (lower distance = higher similarity)
                    max_dist = df['_distance'].max() if df['_distance'].max() > 0 else 1.0
                    df['_vector_score'] = 1.0 - (df['_distance'] / max_dist)

                    # Combined score: weighted sum of vector similarity and keyword score.
                    # Alpha controls the balance (higher alpha = more weight on vector search).
                    alpha = 0.7  # 70% vector, 30% keyword
                    max_keyword = df['_keyword_score'].max() if df['_keyword_score'].max() > 0 else 1.0
                    df['_combined_score'] = (
                        alpha * df['_vector_score'] +
                        (1 - alpha) * (df['_keyword_score'] / max_keyword)
                    )

                    # Sort by combined score (descending) and take the top n_results
                    df = df.sort_values('_combined_score', ascending=False).head(n_results)

            # Build the result format (ChromaDB-like, for compatibility)
            results = {
                "ids": [[]],
                "distances": [[]],
                "metadatas": [[]],
                "documents": [[]],
            }

            for _, row in df.iterrows():
                results["ids"][0].append(row["node_id"])
                results["documents"][0].append(row["content"])
                results["distances"][0].append(float(row.get("_distance", 0)))

                results["metadatas"][0].append({
                    "id": row["node_id"],
                    "name": row["name"],
                    "content": row["content"],
                    "description": row["description"],
                    "path": row["path"],
                    "language": row["language"],
                    "node_type": row["node_type"],
                    "order_in_file": str(row["order_in_file"]),
                    "declared_entities": row["declared_entities"],
                    "called_entities": row["called_entities"],
                })

            return results

        except Exception as e:
            self.logger.error(f"Query failed: {e}", exc_info=True)
            raise

    def __del__(self):
        """Clean up resources (the embedded DB needs no explicit teardown)"""
        pass


# Factory function to create the appropriate CodeIndex
def CodeIndex(
    nodes: list,
    model_service,
    index_type: Literal['embedding-only', 'keyword-only', 'hybrid'] = 'hybrid',
    embedding_batch_size: int = 20,
    use_embed: bool = True,
    backend: Literal['weaviate', 'lancedb'] = 'weaviate',
    db_path: str = None,
    host: str = None,
    port: int = None,
    grpc_port: int = None
) -> BaseCodeIndex:
    """
    Factory function to create a CodeIndex instance.

    Args:
        nodes: List of nodes to index
        model_service: Service for embedding generation
        index_type: Type of search ('embedding-only', 'keyword-only', or 'hybrid')
        embedding_batch_size: Batch size for embedding generation
        use_embed: Whether to use pre-computed embeddings
        backend: Which backend to use ('weaviate' or 'lancedb')
        db_path: Path for LanceDB (only used with the 'lancedb' backend)
        host: Weaviate host (only used with the 'weaviate' backend)
        port: Weaviate port (only used with the 'weaviate' backend)
        grpc_port: Weaviate gRPC port (only used with the 'weaviate' backend)

    Returns:
        BaseCodeIndex: Either a WeaviateCodeIndex or LanceDBCodeIndex instance
    """
    if backend == 'lancedb':
        return LanceDBCodeIndex(
            nodes=nodes,
            model_service=model_service,
            index_type=index_type,
            embedding_batch_size=embedding_batch_size,
            use_embed=use_embed,
            db_path=db_path
        )
    # Weaviate is the default backend; the explicit 'weaviate' branch and the
    # fallback branch were identical, so they are collapsed into one.
    return WeaviateCodeIndex(
        nodes=nodes,
        model_service=model_service,
        index_type=index_type,
        embedding_batch_size=embedding_batch_size,
        use_embed=use_embed,
        host=host,
        port=port,
        grpc_port=grpc_port
    )
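For orientation, here is a minimal usage sketch of the `CodeIndex` factory above. The `chunk_nodes` and `model_service` objects are stand-ins, not part of this file: the sketch assumes node objects exposing the attributes the indexers read (`node_type`, `id`, `name`, `content`, `embedding`, `get_field_to_embed()`, etc.) and a model service exposing `embed_chunk_code_batch()` and `embed_query()`, which is all this module actually calls.

```python
# Hypothetical usage sketch; chunk_nodes and model_service are assumed to
# satisfy the interfaces CodeIndex.py calls into (see lead-in above).
from RepoKnowledgeGraphLib.CodeIndex import CodeIndex

index = CodeIndex(
    nodes=chunk_nodes,            # objects with .node_type == 'chunk', .id, .content, ...
    model_service=model_service,  # provides embed_chunk_code_batch() / embed_query()
    index_type='hybrid',
    backend='lancedb',            # embedded backend; no Weaviate server required
    db_path='./local_code_index_db',
)

results = index.query("parse a python file into chunks", n_results=5)
for node_id, doc in zip(results['ids'][0], results['documents'][0]):
    print(node_id, doc[:80])
```

The results dict mirrors ChromaDB's shape (`ids`, `distances`, `metadatas`, `documents`, each wrapped in an outer list), so either backend can be swapped in behind the same caller.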
RepoKnowledgeGraphLib/CodeParser.py ADDED
@@ -0,0 +1,70 @@
import logging
import os
from dotenv import load_dotenv
from langchain_text_splitters import (
    Language,
    RecursiveCharacterTextSplitter,
)

from .utils.logger_utils import setup_logger

load_dotenv()


LOGGER_NAME = 'CODE_PARSER_LOGGER'
CODE_CHUNK_OVERLAP = int(os.getenv('CODE_CHUNK_OVERLAP', 0))
CODE_CHUNK_SIZE = int(os.getenv('CODE_CHUNK_SIZE', 2000))


class CodeParser:
    def __init__(self):
        setup_logger(LOGGER_NAME)
        self.logger = logging.getLogger(LOGGER_NAME)

        self.extension_mapping = {
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'go': Language.GO,
            'java': Language.JAVA,
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'js': Language.JS,
            'mjs': Language.JS,
            'cjs': Language.JS,
            'md': Language.MARKDOWN,
            'markdown': Language.MARKDOWN,
            'html': Language.HTML,
        }

    def parse(self, file_name: str, file_content: str) -> list:
        file_extension = file_name.split('.')[-1]
        # Default so a failed parse returns an empty list instead of raising
        # a NameError on the final return (docs was previously unbound there).
        docs = []

        try:
            self.logger.debug(f'Parsing file: {file_name}')
            if file_extension not in self.extension_mapping:
                self.logger.debug(f'File extension not supported: {file_extension}')
                text_splitter = RecursiveCharacterTextSplitter(
                    chunk_size=CODE_CHUNK_SIZE,
                    chunk_overlap=CODE_CHUNK_OVERLAP,
                    length_function=len,
                    is_separator_regex=False,
                )
                docs = text_splitter.create_documents([file_content])
            else:
                self.logger.debug(f'File extension supported: {file_extension}')
                code_splitter = RecursiveCharacterTextSplitter.from_language(
                    language=self.extension_mapping[file_extension],
                    chunk_size=CODE_CHUNK_SIZE,
                    chunk_overlap=CODE_CHUNK_OVERLAP,
                )
                docs = code_splitter.create_documents([file_content])
        except Exception as e:
            self.logger.error(f'Error when parsing code: {e}')
        return [doc.page_content for doc in docs]
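A short sketch of how `CodeParser` is meant to be called; the file name and source string here are illustrative only.

```python
from RepoKnowledgeGraphLib.CodeParser import CodeParser

parser = CodeParser()
source = "def add(a, b):\n    return a + b\n"

# 'py' is in extension_mapping, so the language-aware splitter is used;
# an unknown extension would fall back to the plain character splitter.
chunks = parser.parse("example.py", source)
print(len(chunks), repr(chunks[0][:40]))
```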
RepoKnowledgeGraphLib/Entity.py ADDED
@@ -0,0 +1,188 @@
from typing import Optional, List
from dataclasses import dataclass, field, asdict

# Helper for dynamic class lookup
ENTITY_TYPE_MAP = {}


def register_entity(cls):
    ENTITY_TYPE_MAP[cls.__name__] = cls
    return cls


def _entity_to_dict(obj):
    if isinstance(obj, list):
        return [_entity_to_dict(item) for item in obj]
    elif isinstance(obj, dict):
        return {(_entity_to_dict(k) if isinstance(k, Entity) else k): _entity_to_dict(v) for k, v in obj.items()}
    elif isinstance(obj, Entity):
        return obj.to_dict()
    elif hasattr(obj, 'to_dict'):
        return obj.to_dict()
    else:
        return obj


def _entity_from_dict(data):
    if isinstance(data, list):
        return [_entity_from_dict(item) for item in data]
    elif isinstance(data, dict) and 'entity_type' in data:
        # Prefer the serialized class name. A naive capitalize() breaks
        # multi-word types: 'function_call'.capitalize() is 'Function_call',
        # which never matches the registered 'FunctionCall'.
        class_name = data.get('__class__')
        if class_name is None:
            class_name = ''.join(part.capitalize() for part in data['entity_type'].split('_'))
        cls = ENTITY_TYPE_MAP.get(class_name, Entity)
        return cls.from_dict(data)
    else:
        return data


@register_entity
@dataclass
class Entity:
    entity_type: str
    entity_name: str
    defined_chunk_id: str
    entity_dtype: str

    def to_dict(self):
        d = asdict(self)
        d['entity_type'] = self.entity_type
        d['__class__'] = self.__class__.__name__
        return d

    @classmethod
    def from_dict(cls, data):
        # Remove __class__ if present
        data = dict(data)
        data.pop('__class__', None)
        return cls(**data)


@register_entity
@dataclass
class Variable(Entity):
    entity_type = 'variable'

    def to_dict(self):
        d = super().to_dict()
        d['entity_type'] = self.entity_type
        return d

    @classmethod
    def from_dict(cls, data):
        return super().from_dict(data)


@register_entity
@dataclass
class Parameter(Entity):
    entity_type = 'parameter'
    # entity_dtype is inherited from Entity; the duplicate annotation was removed

    def to_dict(self):
        d = super().to_dict()
        d['entity_type'] = self.entity_type
        return d

    @classmethod
    def from_dict(cls, data):
        return super().from_dict(data)


@register_entity
@dataclass
class Method(Entity):
    entity_type = 'method'
    parameters: List['Parameter'] = field(default_factory=list)
    associated_class: Optional['Class'] = None

    def to_dict(self):
        d = super().to_dict()
        d['parameters'] = _entity_to_dict(self.parameters)
        d['associated_class'] = self.associated_class.to_dict() if self.associated_class else None
        d['entity_type'] = self.entity_type
        return d

    @classmethod
    def from_dict(cls, data):
        params = [_entity_from_dict(p) for p in data.get('parameters', [])]
        assoc_cls = Class.from_dict(data['associated_class']) if data.get('associated_class') else None
        # '__class__' is serialization metadata, not a constructor argument
        base = {k: v for k, v in data.items()
                if k not in ['parameters', 'parameters_pairs', 'associated_class', '__class__']}
        return cls(parameters=params, associated_class=assoc_cls, **base)


@register_entity
@dataclass
class Class(Entity):
    entity_type = 'class'
    defined_methods: List['Method'] = field(default_factory=list)

    def to_dict(self):
        d = super().to_dict()
        d['defined_methods'] = _entity_to_dict(self.defined_methods)
        d['entity_type'] = self.entity_type
        return d

    @classmethod
    def from_dict(cls, data):
        methods = [_entity_from_dict(m) for m in data.get('defined_methods', [])]
        base = {k: v for k, v in data.items() if k not in ['defined_methods', '__class__']}
        return cls(defined_methods=methods, **base)


@register_entity
@dataclass
class Function(Entity):
    entity_type = 'function'
    parameters: List[Parameter] = field(default_factory=list)
    parameters_pairs: List[tuple] = field(default_factory=list)  # List of (Parameter, Variable)

    def to_dict(self):
        d = super().to_dict()
        d['parameters'] = _entity_to_dict(self.parameters)
        d['parameters_pairs'] = [(p.to_dict(), v.to_dict()) for p, v in self.parameters_pairs]
        d['entity_type'] = self.entity_type
        return d

    @classmethod
    def from_dict(cls, data):
        params = [_entity_from_dict(p) for p in data.get('parameters', [])]
        parameters_pairs = [(Parameter.from_dict(p), Variable.from_dict(v)) for p, v in data.get('parameters_pairs', [])]
        base = {k: v for k, v in data.items() if k not in ['parameters', 'parameters_pairs', '__class__']}
        return cls(parameters=params, parameters_pairs=parameters_pairs, **base)


@register_entity
@dataclass
class FunctionCall(Entity):
    entity_type: str = 'function_call'
    entity_name: str = ''
    defined_chunk_id: str = ''
    entity_dtype: str = ''
    arguments: List[tuple] = field(default_factory=list)  # List of (Parameter, Variable)
    # Fixed: the default is a list, so the type is List[Function], not Optional[Function]
    associated_functions: List[Function] = field(default_factory=list)

    def to_dict(self):
        d = super().to_dict()
        d['arguments'] = [(p.to_dict(), v.to_dict()) for p, v in self.arguments]
        d['entity_type'] = self.entity_type
        return d

    @classmethod
    def from_dict(cls, data):
        arguments = [(Parameter.from_dict(p), Variable.from_dict(v)) for p, v in data.get('arguments', [])]
        base = {k: v for k, v in data.items() if k not in ['arguments', '__class__']}
        return cls(arguments=arguments, **base)


@register_entity
@dataclass
class MethodCall(Entity):
    entity_type: str = 'method_call'
    entity_name: str = ''
    defined_chunk_id: str = ''
    entity_dtype: str = ''
    arguments: List[tuple] = field(default_factory=list)  # List of (Parameter, Variable)
    associated_class: Optional[Class] = None
    associated_method: Optional[Method] = None

    def to_dict(self):
        d = super().to_dict()
        d['arguments'] = [(p.to_dict(), v.to_dict()) for p, v in self.arguments]
        d['associated_class'] = self.associated_class.to_dict() if self.associated_class else None
        d['entity_type'] = self.entity_type
        return d

    @classmethod
    def from_dict(cls, data):
        arguments = [(Parameter.from_dict(p), Variable.from_dict(v)) for p, v in data.get('arguments', [])]
        assoc_cls = Class.from_dict(data['associated_class']) if data.get('associated_class') else None
        base = {k: v for k, v in data.items() if k not in ['arguments', 'associated_class', '__class__']}
        return cls(arguments=arguments, associated_class=assoc_cls, **base)
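The dataclasses above are designed to round-trip through `to_dict`/`from_dict`; a minimal sketch (it relies on the `__class__` handling in `from_dict` shown above):

```python
from RepoKnowledgeGraphLib.Entity import Class, Method, Parameter

method = Method(
    entity_type='method',
    entity_name='run',
    defined_chunk_id='chunk_3',
    entity_dtype='None',
    parameters=[Parameter(entity_type='parameter', entity_name='x',
                          defined_chunk_id='chunk_3', entity_dtype='int')],
)
cls_entity = Class(entity_type='class', entity_name='Runner',
                   defined_chunk_id='chunk_3', entity_dtype='',
                   defined_methods=[method])

data = cls_entity.to_dict()       # plain, JSON-serializable dict
restored = Class.from_dict(data)  # rebuilds nested Method/Parameter objects
assert restored.defined_methods[0].entity_name == 'run'
```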
RepoKnowledgeGraphLib/EntityChunkMapper.py ADDED
@@ -0,0 +1,517 @@
import logging
import re
from typing import List, Tuple, Dict, Any, Set, Optional
from enum import Enum


class Language(Enum):
    """Supported programming languages"""
    PYTHON = "python"
    C = "c"
    CPP = "cpp"
    JAVA = "java"


class EntityChunkMapper:
    """Maps entities from file-level extraction back to their respective chunks"""

    def __init__(self):
        self.logger = logging.getLogger("ENTITY_CHUNK_MAPPER")
        self.extension_to_language = {
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'java': Language.JAVA,
        }

    def _detect_language(self, file_name: Optional[str] = None) -> Language:
        """
        Detect the programming language from the file extension

        Args:
            file_name: Name of the file (optional)

        Returns:
            Language enum value, defaults to PYTHON if not detected
        """
        if file_name:
            extension = file_name.split('.')[-1].lower()
            return self.extension_to_language.get(extension, Language.PYTHON)
        return Language.PYTHON

    def _is_comment_or_docstring(self, line: str, in_docstring: bool, language: Language) -> Tuple[bool, bool]:
        """
        Check if a line is a comment or part of a docstring/multi-line comment

        Args:
            line: The line to check
            in_docstring: Whether we're currently inside a docstring/multi-line comment
            language: The programming language

        Returns:
            Tuple of (is_comment_or_docstring, new_in_docstring_state)
        """
        stripped = line.strip()

        if language == Language.PYTHON:
            # Check for single-line comments
            if stripped.startswith('#'):
                return True, in_docstring

            # Check for docstring delimiters (triple double or single quotes)
            triple_double = '"""'
            triple_single = "'''"

            # Count occurrences of triple quotes
            if triple_double in stripped or triple_single in stripped:
                # Check if it's a single-line docstring
                if (stripped.count(triple_double) >= 2 or
                        stripped.count(triple_single) >= 2):
                    # Single-line docstring
                    return True, in_docstring
                else:
                    # Toggle the docstring state
                    return True, not in_docstring

            # If we're in a docstring, this line is part of it
            if in_docstring:
                return True, in_docstring

        elif language in [Language.C, Language.CPP, Language.JAVA]:
            # Check for single-line comments
            if stripped.startswith('//'):
                return True, in_docstring

            # Check for multi-line comment delimiters /* */
            if '/*' in line and '*/' in line:
                # A multi-line comment opened and closed on one line
                return True, in_docstring
            elif '/*' in line:
                # Start of a multi-line comment
                return True, True
            elif '*/' in line:
                # End of a multi-line comment
                return True, False

            # If we're inside a multi-line comment
            if in_docstring:
                return True, in_docstring

        return False, in_docstring

    def _get_code_lines(self, chunk_lines: List[str], language: Language) -> List[str]:
        """
        Filter out comments and docstrings from chunk lines

        Args:
            chunk_lines: List of lines in the chunk
            language: The programming language

        Returns:
            List of lines that are actual code (not comments or docstrings)
        """
        code_lines = []
        in_docstring = False

        for line in chunk_lines:
            is_doc, in_docstring = self._is_comment_or_docstring(line, in_docstring, language)
            if not is_doc:
                code_lines.append(line)

        return code_lines

    def _is_valid_identifier_match(self, text: str, identifier: str, position: int) -> bool:
        """
        Check if an identifier match at a position is valid (not part of another word)

        Args:
            text: The text containing the identifier
            identifier: The identifier to check
            position: The position where the identifier was found

        Returns:
            True if this is a valid standalone identifier match
        """
        # Check the character before (if it exists)
        if position > 0:
            char_before = text[position - 1]
            if char_before.isalnum() or char_before == '_':
                return False

        # Check the character after (if it exists)
        end_pos = position + len(identifier)
        if end_pos < len(text):
            char_after = text[end_pos]
            if char_after.isalnum() or char_after == '_':
                return False

        return True

    def _contains_identifier(self, line: str, identifier: str) -> bool:
        """
        Check if a line contains an identifier as a standalone word (not part of another word)

        Args:
            line: The line to check
            identifier: The identifier to find

        Returns:
            True if the identifier appears as a standalone word
        """
        # Use a word-boundary regex for precise matching
        pattern = r'\b' + re.escape(identifier) + r'\b'
        return bool(re.search(pattern, line))

    def find_entity_in_chunks(self, entity_name: str, chunks: List[str], entity_type: str = None,
                              file_name: Optional[str] = None) -> Set[int]:
        """
        Find which chunks contain a specific entity declaration or call

        Args:
            entity_name: Name of the entity to find
            chunks: List of code chunks
            entity_type: Type of entity (class, function, method, variable)
            file_name: Name of the file to detect language (optional)

        Returns:
            Set of chunk indices that contain this entity
        """
        matching_chunks = set()
        language = self._detect_language(file_name)

        # Split the entity name to handle nested entities like "ClassName.method".
        # For Java/C++, also handle the :: separator.
        if '::' in entity_name:
            parts = entity_name.split('::')
        else:
            parts = entity_name.split('.')
        base_name = parts[-1]  # The actual identifier

        for chunk_idx, chunk in enumerate(chunks):
            chunk_lines = chunk.strip().split('\n')

            # Look for different patterns based on entity type
            if self._entity_appears_in_chunk(entity_name, base_name, chunk, chunk_lines, entity_type, language):
                matching_chunks.add(chunk_idx)

        return matching_chunks

    def _entity_appears_in_chunk(self, full_name: str, base_name: str, chunk: str, chunk_lines: List[str],
                                 entity_type: str, language: Language) -> bool:
        """Check if an entity appears in a specific chunk (excluding comments and docstrings)"""

        # Filter out comments and docstrings
        code_lines = self._get_code_lines(chunk_lines, language)

        # If no code lines remain, the entity doesn't appear in actual code
        if not code_lines:
            return False

        # Language-specific entity matching
        if language == Language.PYTHON:
            return self._entity_appears_in_python(full_name, base_name, code_lines, entity_type)
        elif language in [Language.C, Language.CPP]:
            return self._entity_appears_in_c_cpp(full_name, base_name, code_lines, entity_type)
        elif language == Language.JAVA:
            return self._entity_appears_in_java(full_name, base_name, code_lines, entity_type)

        return False

    def _entity_appears_in_python(self, full_name: str, base_name: str, code_lines: List[str],
                                  entity_type: str) -> bool:
        """Check if the entity appears in Python code"""

        if entity_type == "class":
            # Look for a class definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'class\s+{re.escape(base_name)}[\s:(]', stripped):
                    return True

        elif entity_type == "api_endpoint":
            # Look for an API endpoint definition: the function decorated with
            # @app.get, @app.post, etc. We look for the function definition itself.
            for line in code_lines:
                stripped = line.strip()
                # Match the function definition with the endpoint name
                if re.match(rf'(async\s+)?def\s+{re.escape(base_name)}\s*\(', stripped):
                    return True
                # Also check for decorators that might reference the endpoint
                if re.search(rf'@\w+\.(get|post|put|delete|patch|options|head)\s*\(', stripped):
                    return True

        elif entity_type == "function":
            # Look for a function definition (not a method)
            for line in code_lines:
                stripped = line.strip()
                # Check it's not indented (not a method)
                if not line.startswith(" ") and not line.startswith("\t"):
                    if re.match(rf'(async\s+)?def\s+{re.escape(base_name)}\s*\(', stripped):
                        return True

        elif entity_type == "method":
            # Look for a method definition (indented def)
            method_name = full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                # Check it's indented (is a method)
                if line.startswith(" ") or line.startswith("\t"):
                    if re.match(rf'(async\s+)?def\s+{re.escape(method_name)}\s*\(', stripped):
                        return True

        elif entity_type == "variable":
            # Look for a variable assignment or usage
            if "." in full_name:
                parts = full_name.split('.')
                attr_name = parts[-1]
                for line in code_lines:
                    if re.search(rf'\.\s*{re.escape(attr_name)}\b', line):
                        return True
            else:
                for line in code_lines:
                    stripped = line.strip()
                    if re.match(rf'{re.escape(base_name)}\s*[=:]', stripped):
                        return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        # General usage as an identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    return True

        return False

    def _extract_using_namespace_directives(self, code_lines: List[str]) -> List[str]:
        """
        Extract using namespace directives from C++ code.
        Returns a list of namespace names that are being imported.
        """
        namespaces = []
        for line in code_lines:
            stripped = line.strip()
            # Match "using namespace <name>;"
            match = re.match(r'using\s+namespace\s+([a-zA-Z_][a-zA-Z0-9_:]*)\s*;', stripped)
            if match:
                namespaces.append(match.group(1))
        return namespaces

    def _entity_appears_in_c_cpp(self, full_name: str, base_name: str, code_lines: List[str],
                                 entity_type: str) -> bool:
        """Check if the entity appears in C/C++ code"""

        # Extract using namespace directives
        using_namespaces = self._extract_using_namespace_directives(code_lines)

        # Check if the full_name matches any imported namespace + base_name.
        # E.g., if full_name is "math::Calculator" and we have "using namespace math",
        # then "Calculator" in the code should match.
        namespace_match = False
        if '::' in full_name:
            for ns in using_namespaces:
                # Check if full_name starts with this namespace
                if full_name.startswith(ns + '::'):
                    namespace_match = True
                    break

        if entity_type == "class":
            # Look for a class/struct definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'(class|struct)\s+{re.escape(base_name)}[\s:{{]', stripped):
                    return True

        elif entity_type == "function":
            # Look for a function definition or declaration
            for line in code_lines:
                stripped = line.strip()
                # Match function patterns: return_type function_name(
                # Also handle constructors and destructors
                if (re.search(rf'\b{re.escape(base_name)}\s*\(', stripped) and
                        not stripped.startswith('//')):
                    # Additional check: likely a function if followed by parameters
                    return True

        elif entity_type == "method":
            # Look for a method definition (with class scope)
            method_name = full_name.split('::')[-1] if '::' in full_name else full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                # Match ClassName::methodName( or just methodName( inside a class
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True

        elif entity_type == "variable":
            # Look for a variable declaration or usage
            for line in code_lines:
                stripped = line.strip()
                # Match variable declarations and assignments
                if re.search(rf'\b{re.escape(base_name)}\b', stripped):
                    return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            # Look for instantiation or usage
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\b', line):
                    # If we found base_name and there's a namespace match, this is a match
                    if namespace_match:
                        return True
                    # If full_name doesn't have a namespace, it's a direct match
                    if '::' not in full_name:
                        return True

        # General usage as an identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    # If we found base_name and there's a namespace match, this is a match
                    if namespace_match:
                        return True
                    # If full_name doesn't have a namespace, it's a direct match
                    if '::' not in full_name:
                        return True

        return False

    def _entity_appears_in_java(self, full_name: str, base_name: str, code_lines: List[str],
                                entity_type: str) -> bool:
        """Check if the entity appears in Java code"""

        if entity_type == "class":
            # Look for a class/interface/enum definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'(public|private|protected)?\s*(class|interface|enum)\s+{re.escape(base_name)}[\s<{{]', stripped):
                    return True
                # Without a modifier
                if re.match(rf'(class|interface|enum)\s+{re.escape(base_name)}[\s<{{]', stripped):
                    return True

        elif entity_type == "api_endpoint":
            # Look for an API endpoint definition: the method with Spring annotations.
            # Extract just the method name from the fully qualified name
            # (e.g., "com.example.Controller::method" -> "method").
            method_name = base_name.split('::')[-1] if '::' in base_name else base_name
            for line in code_lines:
                stripped = line.strip()
                # Match the method definition
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True
                # Also check for Spring annotations
                if re.search(r'@(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)', stripped):
                    return True

        elif entity_type == "function":
            # In Java, functions are methods
            for line in code_lines:
                stripped = line.strip()
                # Match method signature patterns
                if re.search(rf'\b{re.escape(base_name)}\s*\(', stripped):
                    return True

        elif entity_type == "method":
            # Look for a method definition
            method_name = full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True

        elif entity_type == "variable":
            # Look for a variable declaration or usage
            for line in code_lines:
                stripped = line.strip()
                if re.search(rf'\b{re.escape(base_name)}\b', stripped):
                    return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            # Look for instantiation (new ClassName) or usage
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\b', line):
                    return True

        # General usage as an identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    return True

        return False

    def map_entities_to_chunks(self, declared_entities: List[Dict[str, Any]],
                               called_entities: List[str],
                               chunks: List[str],
                               file_name: Optional[str] = None) -> Tuple[Dict[int, List[Dict[str, Any]]],
                                                                         Dict[int, List[str]]]:
        """
        Map file-level entities back to their respective chunks

        Args:
            declared_entities: List of declared entities from file-level extraction
            called_entities: List of called entities from file-level extraction
            chunks: List of code chunks
            file_name: Name of the file to detect language (optional)

        Returns:
            Tuple of (chunk_declared_entities, chunk_called_entities)
            - chunk_declared_entities: Dict mapping chunk_index -> list of declared entities
            - chunk_called_entities: Dict mapping chunk_index -> list of called entities
        """
        chunk_declared = {}
        chunk_called = {}

        # Initialize empty lists for all chunks
        for i in range(len(chunks)):
            chunk_declared[i] = []
            chunk_called[i] = []

        # Map declared entities to chunks
        for entity in declared_entities:
            entity_name = entity.get("name", "")
            entity_type = entity.get("type", "")

            matching_chunks = self.find_entity_in_chunks(entity_name, chunks, entity_type, file_name)

            # Add the entity to matching chunks
            for chunk_idx in matching_chunks:
                chunk_declared[chunk_idx].append(entity)

        # Map called entities to chunks
        for called_entity in called_entities:
            matching_chunks = self.find_entity_in_chunks(called_entity, chunks, None, file_name)

            # Add the called entity to matching chunks
            for chunk_idx in matching_chunks:
                if called_entity not in chunk_called[chunk_idx]:
                    chunk_called[chunk_idx].append(called_entity)

        return chunk_declared, chunk_called
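A minimal sketch of `map_entities_to_chunks` in action; the chunks and entity dicts below are made-up inputs shaped like the file-level extraction output this class expects.

```python
from RepoKnowledgeGraphLib.EntityChunkMapper import EntityChunkMapper

mapper = EntityChunkMapper()
chunks = [
    "def load(path):\n    return open(path).read()\n",
    "class Loader:\n    def run(self):\n        return load('x.txt')\n",
]
declared = [{"name": "load", "type": "function"},
            {"name": "Loader", "type": "class"}]
called = ["load"]

by_chunk_declared, by_chunk_called = mapper.map_entities_to_chunks(
    declared, called, chunks, file_name="loader.py")

# Matching is heuristic and deliberately permissive: call-site patterns count
# too, so 'load' shows up for both chunks, not just where it is defined.
print(by_chunk_declared)
print(by_chunk_called)
```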
RepoKnowledgeGraphLib/EntityExtractor.py ADDED
@@ -0,0 +1,2032 @@
1
+ import ast
2
+ import os
3
+ import logging
4
+ import tempfile
5
+ from typing import List, Dict, Any, Tuple, Optional
6
+ from clang import cindex
7
+ import javalang
8
+ import javalang.tree as T
9
+ import esprima
10
+ from bs4 import BeautifulSoup
11
+ import tree_sitter_rust as ts_rust
12
+ from tree_sitter import Language, Parser
13
+ import re
14
+ from .utils.path_utils import generate_entity_aliases
15
+
16
+
17
+
18
+ LOGGER_NAME = "AST_ENTITY_EXTRACTOR"
19
+ logger = logging.getLogger(LOGGER_NAME)
20
+
21
+
22
+ class BaseASTEntityExtractor:
23
+ def extract_entities(self, code: str, file_path: str = None) -> Tuple[List[Dict[str, Any]], List[str]]:
24
+ """
25
+ Extract entities from source code.
26
+
27
+ Args:
28
+ code: Source code as string
29
+ file_path: Optional path to the source file (for better context and include resolution)
30
+
31
+ Returns:
32
+ Tuple of (declared_entities, called_entities)
33
+ """
34
+ raise NotImplementedError
35
+
36
+
37
+ # Add a reset contract so extractors can be reused safely
38
+ def reset(self) -> None:
39
+ """
40
+ Reset internal state so the extractor instance can be reused.
41
+ Concrete extractors should override this to clear their buffers.
42
+ """
43
+ raise NotImplementedError
44
+
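+ # --- Illustrative usage (hedged sketch, not part of the module API) ---
+ # Shows the (declared_entities, called_entities) contract that every extractor
+ # below follows. The `_demo_extractor_contract` helper and its sample HTML are
+ # hypothetical and exist only to illustrate the returned shape.
+ def _demo_extractor_contract():
+     html = '<div id="app" class="main"><button onclick="initApp()">Go</button></div>'
+     declared, called = HTMLEntityExtractor().extract_entities(html)
+     # declared -> e.g. [{"name": "app", "type": "element"}, {"name": "main", "type": "class"}, ...]
+     # called   -> e.g. ["initApp"]
+     return declared, called
+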
45
+ class HTMLEntityExtractor(BaseASTEntityExtractor):
46
+ """
47
+ Hybrid HTML AST-based entity extractor.
48
+
49
+ Responsibilities:
50
+ - Parse HTML into a tree
+ - Extract declared DOM entities (ids, names, classes)
+ - Extract JavaScript calls from inline event handlers
+ - Extract JS entities from <script> tags
+ - Integrate cleanly with the hybrid AST graph linker
55
+ """
56
+
57
+ EVENT_ATTR_PREFIX = "on" # e.g., onclick, onsubmit, etc.
58
+
59
+ def __init__(self):
60
+ self.js_extractor = JavaScriptEntityExtractor()
61
+ self.reset()
62
+
63
+ # --------------------------------------
64
+ # Core interface
65
+ # --------------------------------------
66
+ def reset(self):
67
+ self.declared_entities: List[Dict[str, str]] = []
68
+ self.called_entities: List[str] = []
69
+
70
+ def extract_entities(self, code: str, file_path: str = None) -> Tuple[List[Dict[str, str]], List[str]]:
71
+ """Main entry point: parse HTML and extract entities."""
72
+ self.reset()
73
+ try:
74
+ soup = BeautifulSoup(code, "html.parser")
75
+ except Exception as e:
76
+ print(f"[HTMLEntityExtractor] Parsing error: {e}")
77
+ return [], []
78
+
79
+ # --- DOM element declarations ---
80
+ for tag in soup.find_all(True):
81
+ self._handle_tag_declaration(tag)
82
+ self._handle_event_attributes(tag)
83
+
84
+ # --- <script> tags (inline + external) ---
85
+ for script in soup.find_all("script"):
86
+ self._handle_script(script)
87
+
88
+ # --- Deduplication ---
89
+ self.declared_entities = self._deduplicate_dicts(self.declared_entities)
90
+ self.called_entities = self._deduplicate_list(self.called_entities)
91
+
92
+ return self.declared_entities, self.called_entities
93
+
94
+ # --------------------------------------
95
+ # Tag & attribute handlers
96
+ # --------------------------------------
97
+ def _handle_tag_declaration(self, tag):
98
+ """Extract declared DOM elements (id, name, class)."""
99
+ if tag.has_attr("id"):
100
+ self.declared_entities.append({"name": tag["id"], "type": "element"})
101
+
102
+ if tag.has_attr("name"):
103
+ self.declared_entities.append({"name": tag["name"], "type": "element"})
104
+
105
+ if tag.has_attr("class"):
106
+ classes = tag["class"]
107
+ if isinstance(classes, list):
108
+ for c in classes:
109
+ self.declared_entities.append({"name": c, "type": "class"})
110
+ elif isinstance(classes, str):
111
+ self.declared_entities.append({"name": classes, "type": "class"})
112
+
113
+ def _handle_event_attributes(self, tag):
114
+ """Extract JS calls from inline event attributes."""
115
+ if not self.js_extractor:
116
+ return
117
+ for attr, value in tag.attrs.items():
118
+ if attr.lower().startswith(self.EVENT_ATTR_PREFIX) and isinstance(value, str):
119
+ try:
120
+ _, called = self.js_extractor.extract_entities(value)
121
+ self.called_entities.extend(called)
122
+ except Exception as e:
123
+ print(f"[HTMLEntityExtractor] JS parse error in {attr}: {e}")
124
+
125
+ def _handle_script(self, script):
126
+ """Extract JS entities from <script> blocks or src attributes."""
127
+ if script.has_attr("src"):
128
+ src = script["src"]
129
+ self.called_entities.append(src)
130
+ return
131
+
132
+ if not self.js_extractor:
133
+ return
134
+
135
+ js_code = (script.string or "").strip()
136
+ if js_code:
137
+ try:
138
+ declared, called = self.js_extractor.extract_entities(js_code)
139
+ self.declared_entities.extend(declared)
140
+ self.called_entities.extend(called)
141
+ except Exception as e:
142
+ print(f"[HTMLEntityExtractor] JS parse error in <script>: {e}")
143
+
144
+ # --------------------------------------
145
+ # Helpers
146
+ # --------------------------------------
147
+ @staticmethod
148
+ def _deduplicate_dicts(dicts: List[Dict]) -> List[Dict]:
149
+ seen = set()
150
+ result = []
151
+ for d in dicts:
152
+ key = tuple(sorted(d.items()))
153
+ if key not in seen:
154
+ seen.add(key)
155
+ result.append(d)
156
+ return result
157
+
158
+ @staticmethod
159
+ def _deduplicate_list(items: List[str]) -> List[str]:
160
+ seen = set()
161
+ result = []
162
+ for i in items:
163
+ if i not in seen:
164
+ seen.add(i)
165
+ result.append(i)
166
+ return result
167
+
168
+
169
+ class JavaEntityExtractor(BaseASTEntityExtractor):
170
+ """
171
+ Extract declared and called entities from Java code using javalang.
172
+ Produces the same (declared_entities, called_entities) structure as other extractors.
173
+ """
174
+
175
+ def __init__(self):
176
+ self.reset()
177
+
178
+ def reset(self) -> None:
179
+ self.declared_entities: List[Dict[str, Any]] = []
180
+ self.called_entities: List[str] = []
181
+ self.current_package: Optional[str] = None
182
+ self.scope_stack: List[str] = []
183
+ self.api_endpoints: List[Dict[str, Any]] = [] # Track API endpoint definitions
184
+ self.current_class_base_path: Optional[str] = None # For @RequestMapping on class
185
+
186
+ # -----------------------------------------------------------
187
+ # Helpers
188
+ # -----------------------------------------------------------
189
+
190
+ def _qualified(self, name: str) -> str:
191
+ if not name:
192
+ return ""
193
+ scope = "::".join(self.scope_stack)
194
+ return f"{scope}::{name}" if scope else name
195
+
196
+ def _walk_type(self, t):
197
+ """Return string representation of a type node."""
198
+ if not t:
199
+ return "unknown"
200
+ if isinstance(t, str):
201
+ return t
202
+ if hasattr(t, "name"):
203
+ name = t.name
204
+ if getattr(t, "arguments", None):
205
+ args = [self._walk_type(a.type) for a in t.arguments if hasattr(a, "type")]
206
+ name += "<" + ", ".join(args) + ">"
207
+ return name
208
+ return "unknown"
209
+
210
+ # -----------------------------------------------------------
211
+ # Main AST traversal
212
+ # -----------------------------------------------------------
213
+
214
+ def extract_entities(self, code: str, file_path: str = None) -> Tuple[List[Dict[str, Any]], List[str]]:
215
+ self.reset()
216
+
217
+ try:
218
+ tree = javalang.parse.parse(code)
219
+ except javalang.parser.JavaSyntaxError as e:
220
+ logger.error(f"Syntax error in Java code: {e}")
221
+ return [], []
222
+ except Exception as e:
223
+ logger.error(f"Error parsing Java code: {e}", exc_info=True)
224
+ return [], []
225
+
226
+ # --- Package ---
227
+ if tree.package:
228
+ self.current_package = tree.package.name
229
+
230
+ # --- Imports ---
231
+ for imp in tree.imports:
232
+ self.called_entities.append(imp.path)
233
+
234
+ # --- Types (classes, interfaces, enums) ---
235
+ for type_decl in tree.types:
236
+ self._visit_type(type_decl)
237
+
238
+ # Deduplicate
239
+ seen_decl = set()
240
+ unique_declared = []
241
+ for e in self.declared_entities:
242
+ key = (e.get("name"), e.get("type"), e.get("dtype"))
243
+ if key not in seen_decl:
244
+ unique_declared.append(e)
245
+ seen_decl.add(key)
246
+
247
+ unique_called = list(dict.fromkeys(self.called_entities))
248
+ return unique_declared, unique_called
249
+
250
+ # -----------------------------------------------------------
251
+ # Visitors for different node types
252
+ # -----------------------------------------------------------
253
+
254
+ def _visit_type(self, node):
255
+ if isinstance(node, javalang.tree.ClassDeclaration):
256
+ self._visit_class(node)
257
+ elif isinstance(node, javalang.tree.InterfaceDeclaration):
258
+ self._visit_interface(node)
259
+ elif isinstance(node, javalang.tree.EnumDeclaration):
260
+ self._visit_enum(node)
261
+
262
+ def _visit_class(self, node):
263
+ full_name = node.name
264
+ if self.current_package:
265
+ full_name = f"{self.current_package}.{node.name}"
266
+ qualified = self._qualified(full_name)
267
+
268
+ self.declared_entities.append({"name": qualified, "type": "class"})
269
+
270
+ # Check for REST controller annotations and extract base path
271
+ old_base_path = self.current_class_base_path
272
+ if node.annotations:
273
+ for annotation in node.annotations:
274
+ if annotation.name in {'RestController', 'Controller'}:
275
+ # Controller marker only; it carries no path itself, so nothing to extract
276
+ pass
277
+ elif annotation.name == 'RequestMapping':
278
+ # Extract base path from class-level @RequestMapping
279
+ self.current_class_base_path = self._extract_path_from_annotation(annotation)
280
+
281
+ # Inheritance
282
+ if node.extends:
283
+ self.called_entities.append(self._walk_type(node.extends))
284
+ for impl in node.implements or []:
285
+ self.called_entities.append(self._walk_type(impl))
286
+
287
+ self.scope_stack.append(full_name)
288
+ for member in node.body:
289
+ self._visit_member(member)
290
+ self.scope_stack.pop()
291
+
292
+ # Restore the previous base path
293
+ self.current_class_base_path = old_base_path
294
+
295
+ def _visit_interface(self, node):
296
+ full_name = node.name
297
+ if self.current_package:
298
+ full_name = f"{self.current_package}.{node.name}"
299
+ qualified = self._qualified(full_name)
300
+ self.declared_entities.append({"name": qualified, "type": "interface"})
301
+
302
+ for impl in node.extends or []:
303
+ self.called_entities.append(self._walk_type(impl))
304
+
305
+ self.scope_stack.append(full_name)
306
+ for member in node.body:
307
+ self._visit_member(member)
308
+ self.scope_stack.pop()
309
+
310
+ def _visit_enum(self, node):
311
+ full_name = node.name
312
+ if self.current_package:
313
+ full_name = f"{self.current_package}.{node.name}"
314
+ qualified = self._qualified(full_name)
315
+ self.declared_entities.append({"name": qualified, "type": "enum"})
316
+
317
+ def _visit_member(self, node):
318
+
319
+ # --- Method ---
320
+ if isinstance(node, T.MethodDeclaration):
321
+ method_name = self._qualified(node.name)
322
+
323
+ # Check for API endpoint annotations
324
+ api_info = self._extract_api_endpoint_from_annotations(node)
325
+ if api_info:
326
+ self.declared_entities.append({
327
+ "name": method_name,
328
+ "type": "api_endpoint",
329
+ "endpoint": api_info.get("endpoint"),
330
+ "methods": api_info.get("methods")
331
+ })
332
+ self.api_endpoints.append({**api_info, "function": method_name})
333
+ else:
334
+ self.declared_entities.append({"name": method_name, "type": "method"})
335
+
336
+ for param in node.parameters:
337
+ ptype = self._walk_type(param.type)
338
+ pname = f"{method_name}.{param.name}"
339
+ self.declared_entities.append({
340
+ "name": pname,
341
+ "type": "variable",
342
+ "dtype": ptype
343
+ })
344
+
345
+ # Look for method calls in the body
346
+ if node.body:
347
+ self._find_calls(node.body)
348
+
349
+ # --- Constructor ---
350
+ elif isinstance(node, T.ConstructorDeclaration):
351
+ ctor_name = self._qualified(node.name)
352
+ self.declared_entities.append({"name": ctor_name, "type": "constructor"})
353
+ for param in node.parameters:
354
+ ptype = self._walk_type(param.type)
355
+ pname = f"{ctor_name}.{param.name}"
356
+ self.declared_entities.append({
357
+ "name": pname,
358
+ "type": "variable",
359
+ "dtype": ptype
360
+ })
361
+ if node.body:
362
+ self._find_calls(node.body)
363
+
364
+ # --- Field ---
365
+ elif isinstance(node, T.FieldDeclaration):
366
+ dtype = self._walk_type(node.type)
367
+ for decl in node.declarators:
368
+ var_name = self._qualified(decl.name)
369
+ self.declared_entities.append({
370
+ "name": var_name,
371
+ "type": "variable",
372
+ "dtype": dtype
373
+ })
374
+
375
+ # --- Nested class/interface ---
376
+ elif isinstance(node, (T.ClassDeclaration, T.InterfaceDeclaration)):
377
+ self._visit_type(node)
378
+
379
+ # -----------------------------------------------------------
380
+ # API Endpoint Detection
381
+ # -----------------------------------------------------------
382
+
383
+ def _extract_api_endpoint_from_annotations(self, method) -> Optional[Dict[str, Any]]:
384
+ """
385
+ Extract API endpoint information from Spring Boot method annotations.
386
+ Handles: @GetMapping, @PostMapping, @RequestMapping, etc.
387
+ """
388
+ if not method.annotations:
389
+ return None
390
+
391
+ for annotation in method.annotations:
392
+ annotation_name = annotation.name
393
+
394
+ if annotation_name in {'GetMapping', 'PostMapping', 'PutMapping', 'PatchMapping', 'DeleteMapping'}:
395
+ # Extract HTTP method from annotation name
396
+ http_method = annotation_name.replace('Mapping', '').upper()
397
+ path = self._extract_path_from_annotation(annotation)
398
+
399
+ if path:
400
+ # Combine with class-level base path if present
401
+ full_path = self._combine_paths(self.current_class_base_path, path)
402
+ return {
403
+ "endpoint": full_path,
404
+ "methods": [http_method],
405
+ "type": "api_endpoint_definition"
406
+ }
407
+
408
+ elif annotation_name == 'RequestMapping':
409
+ # @RequestMapping can specify multiple methods
410
+ path = self._extract_path_from_annotation(annotation)
411
+ methods = self._extract_methods_from_annotation(annotation)
412
+
413
+ if path:
414
+ full_path = self._combine_paths(self.current_class_base_path, path)
415
+ return {
416
+ "endpoint": full_path,
417
+ "methods": methods if methods else ['GET'], # Default to GET
418
+ "type": "api_endpoint_definition"
419
+ }
420
+
421
+ return None
422
+
423
+ def _extract_path_from_annotation(self, annotation) -> Optional[str]:
424
+ """Extract path/value from Spring annotation."""
425
+ if not annotation.element:
426
+ return None
427
+
428
+ # Handle @GetMapping("/path") - single value
429
+ if isinstance(annotation.element, T.Literal):
430
+ return annotation.element.value.strip('"')
431
+
432
+ # Handle @RequestMapping(value = "/path") or @RequestMapping(path = "/path")
433
+ if isinstance(annotation.element, list):
434
+ for elem in annotation.element:
435
+ if isinstance(elem, T.ElementValuePair):
436
+ if elem.name in {'value', 'path'}:
437
+ if isinstance(elem.value, T.Literal):
438
+ return elem.value.value.strip('"')
439
+ elif isinstance(elem.value, T.ElementArrayValue):
440
+ # Handle array: value = {"/path1", "/path2"}
441
+ if elem.value.values:
442
+ first_val = elem.value.values[0]
443
+ if isinstance(first_val, T.Literal):
444
+ return first_val.value.strip('"')
445
+
446
+ return None
447
+
448
+ def _extract_methods_from_annotation(self, annotation) -> List[str]:
449
+ """Extract HTTP methods from @RequestMapping annotation."""
450
+ methods = []
451
+
452
+ if isinstance(annotation.element, list):
453
+ for elem in annotation.element:
454
+ if isinstance(elem, T.ElementValuePair):
455
+ if elem.name == 'method':
456
+ # Handle method = RequestMethod.GET or method = {RequestMethod.GET, RequestMethod.POST}
457
+ if hasattr(elem.value, 'member'):
458
+ # Single method: RequestMethod.GET
459
+ methods.append(elem.value.member)
460
+ elif isinstance(elem.value, T.ElementArrayValue):
461
+ # Multiple methods: {RequestMethod.GET, RequestMethod.POST}
462
+ for val in elem.value.values:
463
+ if hasattr(val, 'member'):
464
+ methods.append(val.member)
465
+
466
+ return methods
467
+
468
+ def _combine_paths(self, base_path: Optional[str], path: str) -> str:
469
+ """Combine base path from class annotation with method path."""
470
+ if not base_path:
471
+ return path
472
+
473
+ # Normalize paths
474
+ base = base_path.rstrip('/')
475
+ path = path.lstrip('/')
476
+
477
+ return f"{base}/{path}" if path else base
478
+
479
+ # -----------------------------------------------------------
480
+ # Find method invocations
481
+ # -----------------------------------------------------------
482
+
483
+ def _find_calls(self, statements):
484
+ """Recursively find method and constructor calls inside Java AST nodes."""
485
+
486
+ def _recurse(node):
487
+ if isinstance(node, T.MethodInvocation):
488
+ if node.qualifier:
489
+ self.called_entities.append(f"{node.qualifier}.{node.member}")
490
+ else:
491
+ self.called_entities.append(node.member)
492
+ elif isinstance(node, T.ClassCreator):
493
+ self.called_entities.append(self._walk_type(node.type))
494
+
495
+ # Recurse into all children
496
+ if hasattr(node, '__dict__'):
497
+ for attr, val in vars(node).items():
498
+ if isinstance(val, list):
499
+ for child in val:
500
+ if isinstance(child, T.Node):
501
+ _recurse(child)
502
+ elif isinstance(val, T.Node):
503
+ _recurse(val)
504
+
505
+ if not statements:
506
+ return
507
+
508
+ if isinstance(statements, list):
509
+ for stmt in statements:
510
+ _recurse(stmt)
511
+ else:
512
+ _recurse(statements)
513
+
514
+
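+ # --- Illustrative usage (hedged sketch, not part of the module API) ---
+ # Demonstrates how a class-level @RequestMapping base path combines with a
+ # method-level @GetMapping. The helper name and Java snippet are hypothetical;
+ # parsing requires the javalang package.
+ def _demo_java_endpoint():
+     java_src = '''
+ package demo;
+ import org.springframework.web.bind.annotation.*;
+
+ @RestController
+ @RequestMapping("/api")
+ public class UserController {
+     @GetMapping("/users")
+     public String listUsers() { return "ok"; }
+ }
+ '''
+     declared, _ = JavaEntityExtractor().extract_entities(java_src)
+     # Expect an entry roughly like {"name": "demo.UserController::listUsers",
+     #   "type": "api_endpoint", "endpoint": "/api/users", "methods": ["GET"]}
+     return declared
+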
515
+ class JavaScriptEntityExtractor(BaseASTEntityExtractor):
516
+ """
517
+ Extract declared and called entities from JavaScript code using esprima.
518
+ Handles ES6+ syntax including classes, arrow functions, imports/exports.
519
+ Also detects API endpoint calls (fetch, axios, etc.).
520
+ """
521
+
522
+ # Common HTTP methods to detect
523
+ HTTP_METHODS = {'get', 'post', 'put', 'patch', 'delete', 'head', 'options'}
524
+
525
+ # API call patterns to detect
526
+ API_PATTERNS = {
527
+ 'fetch', # fetch('/api/users')
528
+ 'axios', # axios.get('/api/users')
529
+ '$http', # Angular $http
530
+ 'request', # request library
531
+ 'superagent', # superagent library
532
+ }
533
+
534
+ def __init__(self):
535
+ self.reset()
536
+
537
+ def reset(self) -> None:
538
+ self.declared_entities: List[Dict[str, Any]] = []
539
+ self.called_entities: List[str] = []
540
+ self.scope_stack: List[str] = []
541
+ self.api_calls: List[Dict[str, Any]] = [] # Track API endpoint calls
542
+
543
+ def _qualified(self, name: str) -> str:
544
+ """Return fully qualified name using current scope stack."""
545
+ if not name:
546
+ return ""
547
+ scope = ".".join(self.scope_stack)
548
+ return f"{scope}.{name}" if scope else name
549
+
550
+ def _get_function_name(self, node) -> Optional[str]:
551
+ """Extract function name from various function node types."""
552
+ if hasattr(node, 'id') and node.id:
553
+ return node.id.name
554
+ return None
555
+
556
+ def _walk_node(self, node):
557
+ """Recursively walk the AST and extract entities."""
558
+ if not node or not hasattr(node, 'type'):
559
+ return
560
+
561
+ node_type = node.type
562
+
563
+ # --- Function Declaration ---
564
+ if node_type == 'FunctionDeclaration':
565
+ func_name = self._get_function_name(node)
566
+ if func_name:
567
+ qualified = self._qualified(func_name)
568
+ self.declared_entities.append({"name": qualified, "type": "function"})
569
+
570
+ # Extract parameters
571
+ if hasattr(node, 'params'):
572
+ for param in node.params:
573
+ param_name = self._extract_pattern_name(param)
574
+ if param_name:
575
+ self.declared_entities.append({
576
+ "name": f"{qualified}.{param_name}",
577
+ "type": "variable",
578
+ "dtype": "unknown"
579
+ })
580
+
581
+ self.scope_stack.append(func_name)
582
+ if hasattr(node, 'body'):
583
+ self._walk_node(node.body)
584
+ self.scope_stack.pop()
585
+
586
+ # --- Arrow Function Expression ---
587
+ elif node_type == 'ArrowFunctionExpression':
588
+ # Arrow functions are usually bound through the enclosing
+ # VariableDeclarator, so their parameters cannot be qualified
+ # here; only the body is walked.
593
+ if hasattr(node, 'body'):
594
+ self._walk_node(node.body)
595
+
596
+ # --- Function Expression ---
597
+ elif node_type == 'FunctionExpression':
598
+ func_name = self._get_function_name(node)
599
+ if func_name:
600
+ qualified = self._qualified(func_name)
601
+ self.declared_entities.append({"name": qualified, "type": "function"})
602
+ self.scope_stack.append(func_name)
603
+
604
+ if hasattr(node, 'params'):
605
+ for param in node.params:
606
+ param_name = self._extract_pattern_name(param)
607
+ if param_name and func_name:
608
+ self.declared_entities.append({
609
+ "name": f"{self._qualified(func_name)}.{param_name}",
610
+ "type": "variable",
611
+ "dtype": "unknown"
612
+ })
613
+
614
+ if hasattr(node, 'body'):
615
+ self._walk_node(node.body)
616
+
617
+ if func_name:
618
+ self.scope_stack.pop()
619
+
620
+ # --- Class Declaration ---
621
+ elif node_type == 'ClassDeclaration':
622
+ class_name = node.id.name if hasattr(node, 'id') and node.id else None
623
+ if class_name:
624
+ qualified = self._qualified(class_name)
625
+ self.declared_entities.append({"name": qualified, "type": "class"})
626
+
627
+ # Handle inheritance
628
+ if hasattr(node, 'superClass') and node.superClass:
629
+ if hasattr(node.superClass, 'name'):
630
+ self.called_entities.append(node.superClass.name)
631
+
632
+ self.scope_stack.append(class_name)
633
+ if hasattr(node, 'body') and hasattr(node.body, 'body'):
634
+ for method in node.body.body:
635
+ self._walk_node(method)
636
+ self.scope_stack.pop()
637
+
638
+ # --- Method Definition ---
639
+ elif node_type == 'MethodDefinition':
640
+ method_name = node.key.name if hasattr(node, 'key') and hasattr(node.key, 'name') else None
641
+ if method_name:
642
+ qualified = self._qualified(method_name)
643
+ self.declared_entities.append({"name": qualified, "type": "method"})
644
+
645
+ if hasattr(node, 'value') and hasattr(node.value, 'params'):
646
+ for param in node.value.params:
647
+ param_name = self._extract_pattern_name(param)
648
+ if param_name:
649
+ self.declared_entities.append({
650
+ "name": f"{qualified}.{param_name}",
651
+ "type": "variable",
652
+ "dtype": "unknown"
653
+ })
654
+
655
+ if hasattr(node, 'value'):
656
+ self._walk_node(node.value)
657
+
658
+ # --- Variable Declaration ---
659
+ elif node_type == 'VariableDeclaration':
660
+ if hasattr(node, 'declarations'):
661
+ for decl in node.declarations:
662
+ self._walk_node(decl)
663
+
664
+ # --- Variable Declarator ---
665
+ elif node_type == 'VariableDeclarator':
666
+ var_name = self._extract_pattern_name(node.id) if hasattr(node, 'id') else None
667
+ if var_name:
668
+ qualified = self._qualified(var_name)
669
+
670
+ # Check if it's a function assignment
671
+ if hasattr(node, 'init') and node.init:
672
+ if node.init.type in ('FunctionExpression', 'ArrowFunctionExpression'):
673
+ self.declared_entities.append({"name": qualified, "type": "function"})
674
+ self.scope_stack.append(var_name)
675
+ self._walk_node(node.init)
676
+ self.scope_stack.pop()
677
+ else:
678
+ self.declared_entities.append({
679
+ "name": qualified,
680
+ "type": "variable",
681
+ "dtype": "unknown"
682
+ })
683
+ self._walk_node(node.init)
684
+ else:
685
+ self.declared_entities.append({
686
+ "name": qualified,
687
+ "type": "variable",
688
+ "dtype": "unknown"
689
+ })
690
+
691
+ # --- Call Expression ---
692
+ elif node_type == 'CallExpression':
693
+ callee_name = self._extract_callee_name(node.callee) if hasattr(node, 'callee') else None
694
+ if callee_name:
695
+ self.called_entities.append(callee_name)
696
+
697
+ # Detect API endpoint calls
698
+ self._detect_api_call(node, callee_name)
699
+
700
+ # Walk arguments
701
+ if hasattr(node, 'arguments'):
702
+ for arg in node.arguments:
+ self._walk_node(arg)
+
+ # Also walk the callee so chained calls such as
+ # axios.get('/api/users').then(...) still expose the inner call
+ if hasattr(node, 'callee'):
+ self._walk_node(node.callee)
704
+
705
+ # --- Member Expression ---
706
+ elif node_type == 'MemberExpression':
707
+ # Don't record as call, just traverse
708
+ if hasattr(node, 'object'):
709
+ self._walk_node(node.object)
710
+ if hasattr(node, 'property'):
711
+ self._walk_node(node.property)
712
+
713
+ # --- Import/Export ---
714
+ elif node_type == 'ImportDeclaration':
715
+ if hasattr(node, 'source') and hasattr(node.source, 'value'):
716
+ self.called_entities.append(node.source.value)
717
+
718
+ elif node_type == 'ExportNamedDeclaration':
719
+ if hasattr(node, 'declaration'):
720
+ self._walk_node(node.declaration)
721
+
722
+ elif node_type == 'ExportDefaultDeclaration':
723
+ if hasattr(node, 'declaration'):
724
+ self._walk_node(node.declaration)
725
+
726
+ # --- Recursive traversal for other nodes ---
727
+ else:
728
+ if hasattr(node, '__dict__'):
729
+ for attr, val in vars(node).items():
730
+ if isinstance(val, list):
731
+ for item in val:
732
+ if hasattr(item, 'type'):
733
+ self._walk_node(item)
734
+ elif hasattr(val, 'type'):
735
+ self._walk_node(val)
736
+
737
+ def _extract_pattern_name(self, pattern) -> Optional[str]:
738
+ """Extract name from various pattern types (Identifier, ObjectPattern, etc.)."""
739
+ if not pattern:
740
+ return None
741
+ if hasattr(pattern, 'type'):
742
+ if pattern.type == 'Identifier':
743
+ return pattern.name if hasattr(pattern, 'name') else None
744
+ elif pattern.type == 'RestElement':
745
+ return self._extract_pattern_name(pattern.argument) if hasattr(pattern, 'argument') else None
746
+ return None
747
+
748
+ def _extract_callee_name(self, callee) -> Optional[str]:
749
+ """Extract the name of the function being called."""
750
+ if not callee:
751
+ return None
752
+
753
+ if hasattr(callee, 'type'):
754
+ if callee.type == 'Identifier':
755
+ return callee.name if hasattr(callee, 'name') else None
756
+ elif callee.type == 'MemberExpression':
757
+ obj = self._extract_callee_name(callee.object) if hasattr(callee, 'object') else ""
758
+ prop = callee.property.name if hasattr(callee, 'property') and hasattr(callee.property, 'name') else ""
759
+ if obj and prop:
760
+ return f"{obj}.{prop}"
761
+ return prop or obj
762
+ return None
763
+
764
+ def _detect_api_call(self, call_node, callee_name: Optional[str]) -> None:
765
+ """
766
+ Detect API endpoint calls in JavaScript code.
767
+ Handles patterns like:
768
+ - fetch('/api/users')
769
+ - axios.get('/api/users')
770
+ - axios.post('/api/users', data)
771
+ - request.get('/api/users')
772
+ """
773
+ if not callee_name or not hasattr(call_node, 'arguments'):
774
+ return
775
+
776
+ # Split callee name to check for patterns
777
+ parts = callee_name.split('.')
778
+ base = parts[0]
779
+ method = parts[-1].lower() if len(parts) > 1 else None
780
+
781
+ # Check if this is an API call
782
+ is_api_call = False
783
+ http_method = 'unknown'
784
+
785
+ # Pattern 1: fetch('/api/...')
786
+ if base == 'fetch':
787
+ is_api_call = True
788
+ http_method = 'GET' # Default for fetch
789
+
790
+ # Pattern 2: axios.get('/api/...'), request.post(...), etc.
791
+ elif base in self.API_PATTERNS and method in self.HTTP_METHODS:
792
+ is_api_call = True
793
+ http_method = method.upper()
794
+
795
+ # Pattern 3: axios('/api/...', {method: 'POST'})
796
+ elif base in self.API_PATTERNS and method is None:
797
+ is_api_call = True
798
+ http_method = 'GET' # Default
799
+
800
+ if not is_api_call:
801
+ return
802
+
803
+ # Extract the endpoint URL from arguments
804
+ if call_node.arguments:
805
+ first_arg = call_node.arguments[0]
806
+ endpoint = self._extract_string_literal(first_arg)
807
+
808
+ if endpoint:
809
+ # Store as a called entity with special type
810
+ self.called_entities.append(f"API:{http_method}:{endpoint}")
811
+
812
+ # Also track in api_calls for easier filtering
813
+ self.api_calls.append({
814
+ "endpoint": endpoint,
815
+ "method": http_method,
816
+ "type": "api_call"
817
+ })
818
+
819
+ def _extract_string_literal(self, node) -> Optional[str]:
820
+ """Extract string value from a Literal/TemplateLiteral node."""
821
+ if not node or not hasattr(node, 'type'):
822
+ return None
823
+
824
+ if node.type == 'Literal' and isinstance(node.value, str):
825
+ return node.value
826
+ elif node.type == 'TemplateLiteral':
827
+ # For template literals, we try to extract the quasi parts
828
+ # e.g., `/api/${version}/users` -> /api/{version}/users
829
+ if hasattr(node, 'quasis'):
830
+ parts = []
831
+ for i, quasi in enumerate(node.quasis):
832
+ if hasattr(quasi, 'value') and hasattr(quasi.value, 'raw'):
833
+ parts.append(quasi.value.raw)
834
+ if i < len(node.quasis) - 1:
835
+ parts.append('{param}')
836
+ return ''.join(parts)
837
+
838
+ return None
839
+
840
+ def extract_entities(self, code: str, file_path: str = None) -> Tuple[List[Dict[str, Any]], List[str]]:
841
+ self.reset()
842
+
843
+ try:
844
+ tree = esprima.parseScript(code, {'tolerant': True, 'loc': False})
845
+ except Exception as e:
846
+ # Try parsing as module if script fails
847
+ try:
848
+ tree = esprima.parseModule(code, {'tolerant': True, 'loc': False})
849
+ except Exception as e2:
850
+ logger.error(f"Failed to parse JavaScript code: {e2}")
851
+ return [], []
852
+
853
+ if hasattr(tree, 'body'):
854
+ for node in tree.body:
855
+ self._walk_node(node)
856
+
857
+ # Deduplicate
858
+ seen_decl = set()
859
+ unique_declared = []
860
+ for e in self.declared_entities:
861
+ key = (e.get("name"), e.get("type"), e.get("dtype"))
862
+ if key not in seen_decl:
863
+ unique_declared.append(e)
864
+ seen_decl.add(key)
865
+
866
+ unique_called = list(dict.fromkeys(self.called_entities))
867
+ return unique_declared, unique_called
868
+
869
+
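+ # --- Illustrative usage (hedged sketch, not part of the module API) ---
+ # Shows API-call detection on an axios-style chained call; the inner call is
+ # reached via the callee walk in the CallExpression branch above. Helper name
+ # and snippet are hypothetical; parsing requires the esprima package.
+ def _demo_js_api_detection():
+     js = "axios.get('/api/users').then(handleUsers);"
+     ex = JavaScriptEntityExtractor()
+     declared, called = ex.extract_entities(js)
+     # `called` should contain 'axios.get' and the marker 'API:GET:/api/users';
+     # ex.api_calls should hold {"endpoint": "/api/users", "method": "GET", ...}
+     return ex.api_calls
+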
870
+ class CEntityExtractor(BaseASTEntityExtractor):
871
+ """
872
+ Extract declared and called entities from C code using clang.cindex (libclang),
873
+ with filtering to ignore system headers.
874
+ """
875
+
876
+ def __init__(self):
877
+ self.index = cindex.Index.create()
878
+
879
+ def reset(self) -> None:
880
+ """No persistent state to reset, but method provided for interface consistency."""
881
+ pass
882
+
883
+ def _walk_cursor(self, cursor, declared, called, source_file):
884
+ """Recursively walk a clang Cursor, restricted to the main file."""
885
+ for c in cursor.get_children():
886
+ # --- Include directives ---
887
+ # Note: INCLUSION_DIRECTIVE nodes are at the root level and need special handling
888
+ if c.kind == cindex.CursorKind.INCLUSION_DIRECTIVE:
889
+ # Get the included file name
890
+ included_file = c.displayname
891
+ if included_file:
892
+ called.append(included_file)
893
+ continue
894
+
895
+ loc = c.location
896
+ if not loc.file or not source_file:
897
+ continue
898
+
899
+ # Skip system / external headers for other nodes
900
+ if os.path.abspath(loc.file.name) != os.path.abspath(source_file):
901
+ continue
902
+
903
+ # --- Declarations ---
904
+ if c.kind.is_declaration():
905
+ if c.kind in (cindex.CursorKind.FUNCTION_DECL, cindex.CursorKind.FUNCTION_TEMPLATE):
906
+ name = c.spelling or c.displayname
907
+ declared.append({"name": name, "type": "function"})
908
+ for p in c.get_arguments():
909
+ declared.append({
910
+ "name": f"{name}.{p.spelling}",
911
+ "type": "variable",
912
+ "dtype": p.type.spelling
913
+ })
914
+ elif c.kind == cindex.CursorKind.VAR_DECL:
915
+ declared.append({
916
+ "name": c.spelling,
917
+ "type": "variable",
918
+ "dtype": c.type.spelling
919
+ })
920
+
921
+ # Add the variable's type to called entities
922
+ # This captures struct references like "struct Point p;"
923
+ if c.type.spelling:
924
+ # Extract the base type name (remove const, &, *, struct keyword, etc.)
925
+ type_name = c.type.spelling.strip()
926
+ # Remove common qualifiers and keywords
927
+ type_name = type_name.replace('const', '').replace('&', '').replace('*', '').replace('struct', '').strip()
928
+ if type_name and type_name not in {'int', 'float', 'double', 'char', 'bool', 'void', 'long', 'short', 'unsigned', 'signed', 'size_t'}:
929
+ called.append(type_name)
930
+ elif c.kind == cindex.CursorKind.STRUCT_DECL:
931
+ declared.append({"name": c.spelling or c.displayname, "type": "struct"})
932
+ elif c.kind == cindex.CursorKind.TYPEDEF_DECL:
933
+ declared.append({"name": c.spelling, "type": "typedef"})
934
+
935
+ # --- Calls ---
936
+ if c.kind == cindex.CursorKind.CALL_EXPR:
937
+ callee = None
938
+ for child in c.get_children():
939
+ if child.kind in (cindex.CursorKind.DECL_REF_EXPR, cindex.CursorKind.MEMBER_REF_EXPR):
940
+ callee = child.spelling
941
+ break
942
+ if callee:
943
+ called.append(callee)
944
+ else:
945
+ called.append(c.displayname or c.spelling)
946
+
947
+ # --- Recurse ---
948
+ self._walk_cursor(c, declared, called, source_file)
949
+
950
+ def extract_entities(self, code: str, file_path: str = None) -> Tuple[List[Dict[str, Any]], List[str]]:
951
+ declared, called = [], []
952
+
953
+ # If file_path is provided, use it directly for better include resolution
954
+ # Otherwise, create a temporary file
955
+ tf_name = None
956
+ temp_file = False
957
+
958
+ if file_path and os.path.exists(file_path):
959
+ tf_name = file_path
960
+ temp_file = False
961
+ else:
962
+ with tempfile.NamedTemporaryFile(suffix=".c", mode="w+", delete=False) as tf:
963
+ tf_name = tf.name
964
+ tf.write(code)
965
+ tf.flush()
966
+ temp_file = True
967
+
968
+ # Get the directory containing the file for include paths
969
+ include_dir = os.path.dirname(tf_name) if tf_name else None
970
+ args = ['-std=c11']
971
+ if include_dir:
972
+ args.append(f'-I{include_dir}')
973
+
974
+ try:
975
+ tu = self.index.parse(
976
+ tf_name,
977
+ args=args,
978
+ options=cindex.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD
979
+ )
980
+ except Exception as e:
+ # Remove the temporary file before propagating the parse failure
+ if temp_file:
+ try:
+ os.unlink(tf_name)
+ except OSError:
+ pass
+ raise RuntimeError(f"libclang failed to parse translation unit: {e}")
982
+
983
+ self._walk_cursor(tu.cursor, declared, called, tf_name)
984
+
985
+ # Deduplicate
986
+ seen_decl = set()
987
+ unique_declared = []
988
+ for e in declared:
989
+ key = (e.get("name"), e.get("type"), e.get("dtype", None))
990
+ if key not in seen_decl:
991
+ unique_declared.append(e)
992
+ seen_decl.add(key)
993
+
994
+ unique_called = list(dict.fromkeys(called))
995
+
996
+ # Only delete if we created a temp file
997
+ if temp_file:
998
+ try:
999
+ os.unlink(tf_name)
1000
+ except Exception:
1001
+ pass
1002
+
1003
+ return unique_declared, unique_called
1004
+
1005
+
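+ # --- Illustrative usage (hedged sketch, not part of the module API) ---
+ # With no file_path, the extractor writes the snippet to a temporary .c file
+ # before parsing. Assumes libclang is installed and discoverable by clang.cindex;
+ # the helper name and snippet are hypothetical.
+ def _demo_c_extraction():
+     c_src = "int add(int a, int b) { return a + b; }\nint main(void) { return add(1, 2); }\n"
+     declared, called = CEntityExtractor().extract_entities(c_src)
+     # declared should include {"name": "add", "type": "function"} plus its
+     # parameters; called should include 'add' from the call site in main()
+     return declared, called
+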
1006
+ class CppEntityExtractor(BaseASTEntityExtractor):
1007
+ """
1008
+ Extract declared and called entities from C++ code using clang.cindex (libclang),
1009
+ including classes, namespaces, and methods.
1010
+ """
1011
+
1012
+ def __init__(self):
1013
+ self.index = cindex.Index.create()
1014
+ self.reset()
1015
+
1016
+ def reset(self) -> None:
1017
+ self.declared_entities = []
1018
+ self.called_entities = []
1019
+ self.scope_stack = []
1020
+
1021
+ def _qualified(self, name: str) -> str:
1022
+ """Return fully qualified name using current scope stack."""
1023
+ if not name:
1024
+ return ""
1025
+ if not self.scope_stack:
1026
+ return name
1027
+ return "::".join(self.scope_stack + [name])
1028
+
1029
+ def _walk_cursor(self, cursor, source_file: str):
1030
+ for c in cursor.get_children():
1031
+ # --- Include directives ---
1032
+ # Note: INCLUSION_DIRECTIVE nodes are at the root level and need special handling
1033
+ if c.kind == cindex.CursorKind.INCLUSION_DIRECTIVE:
1034
+ # Get the included file name
1035
+ included_file = c.displayname
1036
+ if included_file:
1037
+ self.called_entities.append(included_file)
1038
+ continue
1039
+
1040
+ kind = c.kind
1041
+
1042
+ # --- Namespace --- (process before location check)
1043
+ if kind == cindex.CursorKind.NAMESPACE:
1044
+ if c.spelling: # Only add non-empty namespace names
1045
+ self.scope_stack.append(c.spelling)
1046
+ self._walk_cursor(c, source_file)
1047
+ if c.spelling:
1048
+ self.scope_stack.pop()
1049
+ continue
1050
+
1051
+ # Check location for other node types
1052
+ loc = c.location
1053
+ # Skip nodes from other files, but allow nodes without location info
1054
+ if loc.file and os.path.abspath(loc.file.name) != os.path.abspath(source_file):
1055
+ continue
1056
+
1057
+ # --- Class / Struct ---
1058
+ if kind in (cindex.CursorKind.CLASS_DECL, cindex.CursorKind.STRUCT_DECL):
1059
+ # Only process if it has a name
1060
+ if c.spelling:
1061
+ # Check if it's a definition (not a forward declaration)
1062
+ is_def = c.is_definition() if hasattr(c, 'is_definition') else True
1063
+ if is_def:
1064
+ full_name = self._qualified(c.spelling)
1065
+ self.declared_entities.append({"name": full_name, "type": "class"})
1066
+
1067
+ # Handle base classes (inheritance)
1068
+ for base in c.get_children():
1069
+ if base.kind == cindex.CursorKind.CXX_BASE_SPECIFIER:
1070
+ if base.spelling:
1071
+ self.called_entities.append(base.spelling)
1072
+
1073
+ self.scope_stack.append(c.spelling)
1074
+ self._walk_cursor(c, source_file)
1075
+ self.scope_stack.pop()
1076
+ continue
1077
+
1078
+ # --- Methods ---
1079
+ if kind in (cindex.CursorKind.CXX_METHOD, cindex.CursorKind.CONSTRUCTOR, cindex.CursorKind.DESTRUCTOR):
1080
+ if c.spelling: # Only process if it has a name
1081
+ full_name = self._qualified(c.spelling)
1082
+ self.declared_entities.append({"name": full_name, "type": "method"})
1083
+
1084
+ for p in c.get_arguments():
1085
+ if p.spelling: # Only add parameters with names
1086
+ self.declared_entities.append({
1087
+ "name": f"{full_name}.{p.spelling}",
1088
+ "type": "variable",
1089
+ "dtype": p.type.spelling
1090
+ })
1091
+
1092
+ self._walk_cursor(c, source_file)
1093
+ continue
1094
+
1095
+ # --- Free functions ---
1096
+ if kind == cindex.CursorKind.FUNCTION_DECL:
1097
+ if c.spelling: # Only process if it has a name
1098
+ full_name = self._qualified(c.spelling)
1099
+ self.declared_entities.append({"name": full_name, "type": "function"})
1100
+ for p in c.get_arguments():
1101
+ if p.spelling: # Only add parameters with names
1102
+ self.declared_entities.append({
1103
+ "name": f"{full_name}.{p.spelling}",
1104
+ "type": "variable",
1105
+ "dtype": p.type.spelling
1106
+ })
1107
+ self._walk_cursor(c, source_file)
1108
+ continue
1109
+
1110
+ # --- Variables ---
1111
+ if kind == cindex.CursorKind.VAR_DECL:
1112
+ full_name = self._qualified(c.spelling)
1113
+ self.declared_entities.append({
1114
+ "name": full_name,
1115
+ "type": "variable",
1116
+ "dtype": c.type.spelling
1117
+ })
1118
+
1119
+ # Look for TYPE_REF children which explicitly reference the type
1120
+ # This is more reliable than c.type.spelling when includes aren't resolved
1121
+ type_ref_found = False
1122
+ for child in c.get_children():
1123
+ if child.kind == cindex.CursorKind.TYPE_REF:
1124
+ # TYPE_REF.spelling gives us the fully qualified type name
1125
+ # It may have 'class ' or 'struct ' prefix, so strip it
1126
+ if child.spelling:
1127
+ type_name = child.spelling.replace('class ', '').replace('struct ', '').strip()
1128
+ if type_name:
1129
+ # TYPE_REF gives us the canonical name from the definition,
1130
+ # which includes namespace qualifiers if present.
1131
+ # We only add this canonical name and rely on alias resolution
1132
+ # to match unqualified usage (e.g., 'Calculator' -> 'math::Calculator')
1133
+ self.called_entities.append(type_name)
1134
+ type_ref_found = True
1135
+ break
1136
+
1137
+ # Fallback: use c.type.spelling if no TYPE_REF found
1138
+ # Note: c.type.spelling may give us the name as written in source code,
1139
+ # which could be unqualified even if it refers to a namespaced type
1140
+ if not type_ref_found and c.type.spelling:
1141
+ # Extract the base type name (remove const, &, *, etc.)
1142
+ type_name = c.type.spelling.strip()
1143
+ # Remove common qualifiers
1144
+ type_name = type_name.replace('const', '').replace('&', '').replace('*', '').strip()
1145
+ if type_name and type_name not in {'int', 'float', 'double', 'char', 'bool', 'void', 'long', 'short', 'unsigned', 'signed'}:
1146
+ # Only add if not already added via TYPE_REF
1147
+ # c.type.spelling might give unqualified name even for namespaced types
1148
+ # We'll add it and let alias resolution handle it
1149
+ self.called_entities.append(type_name)
1150
+
1151
+ # --- Calls ---
1152
+ if kind == cindex.CursorKind.CALL_EXPR:
1153
+ callee = None
1154
+ for child in c.get_children():
1155
+ if child.kind in (cindex.CursorKind.DECL_REF_EXPR, cindex.CursorKind.MEMBER_REF_EXPR):
1156
+ callee = child.spelling
1157
+ break
1158
+ if callee:
1159
+ self.called_entities.append(callee)
1160
+ else:
1161
+ self.called_entities.append(c.displayname or c.spelling)
1162
+
1163
+ # Recurse
1164
+ self._walk_cursor(c, source_file)
1165
+
1166
+ def extract_entities(self, code: str, file_path: str = None) -> Tuple[List[Dict[str, Any]], List[str]]:
1167
+ self.reset()
1168
+
1169
+ # If file_path is provided, use it directly for better include resolution
1170
+ # Otherwise, create a temporary file
1171
+ tf_name = None
1172
+ temp_file = False
1173
+
1174
+ if file_path and os.path.exists(file_path):
1175
+ tf_name = file_path
1176
+ temp_file = False
1177
+ else:
1178
+ with tempfile.NamedTemporaryFile(suffix=".cpp", mode="w+", delete=False) as tf:
1179
+ tf_name = tf.name
1180
+ tf.write(code)
1181
+ tf.flush()
1182
+ temp_file = True
1183
+
1184
+ # Get the directory containing the file for include paths
1185
+ include_dir = os.path.dirname(tf_name) if tf_name else None
1186
+ args = ['-std=c++17', '-xc++']
1187
+ if include_dir:
1188
+ args.append(f'-I{include_dir}')
1189
+
1190
+ try:
1191
+ tu = self.index.parse(
1192
+ tf_name,
1193
+ args=args,
1194
+ options=cindex.TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD
1195
+ )
1196
+ except Exception as e:
+ # Remove the temporary file before propagating the parse failure
+ if temp_file:
+ try:
+ os.unlink(tf_name)
+ except OSError:
+ pass
+ raise RuntimeError(f"libclang failed to parse C++ translation unit: {e}")
1198
+
1199
+ self._walk_cursor(tu.cursor, tf_name)
1200
+
1201
+ # Deduplicate
1202
+ seen_decl = set()
1203
+ unique_declared = []
1204
+ for e in self.declared_entities:
1205
+ key = (e.get("name"), e.get("type"), e.get("dtype", None))
1206
+ if key not in seen_decl:
1207
+ unique_declared.append(e)
1208
+ seen_decl.add(key)
1209
+
1210
+ unique_called = list(dict.fromkeys(self.called_entities))
1211
+
1212
+ # Only delete if we created a temp file
1213
+ if temp_file:
1214
+ try:
1215
+ os.unlink(tf_name)
1216
+ except Exception:
1217
+ pass
1218
+
1219
+ return unique_declared, unique_called
1220
+
1221
+
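+ # --- Illustrative usage (hedged sketch, not part of the module API) ---
+ # Namespaces and classes qualify declared names with '::'. Assumes libclang
+ # is available; the helper name and snippet are hypothetical.
+ def _demo_cpp_extraction():
+     cpp_src = "namespace math { class Calculator { public: int add(int a, int b); }; }\n"
+     declared, _ = CppEntityExtractor().extract_entities(cpp_src)
+     # Expect entries such as {"name": "math::Calculator", "type": "class"} and
+     # {"name": "math::Calculator::add", "type": "method"}
+     return declared
+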
1222
+ class RustEntityExtractor(BaseASTEntityExtractor):
1223
+ """
1224
+ Extract declared and called entities from Rust code using tree-sitter.
1225
+ Handles structs, enums, traits, functions, methods, and modules.
1226
+ Also detects API endpoint definitions (Actix-web, Rocket, Axum, Warp).
1227
+ """
1228
+
1229
+ # HTTP method route macros for Rust web frameworks
1230
+ ROUTE_MACROS = {
1231
+ 'get', 'post', 'put', 'patch', 'delete', 'head', 'options', # Actix-web, Rocket
1232
+ 'Get', 'Post', 'Put', 'Patch', 'Delete', 'Head', 'Options', # Alternative casing
1233
+ }
1234
+
1235
+ # Route-related macros and functions
1236
+ ROUTE_PATTERNS = {
1237
+ 'route', # Generic route macro
1238
+ 'web::get', 'web::post', 'web::put', 'web::delete', # Actix-web with web::
1239
+ }
1240
+
1241
+ def __init__(self):
1242
+
1243
+ self.parser = Parser()
1244
+ self.parser.language = Language(ts_rust.language())
1245
+ self.reset()
1246
+
1247
+ def reset(self) -> None:
1248
+ self.declared_entities = []
1249
+ self.called_entities = []
1250
+ self.scope_stack = []
1251
+ self.api_endpoints: List[Dict[str, Any]] = [] # Track API endpoint definitions
1252
+
1253
+ def _qualified(self, name: str) -> str:
1254
+ """Return fully qualified name using current scope stack."""
1255
+ if not name:
1256
+ return ""
1257
+ if not self.scope_stack:
1258
+ return name
1259
+ return "::".join(self.scope_stack + [name])
1260
+
1261
+ def _get_node_text(self, node, code_bytes: bytes) -> str:
1262
+ """Extract text content of a node."""
1263
+ return code_bytes[node.start_byte:node.end_byte].decode('utf8')
1264
+
1265
+ def _extract_api_endpoint_from_attributes(self, node, code_bytes: bytes) -> Optional[Dict[str, Any]]:
1266
+ """
1267
+ Extract API endpoint information from Rust function attributes.
1268
+ Handles patterns like:
1269
+ - #[get("/users")] # Actix-web, Rocket
1270
+ - #[post("/users")] # Actix-web, Rocket
1271
+ - #[route("/users", method="GET")] # Generic route
1272
+
1273
+ Note: In tree-sitter Rust AST, attributes appear as PREVIOUS SIBLINGS
1274
+ of the function_item node, not as children.
1275
+ """
1276
+
1278
+ # Get the parent node to access siblings
1279
+ parent = node.parent
1280
+ if not parent:
1281
+ return None
1282
+
1283
+ # Find the index of current node in parent's children
1284
+ node_index = None
1285
+ for i, child in enumerate(parent.children):
1286
+ if child == node:
1287
+ node_index = i
1288
+ break
1289
+
1290
+ if node_index is None:
1291
+ return None
1292
+
1293
+ # Look backwards through previous siblings for attribute_item nodes
1294
+ for i in range(node_index - 1, -1, -1):
1295
+ sibling = parent.children[i]
1296
+
1297
+ # Stop if we hit a non-attribute node (except comments/whitespace)
1298
+ if sibling.type not in ['attribute_item', 'line_comment', 'block_comment']:
1299
+ break
1300
+
1301
+ if sibling.type == 'attribute_item':
1302
+ attr_text = self._get_node_text(sibling, code_bytes)
1303
+
1304
+ # Match HTTP method macros: #[get("/path")], #[post("/path")], #[post("/path", data = "<var>")], etc.
1305
+ # The pattern now allows optional additional parameters after the path
1306
+ method_pattern = r'#\[(get|post|put|patch|delete|head|options)\s*\(\s*"([^"]+)"(?:\s*,.*?)?\s*\)\]'
1307
+ match = re.search(method_pattern, attr_text, re.IGNORECASE)
1308
+
1309
+ if match:
1310
+ http_method = match.group(1).upper()
1311
+ endpoint_path = match.group(2)
1312
+ return {
1313
+ "endpoint": endpoint_path,
1314
+ "methods": [http_method],
1315
+ "type": "api_endpoint_definition"
1316
+ }
1317
+
1318
+ # Match generic route macro: #[route("/path", method="GET")]
1319
+ route_pattern = r'#\[route\s*\(\s*"([^"]+)"(?:.*?method\s*=\s*"([^"]+)")?\s*\)\]'
1320
+ match = re.search(route_pattern, attr_text, re.IGNORECASE)
1321
+
1322
+ if match:
1323
+ endpoint_path = match.group(1)
1324
+ http_method = match.group(2).upper() if match.group(2) else "GET"
1325
+ return {
1326
+ "endpoint": endpoint_path,
1327
+ "methods": [http_method],
1328
+ "type": "api_endpoint_definition"
1329
+ }
1330
+
1331
+ return None
1332
+
1333
+ def _walk_tree(self, node, code_bytes: bytes):
1334
+ """Recursively walk the tree-sitter AST."""
1335
+ node_type = node.type
1336
+
1337
+ # --- Module declarations ---
1338
+ if node_type == 'mod_item':
1339
+ # mod my_module { ... }
1340
+ name_node = node.child_by_field_name('name')
1341
+ if name_node:
1342
+ mod_name = self._get_node_text(name_node, code_bytes)
1343
+ qualified = self._qualified(mod_name)
1344
+ self.declared_entities.append({"name": qualified, "type": "module"})
1345
+
1346
+ self.scope_stack.append(mod_name)
1347
+ body = node.child_by_field_name('body')
1348
+ if body:
1349
+ for child in body.children:
1350
+ self._walk_tree(child, code_bytes)
1351
+ self.scope_stack.pop()
1352
+ return
1353
+
1354
+ # --- Struct declarations ---
1355
+ elif node_type == 'struct_item':
1356
+ name_node = node.child_by_field_name('name')
1357
+ if name_node:
1358
+ struct_name = self._get_node_text(name_node, code_bytes)
1359
+ qualified = self._qualified(struct_name)
1360
+ self.declared_entities.append({"name": qualified, "type": "struct"})
1361
+
1362
+ # Check for generic parameters
1363
+ type_params = node.child_by_field_name('type_parameters')
1364
+ if type_params:
1365
+ self._walk_tree(type_params, code_bytes)
1366
+
1367
+ self.scope_stack.append(struct_name)
1368
+ # Process fields
1369
+ body = node.child_by_field_name('body')
1370
+ if body:
1371
+ for child in body.children:
1372
+ if child.type == 'field_declaration':
1373
+ field_name_node = child.child_by_field_name('name')
1374
+ field_type_node = child.child_by_field_name('type')
1375
+ if field_name_node:
1376
+ field_name = self._get_node_text(field_name_node, code_bytes)
1377
+ field_type = self._get_node_text(field_type_node, code_bytes) if field_type_node else "unknown"
1378
+ self.declared_entities.append({
1379
+ "name": f"{qualified}.{field_name}",
1380
+ "type": "field",
1381
+ "dtype": field_type
1382
+ })
1383
+ self.scope_stack.pop()
1384
+ return
1385
+
1386
+ # --- Enum declarations ---
1387
+ elif node_type == 'enum_item':
1388
+ name_node = node.child_by_field_name('name')
1389
+ if name_node:
1390
+ enum_name = self._get_node_text(name_node, code_bytes)
1391
+ qualified = self._qualified(enum_name)
1392
+ self.declared_entities.append({"name": qualified, "type": "enum"})
1393
+
1394
+ self.scope_stack.append(enum_name)
1395
+ body = node.child_by_field_name('body')
1396
+ if body:
1397
+ for child in body.children:
1398
+ if child.type == 'enum_variant':
1399
+ variant_name_node = child.child_by_field_name('name')
1400
+ if variant_name_node:
1401
+ variant_name = self._get_node_text(variant_name_node, code_bytes)
1402
+ self.declared_entities.append({
1403
+ "name": f"{qualified}::{variant_name}",
1404
+ "type": "enum_variant"
1405
+ })
1406
+ self.scope_stack.pop()
1407
+ return
1408
+
1409
+ # --- Trait declarations ---
1410
+ elif node_type == 'trait_item':
1411
+ name_node = node.child_by_field_name('name')
1412
+ if name_node:
1413
+ trait_name = self._get_node_text(name_node, code_bytes)
1414
+ qualified = self._qualified(trait_name)
1415
+ self.declared_entities.append({"name": qualified, "type": "trait"})
1416
+
1417
+ self.scope_stack.append(trait_name)
1418
+ body = node.child_by_field_name('body')
1419
+ if body:
1420
+ for child in body.children:
1421
+ self._walk_tree(child, code_bytes)
1422
+ self.scope_stack.pop()
1423
+ return
1424
+
1425
+ # --- Implementation blocks ---
1426
+ elif node_type == 'impl_item':
1427
+ # impl MyStruct { ... } or impl Trait for MyStruct { ... }
1428
+ type_node = node.child_by_field_name('type')
1429
+ trait_node = node.child_by_field_name('trait')
1430
+
1431
+ impl_name = None
1432
+ if type_node:
1433
+ impl_name = self._get_node_text(type_node, code_bytes)
1434
+
1435
+ if trait_node:
1436
+ trait_name = self._get_node_text(trait_node, code_bytes)
1437
+ self.called_entities.append(trait_name)
1438
+
1439
+ if impl_name:
1440
+ self.scope_stack.append(impl_name)
1441
+
1442
+ body = node.child_by_field_name('body')
1443
+ if body:
1444
+ for child in body.children:
1445
+ self._walk_tree(child, code_bytes)
1446
+
1447
+ if impl_name:
1448
+ self.scope_stack.pop()
1449
+ return
1450
+
1451
+ # --- Function declarations ---
1452
+ elif node_type == 'function_item':
1453
+ name_node = node.child_by_field_name('name')
1454
+ if name_node:
1455
+ func_name = self._get_node_text(name_node, code_bytes)
1456
+ qualified = self._qualified(func_name)
1457
+
1458
+ # Check for API endpoint attributes (e.g., #[get("/users")])
1459
+ api_info = self._extract_api_endpoint_from_attributes(node, code_bytes)
1460
+
1461
+ if api_info:
1462
+ # This is an API endpoint handler
1463
+ self.declared_entities.append({
1464
+ "name": qualified,
1465
+ "type": "api_endpoint",
1466
+ "endpoint": api_info.get("endpoint"),
1467
+ "methods": api_info.get("methods")
1468
+ })
1469
+ self.api_endpoints.append({**api_info, "function": qualified})
1470
+ entity_type = "api_endpoint"
1471
+ else:
1472
+ # Determine if this is a method (inside impl block) or free function
1473
+ entity_type = "method" if len(self.scope_stack) > 0 else "function"
1474
+ self.declared_entities.append({"name": qualified, "type": entity_type})
1475
+
1476
+ # Extract parameters
1477
+ params = node.child_by_field_name('parameters')
1478
+ if params:
1479
+ for child in params.children:
1480
+ if child.type == 'parameter':
1481
+ pattern = child.child_by_field_name('pattern')
1482
+ type_node = child.child_by_field_name('type')
1483
+ if pattern:
1484
+ param_name = self._get_node_text(pattern, code_bytes)
1485
+ param_type = self._get_node_text(type_node, code_bytes) if type_node else "unknown"
1486
+ # Skip 'self' parameters
1487
+ if param_name not in ['self', '&self', '&mut self', 'mut self']:
1488
+ self.declared_entities.append({
1489
+ "name": f"{qualified}.{param_name}",
1490
+ "type": "variable",
1491
+ "dtype": param_type
1492
+ })
1493
+
1494
+ # Walk the function body to find calls
1495
+ body = node.child_by_field_name('body')
1496
+ if body:
1497
+ self._walk_tree(body, code_bytes)
1498
+ return
1499
+
1500
+ # --- Type alias ---
1501
+ elif node_type == 'type_item':
1502
+ name_node = node.child_by_field_name('name')
1503
+ if name_node:
1504
+ type_name = self._get_node_text(name_node, code_bytes)
1505
+ qualified = self._qualified(type_name)
1506
+ self.declared_entities.append({"name": qualified, "type": "type_alias"})
1507
+ return
1508
+
1509
+ # --- Constant declarations ---
1510
+ elif node_type == 'const_item':
1511
+ name_node = node.child_by_field_name('name')
1512
+ type_node = node.child_by_field_name('type')
1513
+ if name_node:
1514
+ const_name = self._get_node_text(name_node, code_bytes)
1515
+ const_type = self._get_node_text(type_node, code_bytes) if type_node else "unknown"
1516
+ qualified = self._qualified(const_name)
1517
+ self.declared_entities.append({
1518
+ "name": qualified,
1519
+ "type": "constant",
1520
+ "dtype": const_type
1521
+ })
1522
+
1523
+ # --- Static declarations ---
1524
+ elif node_type == 'static_item':
1525
+ name_node = node.child_by_field_name('name')
1526
+ type_node = node.child_by_field_name('type')
1527
+ if name_node:
1528
+ static_name = self._get_node_text(name_node, code_bytes)
1529
+ static_type = self._get_node_text(type_node, code_bytes) if type_node else "unknown"
1530
+ qualified = self._qualified(static_name)
1531
+ self.declared_entities.append({
1532
+ "name": qualified,
1533
+ "type": "static",
1534
+ "dtype": static_type
1535
+ })
1536
+
1537
+ # --- Let bindings (local variables) ---
1538
+ elif node_type == 'let_declaration':
1539
+ # Function-local `let` bindings are intentionally skipped to avoid
+ # clutter; module-level items (const/static) are tracked instead.
+ pass
1546
+
1547
+ # --- Use declarations (imports) ---
1548
+ elif node_type == 'use_declaration':
1549
+ # Extract imported items
1550
+ use_text = self._get_node_text(node, code_bytes)
1551
+ self.called_entities.append(use_text)
1552
+
1553
+ # --- Call expressions ---
1554
+ elif node_type == 'call_expression':
1555
+ function = node.child_by_field_name('function')
1556
+ if function:
1557
+ func_text = self._get_node_text(function, code_bytes)
1558
+ # Clean up function call to get just the name/path
1559
+ # Handle method calls like obj.method() and path calls like std::vec::Vec::new()
1560
+ self.called_entities.append(func_text)
1561
+
1562
+ # --- Macro invocations ---
1563
+ elif node_type == 'macro_invocation':
1564
+ macro_node = node.child_by_field_name('macro')
1565
+ if macro_node:
1566
+ macro_name = self._get_node_text(macro_node, code_bytes)
1567
+ self.called_entities.append(f"{macro_name}!")
1568
+
1569
+ # --- Field expressions (method calls or field access) ---
1570
+ elif node_type == 'field_expression':
1571
+ # A field access or method call; without receiver type information it
+ # cannot be resolved reliably, so nothing is recorded here. The generic
+ # recursion below still walks the children.
+ pass
1576
+
1577
+ # Recursively walk all children
1578
+ for child in node.children:
1579
+ self._walk_tree(child, code_bytes)
1580
+
1581
+ def extract_entities(self, code: str, file_path: str = None) -> Tuple[List[Dict[str, Any]], List[str]]:
1582
+ """Extract entities from Rust code using tree-sitter."""
1583
+ self.reset()
1584
+
1585
+ code_bytes = code.encode('utf8')
1586
+ tree = self.parser.parse(code_bytes)
1587
+
1588
+ # Walk the AST
1589
+ self._walk_tree(tree.root_node, code_bytes)
1590
+
1591
+ # Deduplicate
1592
+ seen_decl = set()
1593
+ unique_declared = []
1594
+ for e in self.declared_entities:
1595
+ key = (e.get("name"), e.get("type"), e.get("dtype", None))
1596
+ if key not in seen_decl:
1597
+ unique_declared.append(e)
1598
+ seen_decl.add(key)
1599
+
1600
+ unique_called = list(dict.fromkeys(self.called_entities))
1601
+
1602
+ return unique_declared, unique_called
1603
+
1604
+
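+ # --- Illustrative usage (hedged sketch, not part of the module API) ---
+ # An Actix/Rocket-style route attribute is read from the attribute_item
+ # sibling preceding the function_item. Requires the tree-sitter and
+ # tree-sitter-rust packages; helper name and snippet are hypothetical.
+ def _demo_rust_endpoint():
+     rust_src = '#[get("/users")]\nfn list_users() -> String { String::new() }\n'
+     ex = RustEntityExtractor()
+     ex.extract_entities(rust_src)
+     # ex.api_endpoints should hold roughly:
+     # [{"endpoint": "/users", "methods": ["GET"],
+     #   "type": "api_endpoint_definition", "function": "list_users"}]
+     return ex.api_endpoints
+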
1605
+ class PythonASTEntityExtractor(ast.NodeVisitor, BaseASTEntityExtractor):
1606
+ """
1607
+ AST-based entity extractor for Python code.
1608
+ Also detects API endpoint definitions (FastAPI, Flask, Django REST Framework).
1609
+ """
1610
+
1611
+ # Common HTTP decorators/patterns for Python web frameworks
1612
+ API_DECORATORS = {
1613
+ 'route', # Flask @app.route
1614
+ 'get', 'post', 'put', 'patch', 'delete', 'head', 'options', # FastAPI/Flask methods
1615
+ 'api_view', # DRF @api_view
1616
+ }
1617
+
1618
+ def __init__(self):
1619
+ self.declared_entities: List[Dict[str, Any]] = []
1620
+ self.called_entities: List[str] = []
1621
+ self.current_class: Optional[str] = None
1622
+ self.current_function: Optional[str] = None
1623
+ self.api_endpoints: List[Dict[str, Any]] = [] # Track API endpoint definitions
1624
+
1625
+ def reset(self) -> None:
1626
+ """Clear previous extraction state including context"""
1627
+ self.declared_entities = []
1628
+ self.called_entities = []
1629
+ self.current_class = None
1630
+ self.current_function = None
1631
+ self.api_endpoints = []
1632
+
1633
+ def _get_type_annotation(self, node: ast.AST) -> str:
1634
+ """Extract type annotation from AST node"""
1635
+ if isinstance(node, ast.Name):
1636
+ return node.id
1637
+ elif isinstance(node, ast.Constant):
1638
+ return type(node.value).__name__
1639
+ elif isinstance(node, ast.Attribute):
1640
+ return f"{self._get_type_annotation(node.value)}.{node.attr}"
1641
+ elif isinstance(node, ast.Subscript):
1642
+ # Handle generic types like List[str], Dict[str, int]
1643
+ base = self._get_type_annotation(node.value)
1644
+ if isinstance(node.slice, ast.Tuple):
1645
+ args = [self._get_type_annotation(elt) for elt in node.slice.elts]
1646
+ return f"{base}[{', '.join(args)}]"
1647
+ else:
1648
+ arg = self._get_type_annotation(node.slice)
1649
+ return f"{base}[{arg}]"
1650
+ return "unknown"
1651
+
1652
+ def _infer_type_from_value(self, node: ast.AST) -> str:
1653
+ """Infer type from assigned value"""
1654
+ if isinstance(node, ast.Constant):
1655
+ return type(node.value).__name__
1656
+ elif isinstance(node, ast.List):
1657
+ return "list"
1658
+ elif isinstance(node, ast.Dict):
1659
+ return "dict"
1660
+ elif isinstance(node, ast.Set):
1661
+ return "set"
1662
+ elif isinstance(node, ast.Tuple):
1663
+ return "tuple"
1664
+ elif isinstance(node, ast.Call):
1665
+ if isinstance(node.func, ast.Name):
1666
+ return node.func.id # Constructor call
1667
+ elif isinstance(node.func, ast.Attribute):
1668
+ return "unknown"
1669
+ elif isinstance(node, ast.Name):
1670
+ return "unknown" # Reference to another variable
1671
+ return "unknown"
1672
+
1673
+ def visit_ClassDef(self, node: ast.ClassDef):
1674
+ """Visit class definitions"""
1675
+ old_class = self.current_class
1676
+ self.current_class = node.name
1677
+
1678
+ # Add class to declared entities
1679
+ self.declared_entities.append({
1680
+ "name": node.name,
1681
+ "type": "class"
1682
+ })
1683
+
1684
+ # Record base classes as called entities
1685
+ for base in node.bases:
1686
+ if isinstance(base, ast.Name):
1687
+ self.called_entities.append(base.id)
1688
+ elif isinstance(base, ast.Attribute):
1689
+ self.called_entities.append(self._get_type_annotation(base))
1690
+
1691
+ # Continue visiting child nodes
1692
+ self.generic_visit(node)
1693
+ self.current_class = old_class
1694
+
1695
+ def visit_FunctionDef(self, node: ast.FunctionDef):
1696
+ """Visit function/method definitions and detect API endpoints"""
1697
+ old_function = self.current_function
1698
+
1699
+ if self.current_class:
1700
+ # This is a method
1701
+ full_name = f"{self.current_class}.{node.name}"
1702
+ entity_type = "method"
1703
+ else:
1704
+ # This is a function
1705
+ full_name = node.name
1706
+ entity_type = "function"
1707
+
1708
+ self.current_function = full_name
1709
+
1710
+ # Check for API endpoint decorators
1711
+ api_info = self._extract_api_endpoint_from_decorators(node.decorator_list, full_name)
1712
+ if api_info:
1713
+ # Mark this as an API endpoint
1714
+ self.declared_entities.append({
1715
+ "name": full_name,
1716
+ "type": "api_endpoint",
1717
+ "endpoint": api_info.get("endpoint"),
1718
+ "methods": api_info.get("methods")
1719
+ })
1720
+ self.api_endpoints.append(api_info)
1721
+ else:
1722
+ self.declared_entities.append({
1723
+ "name": full_name,
1724
+ "type": entity_type
1725
+ })
1726
+
1727
+ # Process parameters
1728
+ for arg in node.args.args:
1729
+ if arg.arg == 'self' and self.current_class:
1730
+ continue # Skip self parameter
1731
+
1732
+ dtype = "unknown"
1733
+ if arg.annotation:
1734
+ dtype = self._get_type_annotation(arg.annotation)
1735
+
1736
+ param_name = f"{full_name}.{arg.arg}" if entity_type == "method" else arg.arg
1737
+ self.declared_entities.append({
1738
+ "name": param_name,
1739
+ "type": "variable",
1740
+ "dtype": dtype
1741
+ })
1742
+
1743
+ # Continue visiting child nodes
1744
+ self.generic_visit(node)
1745
+ self.current_function = old_function
1746
+
1747
+ def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
1748
+ """Visit async function/method definitions"""
1749
+ # Treat async functions the same as regular functions
1750
+ self.visit_FunctionDef(node)
1751
+
1752
+ def visit_Assign(self, node: ast.Assign):
1753
+ """Visit assignment statements"""
1754
+ # Infer type from the assigned value
1755
+ dtype = self._infer_type_from_value(node.value)
1756
+
1757
+ for target in node.targets:
1758
+ if isinstance(target, ast.Name):
1759
+ # Simple variable assignment
1760
+ var_name = target.id
1761
+ if self.current_class and self.current_function and self.current_function.startswith(self.current_class):
1762
+ # Local variable in method
1763
+ pass # Could add local variables if needed
1764
+ else:
1765
+ # Module-level variable
1766
+ self.declared_entities.append({
1767
+ "name": var_name,
1768
+ "type": "variable",
1769
+ "dtype": dtype
1770
+ })
1771
+
1772
+ elif isinstance(target, ast.Attribute) and isinstance(target.value, ast.Name):
1773
+ # Attribute assignment like self.name = value
1774
+ if target.value.id == 'self' and self.current_class:
1775
+ attr_name = f"{self.current_class}.{target.attr}"
1776
+ self.declared_entities.append({
1777
+ "name": attr_name,
1778
+ "type": "variable",
1779
+ "dtype": dtype
1780
+ })
1781
+
1782
+ # Continue visiting to catch function calls in the assignment
1783
+ self.generic_visit(node)
1784
+
1785
+ def visit_AnnAssign(self, node: ast.AnnAssign):
1786
+ """Visit annotated assignment statements (PEP 526)"""
1787
+ if isinstance(node.target, ast.Name):
1788
+ dtype = self._get_type_annotation(node.annotation)
1789
+ var_name = node.target.id
1790
+
1791
+ self.declared_entities.append({
1792
+ "name": var_name,
1793
+ "type": "variable",
1794
+ "dtype": dtype
1795
+ })
1796
+
1797
+ elif isinstance(node.target, ast.Attribute) and isinstance(node.target.value, ast.Name):
1798
+ if node.target.value.id == 'self' and self.current_class:
1799
+ dtype = self._get_type_annotation(node.annotation)
1800
+ attr_name = f"{self.current_class}.{node.target.attr}"
1801
+ self.declared_entities.append({
1802
+ "name": attr_name,
1803
+ "type": "variable",
1804
+ "dtype": dtype
1805
+ })
1806
+
1807
+ # Continue visiting
1808
+ if node.value:
1809
+ self.generic_visit(node)
1810
+
1811
+ def visit_Import(self, node: ast.Import):
1812
+ """Visit import statements"""
1813
+ for alias in node.names:
1814
+ # Record the imported module/package
1815
+ self.called_entities.append(alias.name)
1816
+ self.generic_visit(node)
1817
+
1818
+ def visit_ImportFrom(self, node: ast.ImportFrom):
1819
+ """Visit from...import statements"""
1820
+ if node.module:
1821
+ # Record the module being imported from
1822
+ self.called_entities.append(node.module)
1823
+ # Optionally, also record specific imports as module.name
1824
+ for alias in node.names:
1825
+ if alias.name != '*':
1826
+ self.called_entities.append(f"{node.module}.{alias.name}")
1827
+ else:
1828
+ # Relative imports without module (from . import x)
1829
+ for alias in node.names:
1830
+ if alias.name != '*':
1831
+ self.called_entities.append(alias.name)
1832
+ self.generic_visit(node)
1833
+
1834
+ def visit_Call(self, node: ast.Call):
1835
+ """Visit function/method calls"""
1836
+ if isinstance(node.func, ast.Name):
1837
+ # Simple function call
1838
+ self.called_entities.append(node.func.id)
1839
+
1840
+ elif isinstance(node.func, ast.Attribute):
1841
+ # Method call or attribute access
1842
+ if isinstance(node.func.value, ast.Name):
1843
+ # obj.method() - we need to infer the class of obj
1844
+ # For now, just record the method name
1845
+ method_name = node.func.attr
1846
+ # Try to find the variable type from our declared entities
1847
+ obj_name = node.func.value.id
1848
+ obj_class = self._find_variable_type(obj_name)
1849
+ if obj_class and obj_class != "unknown":
1850
+ self.called_entities.append(f"{obj_class}.{method_name}")
1851
+ else:
1852
+ # Fallback: just record the method call
1853
+ self.called_entities.append(method_name)
1854
+
1855
+ elif isinstance(node.func.value, ast.Attribute):
1856
+ # Nested attribute access like module.Class.method()
1857
+ full_name = self._get_type_annotation(node.func)
1858
+ self.called_entities.append(full_name)
1859
+
1860
+ # Continue visiting child nodes
1861
+ self.generic_visit(node)
1862
+
1863
+ def _find_variable_type(self, var_name: str) -> str:
1864
+ """Find the type of a variable from declared entities"""
1865
+ for entity in self.declared_entities:
1866
+ if entity["name"] == var_name and entity["type"] == "variable":
1867
+ return entity.get("dtype", "unknown")
1868
+ return "unknown"
1869
+
1870
+ def _extract_api_endpoint_from_decorators(self, decorators: List[ast.expr], function_name: str) -> Optional[Dict[str, Any]]:
1871
+ """
1872
+ Extract API endpoint information from function decorators.
1873
+ Handles patterns like:
1874
+ - @app.route("/api/users", methods=["GET", "POST"]) # Flask
1875
+ - @app.get("/api/users") # FastAPI
1876
+ - @router.post("/api/users") # FastAPI with router
1877
+ - @api_view(['GET', 'POST']) # Django REST Framework
1878
+ """
1879
+ for decorator in decorators:
1880
+ # Handle @app.route(...) or @app.get(...)
1881
+ if isinstance(decorator, ast.Call):
1882
+ if isinstance(decorator.func, ast.Attribute):
1883
+ # e.g., app.route, app.get, router.post
1884
+ method_name = decorator.func.attr.lower()
1885
+
1886
+ if method_name in self.API_DECORATORS:
1887
+ endpoint = None
1888
+ http_methods = []
1889
+
1890
+ # Extract endpoint from first positional argument
1891
+ if decorator.args and isinstance(decorator.args[0], ast.Constant):
1892
+ endpoint = decorator.args[0].value
1893
+
1894
+ # For FastAPI-style decorators (@app.get, @app.post)
1895
+ if method_name in {'get', 'post', 'put', 'patch', 'delete', 'head', 'options'}:
1896
+ http_methods = [method_name.upper()]
1897
+
1898
+ # For Flask-style @app.route with methods kwarg
1899
+ elif method_name == 'route':
1900
+ for keyword in decorator.keywords:
1901
+ if keyword.arg == 'methods':
1902
+ if isinstance(keyword.value, ast.List):
1903
+ http_methods = [
1904
+ elt.value for elt in keyword.value.elts
1905
+ if isinstance(elt, ast.Constant) and isinstance(elt.value, str)
1906
+ ]
1907
+ if not http_methods:
1908
+ http_methods = ['GET'] # Flask default
1909
+
1910
+ # For DRF @api_view(['GET', 'POST'])
1911
+ elif method_name == 'api_view':
1912
+ if decorator.args and isinstance(decorator.args[0], ast.List):
1913
+ http_methods = [
1914
+ elt.value for elt in decorator.args[0].elts
1915
+ if isinstance(elt, ast.Constant) and isinstance(elt.value, str)
1916
+ ]
1917
+
1918
+ if endpoint:
1919
+ return {
1920
+ "function": function_name,
1921
+ "endpoint": endpoint,
1922
+ "methods": http_methods,
1923
+ "type": "api_endpoint_definition"
1924
+ }
1925
+
1926
+ return None
1927
+
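As a quick illustration of the decorator shapes this method recognizes, a hedged sketch; the snippet below is hypothetical and only parsed, never executed, so the undefined `app` name is harmless:

    import textwrap

    sample = textwrap.dedent('''
        @app.get("/api/users")
        def list_users():
            return []
    ''')
    ex = PythonASTEntityExtractor()
    ex.extract_entities(sample)
    # ex.api_endpoints would then contain
    # {"function": "list_users", "endpoint": "/api/users",
    #  "methods": ["GET"], "type": "api_endpoint_definition"}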
1928
+ def extract_entities(self, code: str, file_path: str = None) -> Tuple[List[Dict[str, Any]], List[str]]:
1929
+ """
1930
+ Extract entities from Python code using AST parsing
1931
+
1932
+ Args:
1933
+ code: Python source code as string
1934
+ file_path: Optional path to the source file (for context)
1935
+
1936
+ Returns:
1937
+ Tuple of (declared_entities, called_entities)
1938
+ """
1939
+ # Ensure fresh state on each extraction
1940
+ self.reset()
1941
+
1942
+ try:
1943
+ tree = ast.parse(code)
1944
+ self.visit(tree)
1945
+
1946
+ # Remove duplicates while preserving order
1947
+ seen_declared = set()
1948
+ unique_declared = []
1949
+ for entity in self.declared_entities:
1950
+ key = (entity["name"], entity["type"], entity.get("dtype"))
1951
+ if key not in seen_declared:
1952
+ unique_declared.append(entity)
1953
+ seen_declared.add(key)
1954
+
1955
+ unique_called = list(dict.fromkeys(self.called_entities)) # Remove duplicates
1956
+
1957
+ return unique_declared, unique_called
1958
+
1959
+ except SyntaxError as e:
1960
+ logger.error(f"Syntax error in Python code: {e}")
1961
+ return [], []
1962
+ except Exception as e:
1963
+ logger.error(f"Error parsing Python code: {e}", exc_info=True)
1964
+ return [], []
1965
+
1966
+
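A broader usage sketch for the extractor as a whole, assuming only this module and the standard library:

    import textwrap

    src = textwrap.dedent('''
        import os

        class Greeter:
            def greet(self, name: str) -> str:
                return f"hello {name}"

        g = Greeter()
        g.greet("world")
    ''')
    declared, called = PythonASTEntityExtractor().extract_entities(src)
    # declared includes {"name": "Greeter", "type": "class"},
    # {"name": "Greeter.greet", "type": "method"} and the annotated parameter
    # {"name": "Greeter.greet.name", "type": "variable", "dtype": "str"};
    # called includes "os", "Greeter" and "Greeter.greet" (the receiver `g`
    # is resolved to Greeter through its inferred dtype)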
1967
+ class HybridEntityExtractor:
1968
+ """
1969
+ Hybrid entity extractor that dispatches to AST-based extractors for known languages;
1970
+ unsupported languages raise so the caller can fall back to LLM extraction
1971
+ """
1972
+
1973
+ def __init__(self):
1974
+ self.extractors = {
1975
+ 'py': PythonASTEntityExtractor(),
1976
+ 'c': CEntityExtractor(),
1977
+ 'h': CppEntityExtractor(), # C/C++ headers
1978
+ 'cpp': CppEntityExtractor(),
1979
+ 'cc': CppEntityExtractor(),
1980
+ 'cxx': CppEntityExtractor(),
1981
+ 'hpp': CppEntityExtractor(),
1982
+ 'hxx': CppEntityExtractor(),
1983
+ 'hh': CppEntityExtractor(),
1984
+ 'java': JavaEntityExtractor(),
1985
+ 'js': JavaScriptEntityExtractor(), # ✅ NEW
1986
+ 'jsx': JavaScriptEntityExtractor(), # ✅ NEW
1987
+ 'ts': JavaScriptEntityExtractor(), # TypeScript uses similar AST
1988
+ 'tsx': JavaScriptEntityExtractor(), # TSX similar to JSX
1989
+ 'rs': RustEntityExtractor(),
1990
+ 'html': HTMLEntityExtractor()
1991
+ }
1992
+
1993
+ def _get_language_from_filename(self, file_name: str) -> str:
1994
+ ext = file_name.split('.')[-1].lower()
1995
+ return ext
1996
+
1997
+ def extract_entities(self, code: str, file_name: str):
1998
+
1999
+ lang = self._get_language_from_filename(file_name)
2000
+ extractor = self.extractors.get(lang)
2001
+
2002
+ if extractor:
2003
+ # Reset the shared extractor instance to ensure no state is carried over
2004
+ try:
2005
+ extractor.reset()
2006
+ except Exception:
2007
+ # If extractor doesn't implement reset for some reason, ignore and proceed
2008
+ pass
2009
+
2010
+ logger.info(f"Using AST extraction for {lang.upper()} file: {file_name}")
2011
+ try:
2012
+ # Try to pass file_name if the extractor supports it (C++ extractor does)
2013
+ try:
2014
+ declared_entities, called_entities = extractor.extract_entities(code, file_path=file_name)
2015
+ except TypeError:
2016
+ # Fallback for extractors that don't accept file_path parameter
2017
+ declared_entities, called_entities = extractor.extract_entities(code)
2018
+
2019
+ # Add aliases to each declared entity based on file path
2020
+ for entity in declared_entities:
2021
+ entity_name = entity.get('name', '')
2022
+ if entity_name:
2023
+ aliases = generate_entity_aliases(entity_name, file_name)
2024
+ entity['aliases'] = aliases
2025
+ logger.debug(f"Generated aliases for entity '{entity_name}': {aliases}")
2026
+
2027
+ return declared_entities, called_entities
2028
+ except Exception as e:
2029
+ logger.error(f"Error during AST extraction for file {file_name}: {e}", exc_info=True)
2030
+ return [], []
2031
+ else:
2032
+ raise Exception(f"Using LLM extraction for unsupported language: {file_name}")
RepoKnowledgeGraphLib/KnowledgeGraphMCPServer.py ADDED
@@ -0,0 +1,1107 @@
1
+ import os
2
+ from typing import Optional, Annotated
3
+ from fastmcp import FastMCP
4
+ from langfuse import get_client, observe
5
+
6
+ from .RepoKnowledgeGraph import RepoKnowledgeGraph
7
+
8
+
9
+ # Custom Exceptions
10
+ class MCPServerError(Exception):
11
+ """Base exception for MCP server errors"""
12
+ pass
13
+
14
+
15
+ class NodeNotFoundError(MCPServerError):
16
+ """Raised when a node is not found"""
17
+ pass
18
+
19
+
20
+ class EntityNotFoundError(MCPServerError):
21
+ """Raised when an entity is not found"""
22
+ pass
23
+
24
+
25
+ class InvalidInputError(MCPServerError):
26
+ """Raised when input validation fails"""
27
+ pass
28
+
29
+
30
+ class KnowledgeGraphMCPServer:
31
+ """
32
+ MCP Server for interacting with a codebase knowledge graph.
33
+
34
+ Attributes:
35
+ knowledge_graph (RepoKnowledgeGraph): The loaded knowledge graph object.
36
+ app (FastMCP): The FastMCP application instance for tool registration and serving.
37
+ """
38
+ def __init__(self, knowledge_graph: Optional[RepoKnowledgeGraph] = None, knowledge_graph_path: Optional[str] = None, server_name: str = "knowledge-graph-mcp-server"):
39
+ if knowledge_graph is not None:
40
+ self.knowledge_graph = knowledge_graph
41
+ else:
42
+ if knowledge_graph_path is None:
43
+ knowledge_graph_path = os.path.join(os.path.dirname(__file__), "knowledge_graph.json")
44
+ self.knowledge_graph = RepoKnowledgeGraph.load_graph_from_file(knowledge_graph_path)
45
+ self.langfuse = get_client()
46
+ self.app = FastMCP(server_name)
47
+ self.register_tools()
48
+
49
+ def _validate_node_exists(self, node_id: str) -> bool:
50
+ """Centralized node validation"""
51
+ if node_id not in self.knowledge_graph.graph:
52
+ raise NodeNotFoundError(f"Node '{node_id}' not found in knowledge graph")
53
+ return True
54
+
55
+ def _validate_entity_exists(self, entity_name: str) -> bool:
56
+ """Centralized entity validation"""
57
+ if entity_name not in self.knowledge_graph.entities:
58
+ raise EntityNotFoundError(f"Entity '{entity_name}' not found in knowledge graph")
59
+ return True
60
+
61
+ def _validate_positive_int(self, value: int, param_name: str) -> bool:
62
+ """Validate that an integer parameter is positive"""
63
+ if value <= 0:
64
+ raise InvalidInputError(f"{param_name} must be a positive integer, got {value}")
65
+ return True
66
+
67
+ def _sanitize_chunk_dict(self, chunk_dict: dict) -> dict:
68
+ """Remove embedding data from chunk dictionary before returning to user"""
69
+ sanitized = chunk_dict.copy()
70
+ sanitized.pop('embedding', None)
71
+ return sanitized
72
+
73
+ def _sanitize_node_dict(self, node_dict: dict) -> dict:
74
+ """Remove embedding data from node dictionary before returning to user"""
75
+ sanitized = node_dict.copy()
76
+ if 'data' in sanitized and isinstance(sanitized['data'], dict):
77
+ sanitized['data'] = sanitized['data'].copy()
78
+ sanitized['data'].pop('embedding', None)
79
+ sanitized.pop('embedding', None)
80
+ return sanitized
81
+
82
+ def _handle_error(self, error: Exception, context: str = "") -> dict:
83
+ """Centralized error handling with structured response"""
84
+ if isinstance(error, NodeNotFoundError):
85
+ return {
86
+ "error": str(error),
87
+ "error_type": "node_not_found",
88
+ "context": context
89
+ }
90
+ elif isinstance(error, EntityNotFoundError):
91
+ return {
92
+ "error": str(error),
93
+ "error_type": "entity_not_found",
94
+ "context": context
95
+ }
96
+ elif isinstance(error, InvalidInputError):
97
+ return {
98
+ "error": str(error),
99
+ "error_type": "invalid_input",
100
+ "context": context
101
+ }
102
+ else:
103
+ return {
104
+ "error": str(error),
105
+ "error_type": "internal_error",
106
+ "context": context
107
+ }
108
+
109
+ @classmethod
110
+ def from_path(cls, path: str, skip_dirs=None, index_nodes=True, describe_nodes=False, extract_entities=False, model_service_kwargs=None, code_index_kwargs=None, server_name: str = "knowledge-graph-mcp-server"):
111
+ """
112
+ Build a KnowledgeGraphMCPServer from a code repository path.
113
+ """
114
+ if skip_dirs is None:
115
+ skip_dirs = []
116
+ if model_service_kwargs is None:
117
+ model_service_kwargs = {}
118
+ kg = RepoKnowledgeGraph.from_path(path, skip_dirs=skip_dirs, index_nodes=index_nodes, describe_nodes=describe_nodes, extract_entities=extract_entities, model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)
119
+ return cls(knowledge_graph=kg, server_name=server_name)
120
+
121
+ @classmethod
122
+ def from_file(cls, filepath: str, index_nodes=True, use_embed=True, model_service_kwargs=None, code_index_kwargs = None, server_name: str = "knowledge-graph-mcp-server"):
123
+ """
124
+ Build a KnowledgeGraphMCPServer from a serialized knowledge graph file.
125
+ """
126
+ if model_service_kwargs is None:
127
+ model_service_kwargs = {}
128
+ kg = RepoKnowledgeGraph.load_graph_from_file(filepath, index_nodes=index_nodes, use_embed=use_embed, model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)
129
+ return cls(knowledge_graph=kg, server_name=server_name)
130
+
131
+ @classmethod
132
+ def from_repo(cls, repo_url: str, index_nodes=True, describe_nodes=False, model_service_kwargs=None, code_index_kwargs=None, server_name: str = "knowledge-graph-mcp-server", github_token=None, allow_unauthenticated_clone=True, skip_dirs=None, extract_entities=True):
133
+ if model_service_kwargs is None:
134
+ model_service_kwargs = {}
135
+ kg = RepoKnowledgeGraph.from_repo(repo_url=repo_url, describe_nodes=describe_nodes, index_nodes=index_nodes, model_service_kwargs=model_service_kwargs, github_token=github_token, allow_unauthenticated_clone=allow_unauthenticated_clone, skip_dirs=skip_dirs, extract_entities=extract_entities, code_index_kwargs=code_index_kwargs)
136
+ return cls(knowledge_graph=kg, server_name=server_name)
137
+
138
+
139
+ def register_tools(self):
140
+ @self.app.tool(
141
+ description="Get detailed information about a node in the knowledge graph, including its type, name, description, declared and called entities, and a content preview."
142
+ )
143
+ @observe(as_type='tool')
144
+ async def get_node_info(
145
+ node_id: Annotated[str, "The ID of the node to retrieve information for."]
146
+ ) -> dict:
147
+ try:
148
+ self._validate_node_exists(node_id)
149
+ node = self.knowledge_graph.graph.nodes[node_id]['data']
150
+
151
+ declared_entities = getattr(node, 'declared_entities', [])
152
+ called_entities = getattr(node, 'called_entities', [])
153
+ content = getattr(node, 'content', None)
154
+ content_preview = content[:200] + "..." if content and len(content) > 200 else content
155
+
156
+ return {
157
+ "node_id": node_id,
158
+ "class": node.__class__.__name__,
159
+ "name": getattr(node, 'name', 'Unknown'),
160
+ "type": getattr(node, 'node_type', 'Unknown'),
161
+ "description": getattr(node, 'description', None),
162
+ "declared_entities": declared_entities,
163
+ "called_entities": called_entities,
164
+ "content_preview": content_preview,
165
+ "text": f"Node {node_id} ({getattr(node, 'name', '?')}) β€” {getattr(node, 'node_type', '?')} with {len(declared_entities)} declared and {len(called_entities)} called entities."
166
+ }
167
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
168
+ return self._handle_error(e, "get_node_info")
169
+ except Exception as e:
170
+ return self._handle_error(e, "get_node_info")
171
+
172
+ @self.app.tool(
173
+ description="List all incoming and outgoing edges for a node, showing relationships to other nodes."
174
+ )
175
+ @observe(as_type='tool')
176
+ async def get_node_edges(
177
+ node_id: Annotated[str, "The ID of the node whose edges to list."]
178
+ ) -> dict:
179
+ try:
180
+ self._validate_node_exists(node_id)
181
+ g = self.knowledge_graph.graph
182
+
183
+ incoming = [
184
+ {"source": src, "target": tgt, "relation": data.get("relation", "?")}
185
+ for src, tgt, data in g.in_edges(node_id, data=True)
186
+ ]
187
+ outgoing = [
188
+ {"source": src, "target": tgt, "relation": data.get("relation", "?")}
189
+ for src, tgt, data in g.out_edges(node_id, data=True)
190
+ ]
191
+
192
+ return {
193
+ "node_id": node_id,
194
+ "incoming": incoming,
195
+ "outgoing": outgoing,
196
+ "incoming_count": len(incoming),
197
+ "outgoing_count": len(outgoing),
198
+ "text": f"Node '{node_id}' has {len(incoming)} incoming and {len(outgoing)} outgoing edges."
199
+ }
200
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
201
+ return self._handle_error(e, "get_node_edges")
202
+ except Exception as e:
203
+ return self._handle_error(e, "get_node_edges")
204
+
205
+ @self.app.tool(
206
+ description="Search for nodes in the knowledge graph by query string, using the code index semantic and keyword search."
207
+ )
208
+ @observe(as_type='tool')
209
+ async def search_nodes(
210
+ query: Annotated[str, "The search string to match against code index."],
211
+ limit: Annotated[int, "Maximum number of results to return."] = 10
212
+ ) -> dict:
213
+ try:
214
+ self._validate_positive_int(limit, "limit")
215
+
216
+ results = self.knowledge_graph.code_index.query(query, n_results=limit)
217
+ metadatas = results.get("metadatas", [[]])[0]
218
+
219
+ if not metadatas:
220
+ return {"query": query, "results": [], "text": f"No results found for '{query}'."}
221
+
222
+ structured_results = [
223
+ {
224
+ "id": res.get("id"),
225
+ "content": res.get("content"),
226
+ "declared_entities": res.get("declared_entities"),
227
+ "called_entities": res.get("called_entities")
228
+ }
229
+ for res in metadatas
230
+ ]
231
+
232
+ return {
233
+ "query": query,
234
+ "count": len(structured_results),
235
+ "results": structured_results,
236
+ "text": f"Found {len(structured_results)} result(s) for query '{query}'."
237
+ }
238
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
239
+ return self._handle_error(e, "search_nodes")
240
+ except Exception as e:
241
+ return self._handle_error(e, "search_nodes")
242
+
243
+ @self.app.tool(
244
+ description="Get overall statistics about the knowledge graph, including node and edge counts, types, and relations."
245
+ )
246
+ @observe(as_type='tool')
247
+ async def get_graph_stats() -> dict:
248
+ g = self.knowledge_graph.graph
249
+ num_nodes = g.number_of_nodes()
250
+ num_edges = g.number_of_edges()
251
+
252
+ node_types = {}
253
+ for _, node_attrs in g.nodes(data=True):
254
+ node_type = getattr(node_attrs['data'], 'node_type', 'Unknown')
255
+ node_types[node_type] = node_types.get(node_type, 0) + 1
256
+
257
+ edge_relations = {}
258
+ for _, _, attrs in g.edges(data=True):
259
+ relation = attrs.get('relation', 'Unknown')
260
+ edge_relations[relation] = edge_relations.get(relation, 0) + 1
261
+
262
+ return {
263
+ "total_nodes": num_nodes,
264
+ "total_edges": num_edges,
265
+ "node_types": node_types,
266
+ "edge_relations": edge_relations,
267
+ "text": f"Graph with {num_nodes} nodes, {num_edges} edges, {len(node_types)} node types, and {len(edge_relations)} relation types."
268
+ }
269
+
270
+ @self.app.tool(
271
+ description="List nodes of a specific type in the knowledge graph."
272
+ )
273
+ @observe(as_type='tool')
274
+ async def list_nodes_by_type(
275
+ node_type: Annotated[str, "The type of nodes to list (e.g., 'function', 'class', 'file')."],
276
+ limit: Annotated[int, "Maximum number of nodes to return."] = 20
277
+ ) -> dict:
278
+ g = self.knowledge_graph.graph
279
+ matching_nodes = [
280
+ {
281
+ "id": node_id,
282
+ "name": getattr(data['data'], 'name', 'Unknown')
283
+ }
284
+ for node_id, data in g.nodes(data=True)
285
+ if getattr(data['data'], 'node_type', None) == node_type
286
+ ][:limit]
287
+
288
+ if not matching_nodes:
289
+ return {"node_type": node_type, "results": [], "text": f"No nodes found of type '{node_type}'."}
290
+
291
+ return {
292
+ "node_type": node_type,
293
+ "count": len(matching_nodes),
294
+ "results": matching_nodes,
295
+ "text": f"Found {len(matching_nodes)} node(s) of type '{node_type}'."
296
+ }
297
+
298
+ @self.app.tool(
299
+ description="Get all nodes directly connected to a given node, including the relationship type."
300
+ )
301
+ @observe(as_type='tool')
302
+ async def get_neighbors(
303
+ node_id: Annotated[str, "The ID of the node whose neighbors to retrieve."]
304
+ ) -> dict:
305
+ """Get all nodes directly connected to this node, with their relationship types."""
306
+ try:
307
+ self._validate_node_exists(node_id)
308
+
309
+ neighbors = self.knowledge_graph.get_neighbors(node_id)
310
+ if not neighbors:
311
+ return {
312
+ "node_id": node_id,
313
+ "neighbors": [],
314
+ "text": f"No neighbors found for node '{node_id}'"
315
+ }
316
+
317
+ neighbor_list = []
318
+ for neighbor in neighbors[:20]:
319
+ neighbor_info = {
320
+ "id": neighbor.id,
321
+ "name": getattr(neighbor, 'name', 'Unknown'),
322
+ "type": neighbor.node_type,
323
+ "relation": None
324
+ }
325
+
326
+ if self.knowledge_graph.graph.has_edge(node_id, neighbor.id):
327
+ edge_data = self.knowledge_graph.graph.get_edge_data(node_id, neighbor.id)
328
+ neighbor_info["relation"] = edge_data.get('relation', 'Unknown')
329
+ neighbor_info["direction"] = "outgoing"
330
+ elif self.knowledge_graph.graph.has_edge(neighbor.id, node_id):
331
+ edge_data = self.knowledge_graph.graph.get_edge_data(neighbor.id, node_id)
332
+ neighbor_info["relation"] = edge_data.get('relation', 'Unknown')
333
+ neighbor_info["direction"] = "incoming"
334
+
335
+ neighbor_list.append(neighbor_info)
336
+
337
+ text = f"Neighbors of '{node_id}' ({len(neighbors)} total):\n\n"
338
+ for neighbor in neighbor_list:
339
+ text += f"- {neighbor['id']}: {neighbor['name']} ({neighbor['type']})\n"
340
+ if neighbor['relation']:
341
+ arrow = "β†’" if neighbor['direction'] == "outgoing" else "←"
342
+ text += f" {arrow} Relation: {neighbor['relation']}\n"
343
+
344
+ if len(neighbors) > 20:
345
+ text += f"\n... and {len(neighbors) - 20} more neighbors\n"
346
+
347
+ return {
348
+ "node_id": node_id,
349
+ "total_neighbors": len(neighbors),
350
+ "neighbors": neighbor_list,
351
+ "has_more": len(neighbors) > 20,
352
+ "text": text
353
+ }
354
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
355
+ return self._handle_error(e, "get_neighbors")
356
+ except Exception as e:
357
+ return self._handle_error(e, "get_neighbors")
358
+
359
+ @self.app.tool(
360
+ description="Find where an entity (function, class, variable, etc.) is declared or defined in the codebase."
361
+ )
362
+ @observe(as_type='tool')
363
+ async def go_to_definition(
364
+ entity_name: Annotated[str, "The name of the entity to find the definition for."]
365
+ ) -> dict:
366
+ """Find where an entity is declared/defined in the codebase."""
367
+ try:
368
+ self._validate_entity_exists(entity_name)
369
+
370
+ entity_info = self.knowledge_graph.entities[entity_name]
371
+ declaring_chunks = entity_info.get('declaring_chunk_ids', [])
372
+
373
+ if not declaring_chunks:
374
+ return {
375
+ "entity_name": entity_name,
376
+ "declarations": [],
377
+ "text": f"Entity '{entity_name}' found but no declarations identified."
378
+ }
379
+
380
+ declarations = []
381
+ for chunk_id in declaring_chunks[:5]:
382
+ if chunk_id in self.knowledge_graph.graph:
383
+ chunk = self.knowledge_graph.graph.nodes[chunk_id]['data']
384
+ content_preview = chunk.content[:150] + "..." if len(chunk.content) > 150 else chunk.content
385
+ declarations.append({
386
+ "chunk_id": chunk_id,
387
+ "file_path": chunk.path,
388
+ "order_in_file": chunk.order_in_file,
389
+ "content_preview": content_preview
390
+ })
391
+
392
+ text = f"Definition(s) for '{entity_name}':\n\n"
393
+ text += f"Type: {', '.join(entity_info.get('type', ['Unknown']))}\n"
394
+ if entity_info.get('dtype'):
395
+ text += f"Data Type: {entity_info['dtype']}\n"
396
+ text += f"\nDeclared in {len(declaring_chunks)} location(s):\n\n"
397
+
398
+ for decl in declarations:
399
+ text += f"- Chunk: {decl['chunk_id']}\n"
400
+ text += f" File: {decl['file_path']}\n"
401
+ text += f" Order: {decl['order_in_file']}\n"
402
+ text += f" Content: {decl['content_preview']}\n\n"
403
+
404
+ if len(declaring_chunks) > 5:
405
+ text += f"... and {len(declaring_chunks) - 5} more locations\n"
406
+
407
+ return {
408
+ "entity_name": entity_name,
409
+ "type": entity_info.get('type', []),
410
+ "dtype": entity_info.get('dtype'),
411
+ "total_declarations": len(declaring_chunks),
412
+ "declarations": declarations,
413
+ "has_more": len(declaring_chunks) > 5,
414
+ "text": text
415
+ }
416
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
417
+ return self._handle_error(e, "go_to_definition")
418
+ except Exception as e:
419
+ return self._handle_error(e, "go_to_definition")
420
+
421
+ @self.app.tool(
422
+ description="Find all usages or calls of an entity (function, class, variable, etc.) in the codebase."
423
+ )
424
+ @observe(as_type='tool')
425
+ async def find_usages(
426
+ entity_name: Annotated[str, "The name of the entity to find usages for."],
427
+ limit: Annotated[int, "Maximum number of usages to return."] = 20
428
+ ) -> dict:
429
+ """Find where an entity is used/called in the codebase."""
430
+ try:
431
+ self._validate_entity_exists(entity_name)
432
+ self._validate_positive_int(limit, "limit")
433
+
434
+ entity_info = self.knowledge_graph.entities[entity_name]
435
+ calling_chunks = entity_info.get('calling_chunk_ids', [])
436
+
437
+ if not calling_chunks:
438
+ return {
439
+ "entity_name": entity_name,
440
+ "usages": [],
441
+ "text": f"Entity '{entity_name}' found but no usages identified."
442
+ }
443
+
444
+ usages = []
445
+ for chunk_id in calling_chunks[:limit]:
446
+ if chunk_id in self.knowledge_graph.graph:
447
+ chunk = self.knowledge_graph.graph.nodes[chunk_id]['data']
448
+ content_preview = chunk.content[:150] + "..." if len(chunk.content) > 150 else chunk.content
449
+ usages.append({
450
+ "chunk_id": chunk_id,
451
+ "file_path": chunk.path,
452
+ "order_in_file": chunk.order_in_file,
453
+ "content_preview": content_preview
454
+ })
455
+
456
+ text = f"Usages of '{entity_name}' ({len(calling_chunks)} total):\n\n"
457
+ for usage in usages:
458
+ text += f"- {usage['file_path']} (chunk {usage['order_in_file']})\n"
459
+ text += f" Content: {usage['content_preview']}\n\n"
460
+
461
+ if len(calling_chunks) > limit:
462
+ text += f"\n... and {len(calling_chunks) - limit} more usages\n"
463
+
464
+ return {
465
+ "entity_name": entity_name,
466
+ "total_usages": len(calling_chunks),
467
+ "usages": usages,
468
+ "has_more": len(calling_chunks) > limit,
469
+ "text": text
470
+ }
471
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
472
+ return self._handle_error(e, "find_usages")
473
+ except Exception as e:
474
+ return self._handle_error(e, "find_usages")
475
+
476
+ @self.app.tool(
477
+ description="Get an overview of the structure of a file, including its chunks and declared entities."
478
+ )
479
+ @observe(as_type='tool')
480
+ async def get_file_structure(
481
+ file_path: Annotated[str, "The path of the file to get the structure for."]
482
+ ) -> dict:
483
+ """Get an overview of chunks and entities in a specific file."""
484
+ try:
485
+ self._validate_node_exists(file_path)
486
+
487
+ file_node = self.knowledge_graph.graph.nodes[file_path]['data']
488
+ chunks = self.knowledge_graph.get_chunks_of_file(file_path)
489
+
490
+ declared_entities = []
491
+ if hasattr(file_node, 'declared_entities') and file_node.declared_entities:
492
+ for entity in file_node.declared_entities[:15]:
493
+ if isinstance(entity, dict):
494
+ declared_entities.append({
495
+ "name": entity.get('name', '?'),
496
+ "type": entity.get('type', '?')
497
+ })
498
+ else:
499
+ declared_entities.append({"name": str(entity), "type": "Unknown"})
500
+
501
+ chunk_list = []
502
+ for chunk in chunks[:10]:
503
+ chunk_list.append({
504
+ "id": chunk.id,
505
+ "order_in_file": chunk.order_in_file,
506
+ "description": chunk.description[:80] + "..." if chunk.description and len(chunk.description) > 80 else chunk.description
507
+ })
508
+
509
+ text = f"File Structure: {file_node.name}\n"
510
+ text += f"Path: {file_path}\n"
511
+ text += f"Language: {getattr(file_node, 'language', 'Unknown')}\n"
512
+ text += f"Total Chunks: {len(chunks)}\n\n"
513
+
514
+ if declared_entities:
515
+ text += f"Declared Entities ({len(file_node.declared_entities)}):\n"
516
+ for entity in declared_entities:
517
+ text += f" - {entity['name']} ({entity['type']})\n"
518
+ if len(file_node.declared_entities) > 15:
519
+ text += f" ... and {len(file_node.declared_entities) - 15} more\n"
520
+
521
+ text += f"\nChunks:\n"
522
+ for chunk_info in chunk_list:
523
+ text += f" [{chunk_info['order_in_file']}] {chunk_info['id']}\n"
524
+ if chunk_info['description']:
525
+ text += f" {chunk_info['description']}\n"
526
+
527
+ if len(chunks) > 10:
528
+ text += f" ... and {len(chunks) - 10} more chunks\n"
529
+
530
+ return {
531
+ "file_path": file_path,
532
+ "file_name": file_node.name,
533
+ "language": getattr(file_node, 'language', 'Unknown'),
534
+ "total_chunks": len(chunks),
535
+ "total_declared_entities": len(file_node.declared_entities) if hasattr(file_node, 'declared_entities') else 0,
536
+ "declared_entities": declared_entities,
537
+ "chunks": chunk_list,
538
+ "has_more_entities": hasattr(file_node, 'declared_entities') and len(file_node.declared_entities) > 15,
539
+ "has_more_chunks": len(chunks) > 10,
540
+ "text": text
541
+ }
542
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
543
+ return self._handle_error(e, "get_file_structure")
544
+ except Exception as e:
545
+ return self._handle_error(e, "get_file_structure")
546
+
547
+ @self.app.tool(
548
+ description="Get chunks related to a given chunk by a specific relationship (e.g., 'calls', 'contains')."
549
+ )
550
+ @observe(as_type='tool')
551
+ async def get_related_chunks(
552
+ chunk_id: Annotated[str, "The ID of the chunk to find related chunks for."],
553
+ relation_type: Annotated[str, "The type of relationship to filter by (e.g., 'calls', 'contains')."] = "calls"
554
+ ) -> dict:
555
+ """Get chunks related to this chunk by a specific relationship (e.g., 'calls', 'contains')."""
556
+ try:
557
+ self._validate_node_exists(chunk_id)
558
+
559
+ related = []
560
+ for _, target, attrs in self.knowledge_graph.graph.out_edges(chunk_id, data=True):
561
+ if attrs.get('relation') == relation_type:
562
+ target_node = self.knowledge_graph.graph.nodes[target]['data']
563
+ related.append({
564
+ "id": target,
565
+ "file_path": getattr(target_node, 'path', 'Unknown'),
566
+ "entity_name": attrs.get('entity_name')
567
+ })
568
+
569
+ if not related:
570
+ return {
571
+ "chunk_id": chunk_id,
572
+ "relation_type": relation_type,
573
+ "related_chunks": [],
574
+ "text": f"No chunks found with '{relation_type}' relationship from '{chunk_id}'"
575
+ }
576
+
577
+ text = f"Chunks related to '{chunk_id}' via '{relation_type}' ({len(related)} total):\n\n"
578
+ for chunk in related[:15]:
579
+ text += f"- {chunk['id']}\n"
580
+ text += f" File: {chunk['file_path']}\n"
581
+ if chunk['entity_name']:
582
+ text += f" Entity: {chunk['entity_name']}\n"
583
+
584
+ if len(related) > 15:
585
+ text += f"\n... and {len(related) - 15} more\n"
586
+
587
+ return {
588
+ "chunk_id": chunk_id,
589
+ "relation_type": relation_type,
590
+ "total_related": len(related),
591
+ "related_chunks": related[:15],
592
+ "has_more": len(related) > 15,
593
+ "text": text
594
+ }
595
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
596
+ return self._handle_error(e, "get_related_chunks")
597
+ except Exception as e:
598
+ return self._handle_error(e, "get_related_chunks")
599
+
600
+ @self.app.tool(
601
+ description="List all entities tracked in the knowledge graph, including their types, declaration, and usage counts."
602
+ )
603
+ @observe(as_type='tool')
604
+ async def list_all_entities(
605
+ limit: Annotated[int, "Maximum number of entities to return."] = 50
606
+ ) -> dict:
607
+ """List all entities tracked in the knowledge graph with their metadata."""
608
+ if not self.knowledge_graph.entities:
609
+ return {
610
+ "entities": [],
611
+ "text": "No entities found in the knowledge graph."
612
+ }
613
+
614
+ entities = []
615
+ for entity_name, info in list(self.knowledge_graph.entities.items())[:limit]:
616
+ entities.append({
617
+ "name": entity_name,
618
+ "types": info.get('type', ['Unknown']),
619
+ "declaration_count": len(info.get('declaring_chunk_ids', [])),
620
+ "usage_count": len(info.get('calling_chunk_ids', []))
621
+ })
622
+
623
+ text = f"All Entities ({len(self.knowledge_graph.entities)} total):\n\n"
624
+ for i, entity in enumerate(entities, 1):
625
+ text += f"{i}. {entity['name']}\n"
626
+ text += f" Types: {', '.join(entity['types'])}\n"
627
+ text += f" Declarations: {entity['declaration_count']}\n"
628
+ text += f" Usages: {entity['usage_count']}\n\n"
629
+
630
+ if len(self.knowledge_graph.entities) > limit:
631
+ text += f"... and {len(self.knowledge_graph.entities) - limit} more entities\n"
632
+
633
+ return {
634
+ "total_entities": len(self.knowledge_graph.entities),
635
+ "entities": entities,
636
+ "has_more": len(self.knowledge_graph.entities) > limit,
637
+ "text": text
638
+ }
639
+
640
+ # --- New Tools ---
641
+ @self.app.tool(
642
+ description="Show the diff between two code chunks or nodes by their IDs."
643
+ )
644
+ @observe(as_type='tool')
645
+ async def diff_chunks(
646
+ node_id_1: Annotated[str, "The ID of the first node/chunk."],
647
+ node_id_2: Annotated[str, "The ID of the second node/chunk."]
648
+ ) -> dict:
649
+ try:
650
+ import difflib
651
+ self._validate_node_exists(node_id_1)
652
+ self._validate_node_exists(node_id_2)
653
+
654
+ g = self.knowledge_graph.graph
655
+ content1 = getattr(g.nodes[node_id_1]['data'], 'content', None)
656
+ content2 = getattr(g.nodes[node_id_2]['data'], 'content', None)
657
+
658
+ if not content1 or not content2:
659
+ raise InvalidInputError("One or both nodes have no content.")
660
+
661
+ diff = list(difflib.unified_diff(
662
+ content1.splitlines(), content2.splitlines(),
663
+ fromfile=node_id_1, tofile=node_id_2, lineterm=""
664
+ ))
665
+
666
+ diff_text = "\n".join(diff) if diff else "No differences."
667
+
668
+ return {
669
+ "node_id_1": node_id_1,
670
+ "node_id_2": node_id_2,
671
+ "has_differences": bool(diff),
672
+ "diff": diff,
673
+ "text": diff_text
674
+ }
675
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
676
+ return self._handle_error(e, "diff_chunks")
677
+ except Exception as e:
678
+ return self._handle_error(e, "diff_chunks")
679
+
680
+ @self.app.tool(
681
+ description="Show a tree view of the repository or a subtree starting from a given node ID."
682
+ )
683
+ @observe(as_type='tool')
684
+ async def print_tree(
685
+ root_id: Annotated[Optional[str], "The node ID to start the tree from (default: repo root)."] = 'root',
686
+ max_depth: Annotated[int, "Maximum depth to show."] = 3
687
+ ) -> dict:
688
+ try:
689
+ g = self.knowledge_graph.graph
690
+
691
+ def build_tree(node_id, depth, tree_data):
692
+ if depth > max_depth:
693
+ return
694
+ node = g.nodes[node_id]['data']
695
+ node_info = {
696
+ "id": node_id,
697
+ "name": getattr(node, 'name', node_id),
698
+ "type": getattr(node, 'node_type', '?'),
699
+ "depth": depth,
700
+ "children": []
701
+ }
702
+ tree_data.append(node_info)
703
+ children = [t for s, t in g.out_edges(node_id)]
704
+ for child in children:
705
+ build_tree(child, depth + 1, node_info["children"])
706
+
707
+ def format_tree(tree_data):
708
+ # Render each root with the same recursive helper used for its children
709
+ return "".join(format_subtree(node) for node in tree_data)
714
+
715
+ def format_subtree(node):
716
+ result = " " * node["depth"] + f"- {node['name']} ({node['type']})\n"
717
+ for child in node["children"]:
718
+ result += format_subtree(child)
719
+ return result
720
+
721
+ if root_id is None:
722
+ roots = [n for n, d in g.nodes(data=True) if getattr(d['data'], 'node_type', None) in ('repo', 'directory', 'file')]
723
+ root_id = roots[0] if roots else list(g.nodes)[0]
724
+
725
+ self._validate_node_exists(root_id)
726
+
727
+ tree_data = []
728
+ build_tree(root_id, 0, tree_data)
729
+
730
+ return {
731
+ "root_id": root_id,
732
+ "max_depth": max_depth,
733
+ "tree": tree_data,
734
+ "text": format_tree(tree_data)
735
+ }
736
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
737
+ return self._handle_error(e, "print_tree")
738
+ except Exception as e:
739
+ return self._handle_error(e, "print_tree")
740
+
741
+ @self.app.tool(
742
+ description="Show all relationships (calls, contains, etc.) for a given entity or node."
743
+ )
744
+ @observe(as_type='tool')
745
+ async def entity_relationships(
746
+ node_id: Annotated[str, "The node/entity ID to explore relationships for."]
747
+ ) -> dict:
748
+ try:
749
+ self._validate_node_exists(node_id)
750
+ g = self.knowledge_graph.graph
751
+
752
+ incoming = []
753
+ outgoing = []
754
+
755
+ for source, target, data in g.in_edges(node_id, data=True):
756
+ incoming.append({
757
+ "source": source,
758
+ "target": target,
759
+ "relation": data.get('relation', '?')
760
+ })
761
+
762
+ for source, target, data in g.out_edges(node_id, data=True):
763
+ outgoing.append({
764
+ "source": source,
765
+ "target": target,
766
+ "relation": data.get('relation', '?')
767
+ })
768
+
769
+ text = f"Relationships for '{node_id}':\n"
770
+ for rel in incoming:
771
+ text += f"← {rel['source']} [{rel['relation']}]\n"
772
+ for rel in outgoing:
773
+ text += f"β†’ {rel['target']} [{rel['relation']}]\n"
774
+
775
+ if not incoming and not outgoing:
776
+ text = "No relationships found."
777
+
778
+ return {
779
+ "node_id": node_id,
780
+ "incoming": incoming,
781
+ "outgoing": outgoing,
782
+ "incoming_count": len(incoming),
783
+ "outgoing_count": len(outgoing),
784
+ "text": text
785
+ }
786
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
787
+ return self._handle_error(e, "entity_relationships")
788
+ except Exception as e:
789
+ return self._handle_error(e, "entity_relationships")
790
+
791
+ @self.app.tool(
792
+ description="Search for nodes/entities by type and name substring with fuzzy matching support. For entities, searches by entity_type (e.g., 'class', 'function', 'method'). For other nodes, searches by node_type (e.g., 'file', 'chunk', 'directory')."
793
+ )
794
+ @observe(as_type='tool')
795
+ async def search_by_type_and_name(
796
+ node_type: Annotated[str, "Type of node/entity (e.g., 'function', 'class', 'file', 'chunk', 'directory')."],
797
+ name_query: Annotated[str, "Substring to match in the name (case-insensitive, supports partial matches)."],
798
+ limit: Annotated[int, "Maximum results to return."] = 10,
799
+ fuzzy: Annotated[bool, "Enable fuzzy/partial matching (default: True)."] = True
800
+ ) -> dict:
801
+ import re
802
+ try:
803
+ self._validate_positive_int(limit, "limit")
804
+
805
+ g = self.knowledge_graph.graph
806
+ matches = []
807
+ query_lower = name_query.lower()
808
+
809
+ # Build regex pattern for fuzzy matching
810
+ if fuzzy:
811
+ fuzzy_pattern = '.*'.join(re.escape(c) for c in query_lower)
812
+ fuzzy_regex = re.compile(fuzzy_pattern, re.IGNORECASE)
813
+
814
+ for nid, n in g.nodes(data=True):
815
+ node = n['data']
816
+ node_name = getattr(node, 'name', '')
817
+
818
+ if not node_name:
819
+ continue
820
+
821
+ # Check if name matches the query
822
+ name_matches = False
823
+ if fuzzy:
824
+ if query_lower in node_name.lower() or fuzzy_regex.search(node_name):
825
+ name_matches = True
826
+ else:
827
+ if query_lower in node_name.lower():
828
+ name_matches = True
829
+
830
+ if not name_matches:
831
+ continue
832
+
833
+ # Check type based on node_type
834
+ current_node_type = getattr(node, 'node_type', None)
835
+
836
+ # For entity nodes, check entity_type instead of node_type
837
+ if current_node_type == 'entity':
838
+ entity_type = getattr(node, 'entity_type', '')
839
+
840
+ # Fallback: if entity_type is empty, check the entities dictionary
841
+ if not entity_type and nid in self.knowledge_graph.entities:
842
+ entity_types = self.knowledge_graph.entities[nid].get('type', [])
843
+ entity_type = entity_types[0] if entity_types else ''
844
+
845
+ if entity_type and entity_type.lower() == node_type.lower():
846
+ score = 0 if query_lower == node_name.lower() else (1 if query_lower in node_name.lower() else 2)
847
+ matches.append({
848
+ "id": nid,
849
+ "name": node_name,
850
+ "type": f"entity ({entity_type})",
851
+ "content": getattr(node, 'content', None),
852
+ "score": score
853
+ })
854
+ # For other nodes, check node_type directly
855
+ elif current_node_type == node_type:
856
+ score = 0 if query_lower == node_name.lower() else (1 if query_lower in node_name.lower() else 2)
857
+ matches.append({
858
+ "id": nid,
859
+ "name": node_name,
860
+ "type": current_node_type,
861
+ "content": getattr(node, 'content', None),
862
+ "score": score
863
+ })
864
+
865
+ # Sort by match score (best matches first) and limit results
866
+ matches.sort(key=lambda x: (x['score'], x['name'].lower()))
867
+ matches = matches[:limit]
868
+
869
+ if not matches:
870
+ return {
871
+ "node_type": node_type,
872
+ "name_query": name_query,
873
+ "matches": [],
874
+ "text": f"No matches for type '{node_type}' and name containing '{name_query}'."
875
+ }
876
+
877
+ text = f"Matches for type '{node_type}' and name '{name_query}' ({len(matches)} results):\n"
878
+ for match in matches:
879
+ text += f"- {match['id']}: {match['name']} [{match['type']}]\n"
880
+
881
+ return {
882
+ "node_type": node_type,
883
+ "name_query": name_query,
884
+ "count": len(matches),
885
+ "matches": matches,
886
+ "text": text
887
+ }
888
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
889
+ return self._handle_error(e, "search_by_type_and_name")
890
+ except Exception as e:
891
+ return self._handle_error(e, "search_by_type_and_name")
892
+
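The fuzzy matcher above builds a subsequence pattern: every character of the query must appear in the candidate name, in order, with anything allowed in between. A standalone sketch of that construction (the query and names are made up):

    import re

    query = "kgsrv"
    pattern = '.*'.join(re.escape(c) for c in query.lower())  # "k.*g.*s.*r.*v"
    regex = re.compile(pattern, re.IGNORECASE)

    assert regex.search("KnowledgeGraphServer")   # characters appear in order
    assert not regex.search("GraphServer")        # no leading 'k'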
893
+ @self.app.tool(
894
+ description="Get the full content of a code chunk along with its surrounding chunks (previous and next)."
895
+ )
896
+ @observe(as_type='tool')
897
+ async def get_chunk_context(
898
+ node_id: Annotated[str, "The node/chunk ID to get context for."]
899
+ ) -> dict:
900
+ from .utils.chunk_utils import organize_chunks_by_file_name, join_organized_chunks
901
+ try:
902
+ self._validate_node_exists(node_id)
903
+
904
+ g = self.knowledge_graph.graph
905
+ current_chunk = g.nodes[node_id]['data']
906
+ previous_chunk = self.knowledge_graph.get_previous_chunk(node_id)
907
+ next_chunk = self.knowledge_graph.get_next_chunk(node_id)
908
+
909
+ # Collect all chunks (previous, current, next)
910
+ chunks = []
911
+ prev_info = None
912
+ next_info = None
913
+ current_info = {
914
+ "id": node_id,
915
+ "content": getattr(current_chunk, 'content', '')
916
+ }
917
+
918
+ if previous_chunk:
919
+ prev_info = {
920
+ "id": previous_chunk.id,
921
+ "content": previous_chunk.content
922
+ }
923
+ chunks.append(previous_chunk)
924
+
925
+ chunks.append(current_chunk)
926
+
927
+ if next_chunk:
928
+ next_info = {
929
+ "id": next_chunk.id,
930
+ "content": next_chunk.content
931
+ }
932
+ chunks.append(next_chunk)
933
+
934
+ # Organize and join chunks
935
+ organized = organize_chunks_by_file_name(chunks)
936
+ full_content = join_organized_chunks(organized)
937
+
938
+ return {
939
+ "node_id": node_id,
940
+ "current_chunk": current_info,
941
+ "previous_chunk": prev_info,
942
+ "next_chunk": next_info,
943
+ "text": full_content
944
+ }
945
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
946
+ return self._handle_error(e, "get_chunk_context")
947
+ except Exception as e:
948
+ return self._handle_error(e, "get_chunk_context")
949
+
950
+ @self.app.tool(
951
+ description="Get statistics for a file or directory: number of entities, lines, chunks, etc."
952
+ )
953
+ @observe(as_type='tool')
954
+ async def get_file_stats(
955
+ path: Annotated[str, "The file or directory path to get statistics for."]
956
+ ) -> dict:
957
+ try:
958
+ g = self.knowledge_graph.graph
959
+ nodes = [n for n, d in g.nodes(data=True) if getattr(d['data'], 'path', None) == path]
960
+
961
+ if not nodes:
962
+ raise NodeNotFoundError(f"No nodes found for path '{path}'.")
963
+
964
+ stats = []
965
+ text = f"Statistics for '{path}':\n"
966
+
967
+ for node_id in nodes:
968
+ node = g.nodes[node_id]['data']
969
+ content = getattr(node, 'content', '')
970
+ declared = getattr(node, 'declared_entities', [])
971
+ called = getattr(node, 'called_entities', [])
972
+ chunks = [t for s, t in g.out_edges(node_id) if getattr(g.nodes[t]['data'], 'node_type', None) == 'chunk']
973
+
974
+ declared_list = []
975
+ for entity in declared[:10]:
976
+ if isinstance(entity, dict):
977
+ declared_list.append({
978
+ "name": entity.get('name', '?'),
979
+ "type": entity.get('type', '?')
980
+ })
981
+ else:
982
+ declared_list.append({"name": str(entity), "type": "Unknown"})
983
+
984
+ called_list = [str(entity) for entity in called[:10]]
985
+
986
+ node_stats = {
987
+ "node_id": node_id,
988
+ "node_type": getattr(node, 'node_type', '?'),
989
+ "lines": len(content.splitlines()) if content else 0,
990
+ "declared_entities_count": len(declared),
991
+ "declared_entities": declared_list,
992
+ "called_entities_count": len(called),
993
+ "called_entities": called_list,
994
+ "chunks_count": len(chunks),
995
+ "has_more_declared": len(declared) > 10,
996
+ "has_more_called": len(called) > 10
997
+ }
998
+ stats.append(node_stats)
999
+
1000
+ text += f"- Node: {node_id} ({node_stats['node_type']})\n"
1001
+ text += f" Lines: {node_stats['lines']}\n"
1002
+
1003
+ if declared_list:
1004
+ text += f" Declared entities ({len(declared)}):\n"
1005
+ for entity in declared_list:
1006
+ text += f" - {entity['name']} ({entity['type']})\n"
1007
+ if len(declared) > 10:
1008
+ text += f" ... and {len(declared) - 10} more\n"
1009
+ else:
1010
+ text += f" Declared entities: 0\n"
1011
+
1012
+ if called_list:
1013
+ text += f" Called entities ({len(called)}):\n"
1014
+ for entity in called_list:
1015
+ text += f" - {entity}\n"
1016
+ if len(called) > 10:
1017
+ text += f" ... and {len(called) - 10} more\n"
1018
+ else:
1019
+ text += f" Called entities: 0\n"
1020
+
1021
+ text += f" Chunks: {len(chunks)}\n"
1022
+
1023
+ return {
1024
+ "path": path,
1025
+ "nodes": stats,
1026
+ "text": text
1027
+ }
1028
+ except (NodeNotFoundError, InvalidInputError, EntityNotFoundError) as e:
1029
+ return self._handle_error(e, "get_file_stats")
1030
+ except Exception as e:
1031
+ return self._handle_error(e, "get_file_stats")
1032
+ # --- End New Tools ---
1033
+ @self.app.tool(
1034
+ description="Search for file names in the repository using a regular expression pattern."
1035
+ )
1036
+ @observe(as_type='tool')
1037
+ async def search_file_names_by_regex(
1038
+ pattern: Annotated[str, "The regular expression pattern to match file names."]
1039
+ ) -> dict:
1040
+ """Search for file names matching a regex pattern."""
1041
+ import re
1042
+ g = self.knowledge_graph.graph
1043
+
1044
+ try:
1045
+ regex = re.compile(pattern)
1046
+ except re.error as e:
1047
+ return {"error": f"Invalid regex pattern: {str(e)}"}
1048
+
1049
+ matches = []
1050
+ for node_id, node_attrs in g.nodes(data=True):
1051
+ node = node_attrs['data']
1052
+ if getattr(node, 'node_type', None) == 'file':
1053
+ file_name = getattr(node, 'name', '') or getattr(node, 'path', '')
1054
+ if regex.search(file_name):
1055
+ matches.append({
1056
+ "node_id": node_id,
1057
+ "file_name": file_name
1058
+ })
1059
+
1060
+ if not matches:
1061
+ return {
1062
+ "pattern": pattern,
1063
+ "matches": [],
1064
+ "text": f"No file names matched the pattern: '{pattern}'"
1065
+ }
1066
+
1067
+ text = f"Files matching pattern '{pattern}':\n"
1068
+ for match in matches[:20]:
1069
+ text += f"- {match['file_name']} (node ID: {match['node_id']})\n"
1070
+
1071
+ if len(matches) > 20:
1072
+ text += f"... and {len(matches) - 20} more\n"
1073
+
1074
+ return {
1075
+ "pattern": pattern,
1076
+ "count": len(matches),
1077
+ "matches": matches[:20],
1078
+ "has_more": len(matches) > 20,
1079
+ "text": text
1080
+ }
1081
+
1082
+ @self.app.tool(
1083
+ description="Find the shortest path between two nodes in the knowledge graph."
1084
+ )
1085
+ @observe(as_type='tool')
1086
+ async def find_path(
1087
+ source_id: Annotated[str, "The ID of the source node."],
1088
+ target_id: Annotated[str, "The ID of the target node."],
1089
+ max_depth: Annotated[int, "Maximum depth to search for a path."] = 5
1090
+ ) -> dict:
1091
+ """Find shortest path between two nodes."""
1092
+ return self.knowledge_graph.find_path(source_id, target_id, max_depth)
1093
+
1094
+ @self.app.tool(
1095
+ description="Extract a subgraph around a node up to a specified depth, optionally filtering by edge types."
1096
+ )
1097
+ @observe(as_type='tool')
1098
+ async def get_subgraph(
1099
+ node_id: Annotated[str, "The ID of the central node."],
1100
+ depth: Annotated[int, "The depth/radius of the subgraph to extract."] = 2,
1101
+ edge_types: Annotated[Optional[list], "Optional list of edge types to include (e.g., ['calls', 'contains'])."] = None
1102
+ ) -> dict:
1103
+ """Extract a subgraph around a node."""
1104
+ return self.knowledge_graph.get_subgraph(node_id, depth, edge_types)
1105
+
1106
+ def run(self, **kwargs):
1107
+ self.app.run(**kwargs)
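A minimal launch sketch for the tool server wrapped by `run()` above. The wrapper class name and constructor below are assumptions for illustration only (the class definition appears earlier in this file); the one grounded fact is that `run(**kwargs)` forwards to `self.app.run(**kwargs)`:

    # Hypothetical usage -- `ToolServer` and its constructor arguments are
    # assumed names, not taken from this diff.
    server = ToolServer(knowledge_graph=graph)  # assumed constructor
    server.run()  # kwargs pass straight through to the underlying app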
RepoKnowledgeGraphLib/ModelService.py ADDED
@@ -0,0 +1,424 @@
+ from abc import ABC, abstractmethod
+ from openai import OpenAI, AsyncOpenAI
+ from dotenv import load_dotenv
+ import os
+ import logging
+ from tenacity import retry, stop_after_attempt, wait_fixed
+ import httpx
+ from sentence_transformers import SentenceTransformer
+
+ # Optional torch import for CUDA detection
+ try:
+     import torch
+     _TORCH_AVAILABLE = True
+ except Exception:
+     torch = None
+     _TORCH_AVAILABLE = False
+
+ from .utils.logger_utils import setup_logger
+
+ # Load variables from a .env file (if present) before the defaults below are read
+ load_dotenv()
+
+ LOGGER_NAME = "MODEL_SERVICE_LOGGER"
+ # GENERATION ENV VARIABLES (defaults)
+ OPENAI_BASE_URL = os.getenv("OPENAI_BASE_URL", 'http://0.0.0.0:8000/v1')
+ OPENAI_TOKEN = os.getenv("OPENAI_TOKEN", 'no-need')
+ MODEL_NAME = os.getenv('MODEL_NAME', "meta-llama/Llama-3.2-3B-Instruct")
+ # EMBED ENV VARIABLES (defaults)
+ OPENAI_EMBED_BASE_URL = os.getenv("OPENAI_EMBED_BASE_URL", 'http://0.0.0.0:8001/v1')
+ OPENAI_EMBED_TOKEN = os.getenv("OPENAI_EMBED_TOKEN", 'no-need')
+ EMBED_MODEL_NAME = os.getenv('EMBED_MODEL_NAME', "Alibaba-NLP/gte-Qwen2-1.5B-instruct")
+
+ # Additional ENV defaults requested
+ MAX_TOKENS = int(os.getenv("MAX_TOKENS", 2048))
+ TEMPERATURE = float(os.getenv("TEMPERATURE", 0.2))
+ TOP_P = float(os.getenv("TOP_P", 0.95))
+ FREQUENCY_PENALTY = float(os.getenv("FREQUENCY_PENALTY", 0))
+ PRESENCE_PENALTY = float(os.getenv("PRESENCE_PENALTY", 0))
+ EMBEDDING_MODEL_URL = os.getenv("EMBEDDING_MODEL_URL", "")
+ EMBEDDING_MODEL_API_KEY = os.getenv("EMBEDDING_MODEL_API_KEY", "no_need")
+ EMBEDDING_NUMBER_DIMENSIONS = int(os.getenv("EMBEDDING_NUMBER_DIMENSIONS", 1024))
+
+ STOP_AFTER_ATTEMPT = int(os.getenv("STOP_AFTER_ATTEMPT", 5))
+ WAIT_BETWEEN_RETRIES = int(os.getenv("WAIT_BETWEEN_RETRIES", 2))
+ REQUEST_TIMEOUT = int(os.getenv("REQUEST_TIMEOUT", 240))
+
+ # Note: module-level clients remain for backward compatibility, but instances will create their own if the timeout is overridden.
+ long_timeout_client = httpx.Client(timeout=REQUEST_TIMEOUT)
+ long_timeout_async_client = httpx.AsyncClient(timeout=REQUEST_TIMEOUT)
+
+
+ class ModelServiceInterface(ABC):
+     """
+     Abstract base class defining the interface for model services.
+     All model services should implement these methods.
+     """
+
+     # accept model_kwargs so variables can be overridden at runtime
+     def __init__(self, model_name: str = None, model_kwargs: dict = None):
+         setup_logger(LOGGER_NAME)
+         self.logger = logging.getLogger(LOGGER_NAME)
+
+         model_kwargs = model_kwargs or {}
+
+         # allow overriding via model_kwargs; fall back to module-level defaults
+         self.openai_base_url = model_kwargs.get("OPENAI_BASE_URL", OPENAI_BASE_URL)
+         self.openai_token = model_kwargs.get("OPENAI_TOKEN", OPENAI_TOKEN)
+         # the model_name param takes precedence, then model_kwargs, then the env default
+         self.model_name = model_name or model_kwargs.get("MODEL_NAME", MODEL_NAME)
+
+         # embed defaults (may be overridden by subclasses or model_kwargs)
+         self.openai_embed_base_url = model_kwargs.get("OPENAI_EMBED_BASE_URL", OPENAI_EMBED_BASE_URL)
+         self.openai_embed_token = model_kwargs.get("OPENAI_EMBED_TOKEN", OPENAI_EMBED_TOKEN)
+         self.embed_model_name = model_kwargs.get("EMBED_MODEL_NAME", EMBED_MODEL_NAME)
+
+         # other configurable parameters
+         self.max_tokens = int(model_kwargs.get("MAX_TOKENS", MAX_TOKENS))
+         self.temperature = float(model_kwargs.get("TEMPERATURE", TEMPERATURE))
+         self.top_p = float(model_kwargs.get("TOP_P", TOP_P))
+         self.frequency_penalty = float(model_kwargs.get("FREQUENCY_PENALTY", FREQUENCY_PENALTY))
+         self.presence_penalty = float(model_kwargs.get("PRESENCE_PENALTY", PRESENCE_PENALTY))
+         self.embedding_model_url = model_kwargs.get("EMBEDDING_MODEL_URL", EMBEDDING_MODEL_URL)
+         self.embedding_model_api_key = model_kwargs.get("EMBEDDING_MODEL_API_KEY", EMBEDDING_MODEL_API_KEY)
+         self.embedding_number_dimensions = int(model_kwargs.get("EMBEDDING_NUMBER_DIMENSIONS", EMBEDDING_NUMBER_DIMENSIONS))
+
+         self.stop_after_attempt = int(model_kwargs.get("STOP_AFTER_ATTEMPT", STOP_AFTER_ATTEMPT))
+         self.wait_between_retries = int(model_kwargs.get("WAIT_BETWEEN_RETRIES", WAIT_BETWEEN_RETRIES))
+         request_timeout = int(model_kwargs.get("REQUEST_TIMEOUT", REQUEST_TIMEOUT))
+
+         # create per-instance httpx clients in case REQUEST_TIMEOUT was overridden
+         self.long_timeout_client = httpx.Client(timeout=request_timeout)
+         self.long_timeout_async_client = httpx.AsyncClient(timeout=request_timeout)
+
+         # Initialize query client (shared by all implementations)
+         self.client = OpenAI(
+             base_url=self.openai_base_url,
+             api_key=self.openai_token,
+             http_client=self.long_timeout_client,
+         )
+         self.async_client = AsyncOpenAI(
+             base_url=self.openai_base_url,
+             api_key=self.openai_token,
+             http_client=self.long_timeout_async_client,
+         )
+
+     @retry(stop=stop_after_attempt(STOP_AFTER_ATTEMPT), wait=wait_fixed(WAIT_BETWEEN_RETRIES))
+     def query(self, prompt: str, model_name: str = None) -> str:
+         """Query the model with a prompt."""
+         if model_name is None:
+             model_name = self.model_name
+         completion = self.client.chat.completions.create(
+             model=model_name,
+             messages=[
+                 {"role": "user", "content": prompt}
+             ]
+         )
+         return completion.choices[0].message.content
+
+     @retry(stop=stop_after_attempt(STOP_AFTER_ATTEMPT), wait=wait_fixed(WAIT_BETWEEN_RETRIES))
+     def query_with_instructions(self, prompt: str, instructions: str, model_name: str = None) -> str:
+         """Query the model with additional system instructions."""
+         if model_name is None:
+             model_name = self.model_name
+         completion = self.client.chat.completions.create(
+             model=model_name,
+             messages=[
+                 {"role": "system", "content": instructions},
+                 {"role": "user", "content": prompt}
+             ]
+         )
+         return completion.choices[0].message.content
+
+     @retry(stop=stop_after_attempt(STOP_AFTER_ATTEMPT), wait=wait_fixed(WAIT_BETWEEN_RETRIES))
+     async def query_async(self, prompt: str, model_name: str = None) -> str:
+         """Async version of query."""
+         if model_name is None:
+             model_name = self.model_name
+         completion = await self.async_client.chat.completions.create(
+             model=model_name,
+             messages=[
+                 {"role": "user", "content": prompt}
+             ]
+         )
+         return completion.choices[0].message.content
+
+     @retry(stop=stop_after_attempt(STOP_AFTER_ATTEMPT), wait=wait_fixed(WAIT_BETWEEN_RETRIES))
+     async def query_with_instructions_async(self, prompt: str, instructions: str, model_name: str = None) -> str:
+         """Async version of query with instructions."""
+         if model_name is None:
+             model_name = self.model_name
+         completion = await self.async_client.chat.completions.create(
+             model=model_name,
+             messages=[
+                 {"role": "system", "content": instructions},
+                 {"role": "user", "content": prompt}
+             ]
+         )
+         return completion.choices[0].message.content
+
+     @abstractmethod
+     def embed(self, text_to_embed: str) -> list:
+         """Embed text using the configured embedding model."""
+         pass
+
+     @abstractmethod
+     async def embed_async(self, text_to_embed: str) -> list:
+         """Async version of embed."""
+         pass
+
+     @abstractmethod
+     def embed_chunk_code(self, code_to_embed: str) -> list:
+         """Embed code chunk for storage/indexing."""
+         pass
+
+     @abstractmethod
+     def embed_query(self, query_to_embed: str) -> list:
+         """Embed query for retrieval."""
+         pass
+
+     @abstractmethod
+     def embed_batch(self, texts_to_embed: list[str]) -> list[list]:
+         """Embed multiple texts in a batch for better performance."""
+         pass
+
+     @abstractmethod
+     def embed_chunk_code_batch(self, codes_to_embed: list[str]) -> list[list]:
+         """Embed multiple code chunks in a batch for storage/indexing."""
+         pass
+
+
+ class OpenAIModelService(ModelServiceInterface):
+     """
+     Model service that uses the OpenAI client for both queries and embeddings.
+     """
+
+     def __init__(self, model_name: str = None, embed_model_name: str = None, model_kwargs: dict = None):
+         # forward model_kwargs to the base class so it can set instance-wide config
+         super().__init__(model_name=model_name, model_kwargs=model_kwargs)
+
+         # allow override of the embed model name via param or model_kwargs
+         model_kwargs = model_kwargs or {}
+         self.embed_model_name = embed_model_name or model_kwargs.get("EMBED_MODEL_NAME", self.embed_model_name)
+
+         # the embed client should use the instance-level embed base/token
+         self.embed_client = OpenAI(
+             base_url=model_kwargs.get("OPENAI_EMBED_BASE_URL", self.openai_embed_base_url),
+             api_key=model_kwargs.get("OPENAI_EMBED_TOKEN", self.openai_embed_token),
+             http_client=self.long_timeout_client,
+         )
+         self.async_embed_client = AsyncOpenAI(
+             base_url=model_kwargs.get("OPENAI_EMBED_BASE_URL", self.openai_embed_base_url),
+             api_key=model_kwargs.get("OPENAI_EMBED_TOKEN", self.openai_embed_token),
+             http_client=self.long_timeout_async_client,
+         )
+
+     @retry(stop=stop_after_attempt(STOP_AFTER_ATTEMPT), wait=wait_fixed(WAIT_BETWEEN_RETRIES))
+     def embed(self, text_to_embed: str) -> list:
+         """Embed text using the OpenAI embeddings API."""
+         response = self.embed_client.embeddings.create(
+             input=text_to_embed,
+             model=self.embed_model_name,
+         )
+         return response.data[0].embedding
+
+     @retry(stop=stop_after_attempt(STOP_AFTER_ATTEMPT), wait=wait_fixed(WAIT_BETWEEN_RETRIES))
+     async def embed_async(self, text_to_embed: str) -> list:
+         """Async version of embed using the OpenAI embeddings API."""
+         response = await self.async_embed_client.embeddings.create(
+             input=text_to_embed,
+             model=self.embed_model_name,
+         )
+         return response.data[0].embedding
+
+     def embed_chunk_code(self, code_to_embed: str) -> list:
+         """Embed a code chunk using the OpenAI embeddings API (same as embed)."""
+         return self.embed(code_to_embed)
+
+     def embed_query(self, query_to_embed: str) -> list:
+         """Embed a query using the OpenAI embeddings API (same as embed)."""
+         return self.embed(query_to_embed)
+
+     @retry(stop=stop_after_attempt(STOP_AFTER_ATTEMPT), wait=wait_fixed(WAIT_BETWEEN_RETRIES))
+     def embed_batch(self, texts_to_embed: list[str]) -> list[list]:
+         """Embed multiple texts in a batch using the OpenAI embeddings API."""
+         if not texts_to_embed:
+             return []
+         response = self.embed_client.embeddings.create(
+             input=texts_to_embed,
+             model=self.embed_model_name,
+         )
+         return [item.embedding for item in response.data]
+
+     def embed_chunk_code_batch(self, codes_to_embed: list[str]) -> list[list]:
+         """Embed multiple code chunks in a batch using the OpenAI embeddings API."""
+         return self.embed_batch(codes_to_embed)
+
+
+ class SentenceTransformersModelService(ModelServiceInterface):
+     """
+     Model service that uses the OpenAI client for queries and SentenceTransformers for embeddings.
+     Optimized for high-throughput batch embedding with GPU support.
+     """
+
+     def __init__(self, model_name: str = None, embed_model_name: str = None, model_kwargs: dict = None, skip_embedder: bool = False):
+         super().__init__(model_name=model_name, model_kwargs=model_kwargs)
+         model_kwargs = model_kwargs or {}
+         # embed_model_name may be overridden by model_kwargs
+         self.embed_model_name = embed_model_name or model_kwargs.get("EMBED_MODEL_NAME", self.embed_model_name)
+         self.skip_embedder = skip_embedder
+         self.embedding_model = None
+
+         if skip_embedder:
+             self.logger.info('Skipping embedder initialization (keyword-only mode)')
+             self.device = "cpu"
+             self.encode_batch_size = 32
+             return
+
+         # Debug GPU detection
+         self.logger.info(f'PyTorch available: {_TORCH_AVAILABLE}')
+         if _TORCH_AVAILABLE:
+             self.logger.info(f'CUDA available: {torch.cuda.is_available()}')
+             self.logger.info(f'CUDA device count: {torch.cuda.device_count()}')
+             if torch.cuda.is_available():
+                 self.logger.info(f'CUDA device name: {torch.cuda.get_device_name(0)}')
+
+         # Select device: prefer CUDA if available
+         self.device = "cuda" if (_TORCH_AVAILABLE and torch.cuda.is_available()) else "cpu"
+         self.logger.info(f'Initializing SentenceTransformer on device: {self.device}')
+
+         # Set batch size based on device and available memory
+         # Larger batch sizes significantly improve GPU throughput
+         self.encode_batch_size = int(model_kwargs.get("ENCODE_BATCH_SIZE", 64 if self.device == "cuda" else 32))
+
+         # Show CUDA memory info if available
+         if self.device == "cuda" and _TORCH_AVAILABLE:
+             try:
+                 gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024**3)
+                 self.logger.info(f'GPU memory available: {gpu_memory:.2f} GB')
+                 # Adjust batch size based on available GPU memory
+                 if gpu_memory > 16:
+                     self.encode_batch_size = max(self.encode_batch_size, 128)
+                 elif gpu_memory > 8:
+                     self.encode_batch_size = max(self.encode_batch_size, 64)
+             except Exception as e:
+                 self.logger.warning(f'Could not get GPU memory info: {e}')
+
+         self.logger.info(f'Using encode batch size: {self.encode_batch_size}')
+
+         # Initialize the embedding model on the chosen device with performance optimizations
+         self.embedding_model = SentenceTransformer(
+             self.embed_model_name,
+             trust_remote_code=True,
+             device=self.device
+         )
+
+         # Enable half precision for faster inference on CUDA
+         if self.device == "cuda" and _TORCH_AVAILABLE:
+             try:
+                 # Check if the model supports half precision
+                 self.embedding_model.half()
+                 self.logger.info('Enabled half precision (FP16) for faster GPU inference')
+             except Exception as e:
+                 self.logger.warning(f'Could not enable half precision: {e}')
+
+     def _check_embedder(self):
+         """Check if the embedder is available; raise an error if not."""
+         if self.skip_embedder or self.embedding_model is None:
+             raise RuntimeError(
+                 "Embedding model not initialized. This model service was created with skip_embedder=True "
+                 "(keyword-only mode). To use embeddings, set index_type to 'hybrid' or 'embedding-only'."
+             )
+
+     def embed(self, text_to_embed: str) -> list:
+         """Embed text using SentenceTransformers."""
+         self._check_embedder()
+         embeddings = self.embedding_model.encode(
+             [text_to_embed],
+             convert_to_numpy=True,
+             show_progress_bar=False
+         )
+         return embeddings[0].tolist() if hasattr(embeddings[0], 'tolist') else list(embeddings[0])
+
+     async def embed_async(self, text_to_embed: str) -> list:
+         """
+         Async version of embed using SentenceTransformers.
+         Note: SentenceTransformers doesn't have native async support,
+         so this runs synchronously but maintains the async interface.
+         """
+         return self.embed(text_to_embed)
+
+     def embed_chunk_code(self, code_to_embed: str) -> list:
+         """Embed a code chunk using SentenceTransformers (no special prompt)."""
+         self._check_embedder()
+         self.logger.debug(f'Embedding code using {self.embed_model_name}')
+         embeddings = self.embedding_model.encode(
+             [code_to_embed],
+             convert_to_numpy=True,
+             show_progress_bar=False
+         )
+         return embeddings[0].tolist() if hasattr(embeddings[0], 'tolist') else list(embeddings[0])
+
+     def embed_query(self, query_to_embed: str) -> list:
+         """Embed a query using SentenceTransformers with a retrieval prompt."""
+         self._check_embedder()
+         self.logger.debug(f'Embedding query using {self.embed_model_name}')
+         embeddings = self.embedding_model.encode(
+             [query_to_embed],
+             prompt='Given this prompt, retrieve relevant content\n Query:',
+             convert_to_numpy=True,
+             show_progress_bar=False
+         )
+         return embeddings[0].tolist() if hasattr(embeddings[0], 'tolist') else list(embeddings[0])
+
+     def embed_batch(self, texts_to_embed: list[str]) -> list[list]:
+         """Embed multiple texts in a batch using SentenceTransformers with optimized settings."""
+         if not texts_to_embed:
+             return []
+         self._check_embedder()
+         self.logger.info(f'Batch embedding {len(texts_to_embed)} texts using {self.embed_model_name}')
+         embeddings = self.embedding_model.encode(
+             texts_to_embed,
+             batch_size=self.encode_batch_size,
+             convert_to_numpy=True,
+             show_progress_bar=len(texts_to_embed) > 100,  # Only show progress for large batches
+             normalize_embeddings=True  # Normalize for better similarity computation
+         )
+         return [emb.tolist() if hasattr(emb, 'tolist') else list(emb) for emb in embeddings]
+
+     def embed_chunk_code_batch(self, codes_to_embed: list[str]) -> list[list]:
+         """Embed multiple code chunks in a batch using SentenceTransformers with optimized settings."""
+         if not codes_to_embed:
+             return []
+         self._check_embedder()
+         self.logger.info(f'Batch embedding {len(codes_to_embed)} code chunks using {self.embed_model_name}')
+         embeddings = self.embedding_model.encode(
+             codes_to_embed,
+             batch_size=self.encode_batch_size,
+             convert_to_numpy=True,
+             show_progress_bar=len(codes_to_embed) > 100,  # Only show progress for large batches
+             normalize_embeddings=True  # Normalize for better similarity computation
+         )
+         return [emb.tolist() if hasattr(emb, 'tolist') else list(emb) for emb in embeddings]
+
+
+ def create_model_service(skip_embedder: bool = False, **kwargs) -> ModelServiceInterface:
+     """
+     Factory function to create the appropriate ModelService based on embedder_type.
+
+     Args:
+         skip_embedder (bool): If True, skip loading the embedding model (for keyword-only search).
+         **kwargs: Additional arguments including 'embedder_type' ('openai' or 'sentence-transformers')
+             and an optional 'model_kwargs' dict which can override any env var defaults.
+     Returns:
+         ModelServiceInterface: An instance of the appropriate ModelService.
+     """
+     model_kwargs = kwargs.pop('model_kwargs', None)
+     embedder_type = kwargs.pop('embedder_type', 'openai')
+
+     if embedder_type == 'openai':
+         return OpenAIModelService(model_kwargs=model_kwargs, **kwargs)
+     elif embedder_type == 'sentence-transformers':
+         return SentenceTransformersModelService(model_kwargs=model_kwargs, skip_embedder=skip_embedder, **kwargs)
+     else:
+         logging.getLogger(LOGGER_NAME).warning(
+             f'Unknown embedder type: {embedder_type}, defaulting to OpenAI'
+         )
+         return OpenAIModelService(model_kwargs=model_kwargs, **kwargs)
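A short usage sketch for the factory above. The override keys mirror the env-var names read at module load; the model IDs shown are simply this module's defaults, not recommendations, and the timeout value is illustrative:

    from RepoKnowledgeGraphLib.ModelService import create_model_service

    # model_kwargs entries take precedence over environment variables
    service = create_model_service(
        embedder_type='sentence-transformers',
        model_kwargs={
            'MODEL_NAME': 'meta-llama/Llama-3.2-3B-Instruct',
            'EMBED_MODEL_NAME': 'Alibaba-NLP/gte-Qwen2-1.5B-instruct',
            'REQUEST_TIMEOUT': 120,  # illustrative override
        },
    )
    vectors = service.embed_batch(['def add(a, b):', 'class Foo: ...'])
    reply = service.query('Summarize what this repository does.')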
RepoKnowledgeGraphLib/Node.py ADDED
@@ -0,0 +1,63 @@
+ from typing import Optional, Dict, List
+ from dataclasses import dataclass, field, asdict
+
+ from .Entity import Entity
+
+
+ @dataclass
+ class Node:
+     id: str = ''
+     name: str = ''
+     node_type: str = ''
+     description: Optional[str] = None
+     declared_entities: List[dict] = field(default_factory=list)  # Classes, functions, variables
+     called_entities: List[str] = field(default_factory=list)  # Classes, functions, variables, but also external libraries
+
+     def dict(self):
+         return {k: str(v) for k, v in asdict(self).items()}
+
+
+ @dataclass
+ class DirectoryNode(Node):
+     path: str = ''
+     node_type: str = 'directory'
+
+
+ @dataclass
+ class FileNode(Node):
+     path: str = ''
+     content: str = ''
+     node_type: str = 'file'
+     language: str = ''
+
+
+ @dataclass
+ class ChunkNode(FileNode):
+     node_type: str = 'chunk'
+     order_in_file: int = 0
+     embedding: Optional[list] = None
+
+     def get_field_to_embed(self) -> Optional[str]:
+         # Use description if available, otherwise fall back to content
+         # This ensures we always have something meaningful to embed
+         if self.description and self.description.strip():
+             return self.description
+         return self.content
+
+
+ @dataclass
+ class EntityNode(Node):
+     entity_type: str = ''
+     declaring_chunk_ids: List[str] = field(default_factory=list)
+     calling_chunk_ids: List[str] = field(default_factory=list)
+     aliases: List[str] = field(default_factory=list)  # All possible aliases for this entity
+     node_type: str = 'entity'
+
+     def __post_init__(self):
+         # Use entity_name (stored in the name field) as the id if id is not set
+         if not self.id and self.name:
+             self.id = self.name
+
+     def dict(self):
+         return {k: str(v) for k, v in asdict(self).items()}
+
+     def get_field_to_embed(self) -> Optional[str]:
+         return self.name
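A small illustrative sketch of how these dataclasses compose (every field value below is made up):

    from RepoKnowledgeGraphLib.Node import ChunkNode

    chunk = ChunkNode(
        id='src/app.py::chunk_0',
        name='app.py',
        path='src/app.py',
        content='def main():\n    print("hello")',
        language='python',
        order_in_file=0,
    )
    # No description was set, so get_field_to_embed() falls back to the raw content
    print(chunk.get_field_to_embed())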
RepoKnowledgeGraphLib/QuestionMaker.py ADDED
@@ -0,0 +1,538 @@
+ import logging
+ import asyncio
+ from tqdm import tqdm
+
+ from .RepoKnowledgeGraph import RepoKnowledgeGraph
+ from .ModelService import create_model_service
+ from .utils.logger_utils import setup_logger
+ from .utils.chunk_utils import organize_chunks_by_file_name, join_organized_chunks, extract_filename_from_chunk
+ from .Node import ChunkNode
+
+ LOGGER_NAME = "QUESTION_MAKER_LOGGER"
+
+ class QuestionMaker:
+     """
+     The QuestionMaker class is responsible for generating code comprehension questions and answers
+     based on code chunks and knowledge graphs. It leverages a language model service to formulate
+     questions and answers that test deep understanding of code, focusing on mechanisms, design decisions,
+     and subtle behaviors. It supports generating questions for neighboring code chunks as well as for
+     specific entities (e.g., functions, classes) that are both declared and called in the codebase.
+     """
+     def __init__(self):
+         """
+         Initializes the QuestionMaker, sets up logging, and instantiates the model service.
+         """
+         setup_logger(LOGGER_NAME)
+         self.logger = logging.getLogger(LOGGER_NAME)
+         self.model_service = create_model_service()
+
+     def generate_questions_answers(self, candidate_chunks: dict) -> list:
+         """
+         Placeholder for generating questions and answers from candidate chunks.
+         Args:
+             candidate_chunks (dict): Dictionary mapping chunk groups to process.
+         Returns:
+             list: List of question-answer pairs.
+         """
+         pass
+
+     def test_chunk_sensibility(self, knowledge_graph: RepoKnowledgeGraph) -> list:
+         """
+         Placeholder for testing the sensibility of code chunks in the knowledge graph.
+         Args:
+             knowledge_graph (RepoKnowledgeGraph): The knowledge graph to test.
+         Returns:
+             list: List of results or metrics.
+         """
+         pass
+
+     async def make_n_neighbouring_chunk_questions_async(self, knowledge_graph: RepoKnowledgeGraph) -> list:
+         """
+         Generates questions and answers for all possible groups of n directly neighboring code chunks
+         in each file of the knowledge graph. This helps assess understanding of code that spans multiple
+         adjacent chunks, such as related functions or code blocks.
+
+         Args:
+             knowledge_graph (RepoKnowledgeGraph): The knowledge graph to generate questions from.
+         Returns:
+             list: A list of dictionaries, each containing a question, answer, the involved chunks, and category.
+         """
+         file_nodes = knowledge_graph.get_all_files()
+         # collect candidate chunk groups
+         candidate_chunks = []
+         for file_node in file_nodes:
+             self.logger.info(f"Processing file node: {file_node}")
+             chunks = knowledge_graph.get_chunks_of_file(file_node.id)
+             num_chunks = len(chunks)
+             # For each n, collect all n-sized tuples of directly neighbouring chunks
+             for n in range(2, num_chunks + 1):
+                 for i in range(num_chunks - n + 1):
+                     # Only directly neighbouring chunks
+                     candidate_chunks.append(list(chunks[i:i+n]))
+         # generate questions and answers from candidate chunks in parallel, in batches of 15
+
+         async def process_chunk_group(chunks):
+             """
+             Helper coroutine to generate a question and answer for a specific group of neighboring chunks.
+             Args:
+                 chunks (list): The list of code chunks to generate the question and answer from.
+             Returns:
+                 dict: Contains question, answer, chunks, and category.
+             """
+             question = await self._generate_neighboring_question_from_chunks_async(chunks)
+             answer = await self.answer_question_about_chunks_async(chunks, question)
+             return {
+                 'question': question,
+                 'clean_question': question,
+                 'answer': answer,
+                 'chunks': [chunk.dict() for chunk in chunks],
+                 'category': 'neighbouring_chunks'
+             }
+
+         # Batch processing in groups of 15 with tqdm
+         batch_size = 15
+         results = []
+         total = len(candidate_chunks)
+         for i in tqdm(range(0, total, batch_size), desc="Generating neighbouring chunk questions", unit="batch"):
+             batch = candidate_chunks[i:i+batch_size]
+             tasks = [process_chunk_group(chunks) for chunks in batch]
+             batch_results = []
+             for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Questions in batch", leave=False):
+                 batch_results.append(await coro)
+             results.extend(batch_results)
+         return results
+
+     async def make_entity_declaration_call_specific_questions_async(self, knowledge_graph: RepoKnowledgeGraph) -> list:
+         """
+         Generates questions and answers about specific entities (e.g., functions, classes) that have both
+         a declaration and at least one call site in the knowledge graph. Focuses on cross-file references
+         by default.
+
+         Args:
+             knowledge_graph (RepoKnowledgeGraph): The knowledge graph to generate questions from.
+         Returns:
+             list: A list of dictionaries, each containing a question, answer, entity, involved chunks, and category.
+         """
+         self.logger.info("Generating entity-specific questions.")
+         candidate_pairs = self.get_entities_with_declaration_and_calling(knowledge_graph)
+
+         async def process_entity_pair(pair):
+             """
+             Helper coroutine to generate a question and answer for a specific entity's declaration and call site.
+             Args:
+                 pair (dict): Contains entity name, declaring_chunk_id, and calling_chunk_id.
+             Returns:
+                 dict: Contains question, answer, entity, chunks, and category.
+             """
+             entity_name = pair['entity']
+             chunks = [knowledge_graph[pair['declaring_chunk_id']], knowledge_graph[pair['calling_chunk_id']]]
+             question = await self.make_entity_specific_question_async(chunks, entity_name)
+             answer = await self.answer_question_about_chunks_async(chunks, question)
+             return {
+                 'question': question,
+                 'clean_question': question,
+                 'answer': answer,
+                 'entity': entity_name,
+                 'chunks': [chunk.dict() for chunk in chunks],
+                 'category': 'entity_declaration_call_specific'
+             }
+
+         # Batch processing with tqdm
+         batch_size = 15
+         results = []
+         total = len(candidate_pairs)
+         for i in tqdm(range(0, total, batch_size), desc="Generating entity-specific questions", unit="batch"):
+             batch = candidate_pairs[i:i+batch_size]
+             tasks = [process_entity_pair(pair) for pair in batch]
+             batch_results = []
+             for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Questions in batch", leave=False):
+                 batch_results.append(await coro)
+             results.extend(batch_results)
+         return results
+
+     async def make_interacting_entities_specific_questions_async(self, entity_A: str, entity_B: str,
+                                                                  decl_chunk_A: ChunkNode, decl_chunk_B: ChunkNode,
+                                                                  call_chunk: ChunkNode) -> str:
+         """
+         Generates a question about two entities that interact in the same chunk.
+         Each entity has a declaration and at least one call site, and the question focuses on their interaction.
+
+         Args:
+             entity_A (str): Name of the first entity.
+             entity_B (str): Name of the second entity.
+             decl_chunk_A (ChunkNode): Chunk containing the declaration of entity A.
+             decl_chunk_B (ChunkNode): Chunk containing the declaration of entity B.
+             call_chunk (ChunkNode): Chunk where both entities interact.
+         Returns:
+             str: The generated question as plain text.
+         """
+         entity_A_definition_code = decl_chunk_A.content
+         entity_B_definition_code = decl_chunk_B.content
+         entity_interaction_code = call_chunk.content
+
+         prompt = f"""You are given two code entities, {entity_A} and {entity_B}, along with a snippet where they interact.
+ Your task is to write **one clear and concise question** about their relationship.
+
+ ### Input:
+ * {entity_A} Definition Code:
+ {entity_A_definition_code}
+
+ * {entity_B} Definition Code:
+ {entity_B_definition_code}
+
+ * Interaction Code (where they interact):
+ {entity_interaction_code}
+
+ ### Guidelines:
+ * Ask about design, abstraction, dependencies, or side effects.
+ * The question should highlight something a developer might consider when reviewing or improving the code.
+ * Keep the question short and direct so it can be answered briefly.
+ * Do not explain the code or provide answers.
+
+ ### Output:
+ **Question**: <your question here>
+ """
+
+         initial_question = await self.model_service.query_async(prompt=prompt)
+         return await self.extract_question_from_generated_text_async(generated_text=initial_question)
+
+     def get_all_candidate_pairs_and_triplets(self, knowledge_graph: RepoKnowledgeGraph) -> tuple:
+         candidate_triplets = []
+         candidate_pairs = []
+
+         interacting_entity_triplets = self.get_interacting_entity_triplets(knowledge_graph)
+         for triplet in interacting_entity_triplets:
+             chunks = [
+                 knowledge_graph[triplet['decl_chunk_A']],
+                 knowledge_graph[triplet['decl_chunk_B']],
+                 knowledge_graph[triplet['call_chunk']]
+             ]
+             candidate_triplets.append({
+                 'entities': (triplet['entity_A'], triplet['entity_B']),
+                 'chunks': [chunk.dict() for chunk in chunks],
+                 'category': 'interacting_entities'
+             })
+
+         declaration_calling_pairs = self.get_entities_with_declaration_and_calling(knowledge_graph)
+         for pair in declaration_calling_pairs:
+             chunks = [knowledge_graph[pair['declaring_chunk_id']], knowledge_graph[pair['calling_chunk_id']]]
+             candidate_pairs.append({
+                 'entity': pair['entity'],
+                 'chunks': [chunk.dict() for chunk in chunks],
+                 'category': 'entity_declaration_call_specific'
+             })
+
+         return candidate_pairs, candidate_triplets
+
+     async def make_interacting_entity_questions_async(self, knowledge_graph: RepoKnowledgeGraph) -> list:
+         """
+         Generates questions and answers about pairs of entities that interact in the same chunk.
+         Each entity has a declaration and at least one call site, and the question focuses on their interaction.
+
+         Args:
+             knowledge_graph (RepoKnowledgeGraph): The knowledge graph to generate questions from.
+         Returns:
+             list: A list of dictionaries, each containing a question, answer, entities, involved chunks, and category.
+         """
+         self.logger.info("Generating interacting entity questions.")
+         triplets = self.get_interacting_entity_triplets(knowledge_graph)
+
+         async def process_triplet(triplet):
+             """
+             Helper coroutine to generate a question and answer for a specific interacting entity triplet.
+             Args:
+                 triplet (dict): Contains entity_A, entity_B, decl_chunk_A, decl_chunk_B, and call_chunk.
+             Returns:
+                 dict: Contains question, answer, entities, chunks, and category.
+             """
+             chunks = [
+                 knowledge_graph[triplet['decl_chunk_A']],
+                 knowledge_graph[triplet['decl_chunk_B']],
+                 knowledge_graph[triplet['call_chunk']]
+             ]
+             question = await self.make_interacting_entities_specific_questions_async(entity_A=triplet['entity_A'],
+                                                                                      entity_B=triplet['entity_B'],
+                                                                                      decl_chunk_A=knowledge_graph[triplet['decl_chunk_A']],
+                                                                                      decl_chunk_B=knowledge_graph[triplet['decl_chunk_B']],
+                                                                                      call_chunk=knowledge_graph[triplet['call_chunk']])
+             answer = await self.answer_question_about_chunks_async(chunks, question)
+             return {
+                 'question': question,
+                 'clean_question': question,
+                 'answer': answer,
+                 'entities': (triplet['entity_A'], triplet['entity_B']),
+                 'chunks': [chunk.dict() for chunk in chunks],
+                 'category': 'interacting_entities'
+             }
+
+         # Batch processing with tqdm
+         batch_size = 15
+         results = []
+         total = len(triplets)
+         for i in tqdm(range(0, total, batch_size), desc="Generating interacting entity questions", unit="batch"):
+             batch = triplets[i:i+batch_size]
+             tasks = [process_triplet(triplet) for triplet in batch]
+             batch_results = []
+             for coro in tqdm(asyncio.as_completed(tasks), total=len(tasks), desc="Questions in batch", leave=False):
+                 batch_results.append(await coro)
+             results.extend(batch_results)
+         return results
+
+     async def _generate_neighboring_question_from_chunks_async(self, chunks: list) -> str:
+         """
+         Generates a single code comprehension question for a group of code chunks using the model service.
+         The question is designed to probe deep understanding of the code's mechanisms, design, or pitfalls.
+
+         Args:
+             chunks (list): The list of code chunks to generate the question from.
+         Returns:
+             str: The generated question as plain text.
+         """
+         organized_chunks = organize_chunks_by_file_name(chunks)
+         joined_chunks = join_organized_chunks(organized_chunks)
+
+         system_prompt = """You are an expert in evaluating code comprehension. The user will provide, in the next message, the content of a code submission (in any programming language). Your goal is to analyze this code, identify its critical, subtle, or obscure aspects, and generate **one relevant question in English** to ask someone in order to assess their understanding of the code.
+
+ This question should focus on:
+
+ * essential mechanisms of how the code works,
+ * important design decisions,
+ * potential pitfalls or unexpected behaviors,
+ * or any aspect that requires deep comprehension.
+
+ The goal is to test whether the person has **truly understood** the code, not just skimmed through it.
+
+ Respond with **only one question**, in plain text. Do not include any explanation, comment, or wrapper (e.g., no dictionaries, no lists).
+ """
+         initial_question = await self.model_service.query_with_instructions_async(instructions=system_prompt, prompt=joined_chunks)
+         return await self.extract_question_from_generated_text_async(generated_text=initial_question)
+
+     async def answer_question_about_chunks_async(self, chunks: list, question: str) -> str:
+         """
+         Generates an answer to a code comprehension question about a group of code chunks using the model service.
+         The answer should demonstrate deep understanding and cover mechanisms, design, and pitfalls.
+
+         Args:
+             chunks (list): The list of code chunks to answer the question about.
+             question (str): The question to answer.
+         Returns:
+             str: The generated answer as plain text.
+         """
+         organized_chunks = organize_chunks_by_file_name(chunks)
+         joined_chunks = join_organized_chunks(organized_chunks)
+
+         system_prompt = """You are an expert in evaluating code comprehension. The user will provide, in the next message, the content of a code submission (in any programming language) and a question about it. Your goal is to analyze this code, identify its critical, subtle, or obscure aspects, and generate **one relevant answer in English** to the question.
+
+ This answer should focus on:
+
+ * essential mechanisms of how the code works,
+ * important design decisions,
+ * potential pitfalls or unexpected behaviors,
+ * or any aspect that requires deep comprehension.
+ The goal is to provide a clear and thorough answer that demonstrates a deep understanding of the code.
+ """
+
+         return await self.model_service.query_with_instructions_async(instructions=system_prompt, prompt=joined_chunks + "\n\n" + question)
+
+     async def make_entity_specific_question_async(self, chunks: list, entity_name: str) -> str:
+         """
+         Generates a question about a specific entity (e.g., function, class) in the context of the provided code chunks.
+         The question is designed to probe understanding of the entity's purpose, behavior, and interactions.
+
+         Args:
+             chunks (list): The list of code chunks to generate the question from.
+             entity_name (str): The name of the entity to focus on.
+         Returns:
+             str: The generated question as plain text.
+         """
+         organized_chunks = organize_chunks_by_file_name(chunks)
+         joined_chunks = join_organized_chunks(organized_chunks)
+
+         system_prompt = f"""You will be given one or more code snippets, possibly from multiple files.
+
+ A specific entity (such as a class, function, or variable) will be identified.
+
+ ---
+
+ ## Entity of Focus: {entity_name}
+
+ ### Task:
+ * Write **one clear and concise question** about this entity.
+ * The question should highlight something a developer might consider, such as its purpose, behavior, interactions, or potential improvements.
+
+ ### Guidelines:
+ * Keep the question short and direct.
+ * Do not explain the code or give an answer.
+
+ ### Output:
+ **Question**: <your question here>
+ """
+
+         initial_question = await self.model_service.query_with_instructions_async(instructions=system_prompt, prompt=joined_chunks)
+         return await self.extract_question_from_generated_text_async(generated_text=initial_question)
+
+     def get_entities_with_declaration_and_calling(self, knowledge_graph: RepoKnowledgeGraph, cross_file_only: bool = True) -> list:
+         """
+         Finds all entities in the knowledge graph that have both a declaration and at least one call site.
+         Optionally restricts to cases where the declaration and call are in different files (cross-file).
+
+         Args:
+             knowledge_graph (RepoKnowledgeGraph): The knowledge graph to search in.
+             cross_file_only (bool): If True, only consider cross-file declaration/call pairs.
+         Returns:
+             list: List of dictionaries with 'entity', 'declaring_chunk_id', and 'calling_chunk_id'.
+         """
+         candidate_pairs = []
+         entities = knowledge_graph.entities
+         for entity_name in entities:
+             entity = entities[entity_name]
+             if len(entity['declaring_chunk_ids']) and len(entity['calling_chunk_ids']):
+                 found = False
+                 for declaring_chunk_id in entity['declaring_chunk_ids']:
+                     for calling_chunk_id in entity['calling_chunk_ids']:
+                         if declaring_chunk_id != calling_chunk_id:
+                             if cross_file_only and extract_filename_from_chunk(knowledge_graph[declaring_chunk_id]) == extract_filename_from_chunk(knowledge_graph[calling_chunk_id]):
+                                 continue
+                             else:
+                                 candidate_pairs.append({'entity': entity_name, 'declaring_chunk_id': declaring_chunk_id, 'calling_chunk_id': calling_chunk_id})
+                                 found = True
+                                 break
+                     if found:
+                         break
+         return candidate_pairs
+
+     def get_interacting_entity_triplets(self, knowledge_graph: RepoKnowledgeGraph) -> list:
+         """
+         Finds triplets of chunk ids such that:
+         - Two entities (A, B) are interacting in the same chunk (call_chunk)
+         - Each entity has a declaring chunk (decl_chunk_A, decl_chunk_B)
+         - Both entities have non-empty declaring_chunk_ids and calling_chunk_ids
+
+         Returns:
+             list of dicts with keys:
+                 'entity_A', 'entity_B', 'decl_chunk_A', 'decl_chunk_B', 'call_chunk'
+         """
+         triplets = []
+         seen_pairs = set()
+         entities = knowledge_graph.entities
+         for entity_A_name, entity_A in entities.items():
+             if not entity_A['declaring_chunk_ids'] or not entity_A['calling_chunk_ids']:
+                 continue
+             for entity_B_name, entity_B in entities.items():
+                 if entity_A_name == entity_B_name:
+                     continue
+                 if not entity_B['declaring_chunk_ids'] or not entity_B['calling_chunk_ids']:
+                     continue
+                 pair_key = (entity_A_name, entity_B_name)
+                 if pair_key in seen_pairs:
+                     continue
+                 # Find intersection of calling_chunk_ids
+                 call_chunks = set(entity_A['calling_chunk_ids']) & set(entity_B['calling_chunk_ids'])
+                 found = False
+                 for call_chunk in call_chunks:
+                     for decl_chunk_A in entity_A['declaring_chunk_ids']:
+                         for decl_chunk_B in entity_B['declaring_chunk_ids']:
+                             triplets.append({
+                                 'entity_A': entity_A_name,
+                                 'entity_B': entity_B_name,
+                                 'decl_chunk_A': decl_chunk_A,
+                                 'decl_chunk_B': decl_chunk_B,
+                                 'call_chunk': call_chunk
+                             })
+                             seen_pairs.add(pair_key)
+                             found = True
+                             break
+                         if found:
+                             break
+                     if found:
+                         break
+         return triplets
+
+     async def extract_question_from_generated_text_async(self, generated_text: str) -> str:
+         """
+         Extracts only the question from the generated text by prompting the model,
+         stripping any surrounding labels or commentary.
+
+         Args:
+             generated_text (str): The text generated by the model.
+         Returns:
+             str: The extracted question.
+         """
+         prompt = f"Extract only the question from the following text. Return the question exactly, with no extra words or labels:\n\n{generated_text}\n\n"
+         return await self.model_service.query_async(prompt=prompt)
+
+     def select_diverse_candidates(self, candidate_pairs, candidate_triplets, max_pairs=20, max_triplets=20):
+         """
+         Selects a limited number of pairs and triplets with maximum diversity in entity representation.
+         Args:
+             candidate_pairs (list): List of candidate pairs (dicts with 'entity', ...).
+             candidate_triplets (list): List of candidate triplets (dicts with 'entities', ...).
+             max_pairs (int): Maximum number of pairs to select.
+             max_triplets (int): Maximum number of triplets to select.
+         Returns:
+             (list, list): Selected pairs and triplets.
+         """
+         # Select pairs
+         selected_pairs = []
+         used_entities = set()
+         for pair in candidate_pairs:
+             entity = pair['entity']
+             if entity not in used_entities:
+                 selected_pairs.append(pair)
+                 used_entities.add(entity)
+             if len(selected_pairs) >= max_pairs:
+                 break
+         # Select triplets
+         selected_triplets = []
+         used_entities_triplets = set()
+         for triplet in candidate_triplets:
+             entities = set(triplet['entities'])
+             if not entities & used_entities_triplets:
+                 selected_triplets.append(triplet)
+                 used_entities_triplets.update(entities)
+             if len(selected_triplets) >= max_triplets:
+                 break
+         return selected_pairs, selected_triplets
+
+     async def transform_answser_into_mcq_answer_async(self, question, answer, chunks):
+         """
+         Transforms the question and answer into a format suitable for MCQ generation.
+         """
+         code = join_organized_chunks(organize_chunks_by_file_name(chunks))
+
+         prompt = f"""
+ You are an expert Python developer and technical writer. I will give you:
+
+ 1. A Python code snippet
+ 2. A question about that code
+ 3. A detailed answer to the question
+
+ Your task is to **sanitize** the answer. That means:
+
+ - Strip away all fluff, filler, and redundant explanation
+ - Focus only on what directly answers the question
+ - Make it **short, clear, and direct**, as if it were a correct MCQ answer
+ - Prefer concise phrases or a single clear sentence over paragraph explanations
+ - Keep any necessary technical detail, but no more than needed
+
+ Do **not** repeat the question. Do **not** rephrase the code. Just give the concise, final answer.
+
+ - **Input Code**:
+ {code}
+
+ - **Question**:
+ {question}
+
+ - **Original Answer**:
+ {answer}
+
+ - **Sanitized Answer**:
+ """
+         return await self.model_service.query_async(prompt)
+
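A minimal end-to-end sketch for the generators above, assuming a graph built with `RepoKnowledgeGraph.from_path` (the repository path is a placeholder, and entity extraction is enabled since the entity-based generators read `knowledge_graph.entities`):

    import asyncio
    from RepoKnowledgeGraphLib.RepoKnowledgeGraph import RepoKnowledgeGraph
    from RepoKnowledgeGraphLib.QuestionMaker import QuestionMaker

    # './my_repo' is a placeholder path
    graph = RepoKnowledgeGraph.from_path('./my_repo', extract_entities=True)
    qm = QuestionMaker()
    qa_pairs = asyncio.run(qm.make_entity_declaration_call_specific_questions_async(graph))
    for qa in qa_pairs[:3]:
        print(qa['question'], '->', qa['answer'])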
RepoKnowledgeGraphLib/RepoKnowledgeGraph.py ADDED
@@ -0,0 +1,1608 @@
1
+ import networkx as nx
2
+ import json
3
+ import os
4
+ import asyncio
5
+ import nest_asyncio
6
+ import tqdm
7
+ # from pathlib import Path
8
+ import os.path
9
+ import tempfile
10
+ import subprocess
11
+ from typing import List, Optional, Dict
12
+ import logging
13
+ import urllib.parse
14
+
15
+ from .ModelService import create_model_service
16
+ from .Node import Node, DirectoryNode, FileNode, ChunkNode, EntityNode
17
+ from .CodeParser import CodeParser
18
+ from .EntityExtractor import HybridEntityExtractor
19
+ from .CodeIndex import CodeIndex
20
+ from .utils.logger_utils import setup_logger
21
+ from .utils.parsing_utils import read_directory_files_recursively, get_language_from_filename
22
+ from .utils.path_utils import prepare_input_path, build_entity_alias_map, resolve_entity_call
23
+ from .EntityChunkMapper import EntityChunkMapper
24
+
25
+ LOGGER_NAME = 'REPO_KNOWLEDGE_GRAPH_LOGGER'
26
+
27
+ MODEL_SERVICE_TYPES = ['openai', 'sentence-transformers']
28
+
29
+
30
+ # A RepoKnowledgeGraph is a weighted DAG based on a tree-structure with added edges
31
+ class RepoKnowledgeGraph:
32
+ """
33
+ RepoKnowledgeGraph builds a knowledge graph of a code repository.
34
+ It parses source files, extracts code entities and relationships, and organizes them
35
+ into a directed acyclic graph (DAG) with additional semantic edges.
36
+
37
+ Use `from_path()` or `load_graph_from_file()` to create instances.
38
+ """
39
+
40
+ def __init__(self):
41
+ """
42
+ Private constructor. Use from_path() or load_graph_from_file() instead.
43
+ """
44
+ raise RuntimeError(
45
+ "Cannot instantiate RepoKnowledgeGraph directly. "
46
+ "Use RepoKnowledgeGraph.from_path() or RepoKnowledgeGraph.load_graph_from_file() instead."
47
+ )
48
+
49
+ def _initialize(self, model_service_kwargs: dict, code_index_kwargs: Optional[dict] = None):
50
+ """Internal initialization method."""
51
+ setup_logger(LOGGER_NAME)
52
+ self.logger = logging.getLogger(LOGGER_NAME)
53
+ self.logger.info('Initializing RepoKnowledgeGraph instance.')
54
+ self.code_parser = CodeParser()
55
+
56
+ # Determine if we should skip loading the embedder based on index_type
57
+ index_type = (code_index_kwargs or {}).get('index_type', 'hybrid')
58
+ skip_embedder = index_type == 'keyword-only'
59
+ if skip_embedder:
60
+ self.logger.info('Using keyword-only index, skipping embedder initialization')
61
+
62
+ self.model_service = create_model_service(skip_embedder=skip_embedder, **model_service_kwargs)
63
+ self.entities = {}
64
+ self.graph = nx.DiGraph()
65
+ self.knowledge_graph = nx.DiGraph()
66
+ self.code_index = None
67
+ self.entity_extractor = HybridEntityExtractor()
68
+
69
+ def __iter__(self):
70
+ # Yield only the 'data' attribute from each node
71
+ return (node_data['data'] for _, node_data in self.graph.nodes(data=True))
72
+
73
+ def __getitem__(self, node_id):
74
+ return self.graph.nodes[node_id]['data']
75
+
76
+
77
+     @classmethod
+     def from_path(cls, path: str, skip_dirs: Optional[list] = None, index_nodes: bool = True, describe_nodes=False,
+                   extract_entities: bool = False, model_service_kwargs: Optional[dict] = None, code_index_kwargs: Optional[dict] = None):
+         """
+         Alternative constructor to build a RepoKnowledgeGraph from a path, with options to skip directories
+         and control entity extraction and node description.
+
+         Args:
+             path (str): Path to the root of the code repository.
+             skip_dirs (list): List of directory names to skip.
+             index_nodes (bool): Whether to build a code index.
+             describe_nodes (bool): Whether to generate descriptions for code chunks.
+             extract_entities (bool): Whether to extract entities from code.
+             model_service_kwargs (dict, optional): Arguments forwarded to the model service factory.
+             code_index_kwargs (dict, optional): Arguments forwarded to the CodeIndex constructor.
+
+         Returns:
+             RepoKnowledgeGraph: The constructed knowledge graph.
+         """
+         if skip_dirs is None:
+             skip_dirs = []
+         if model_service_kwargs is None:
+             model_service_kwargs = {}
+         instance = cls.__new__(cls)  # Create instance without calling __init__
+         instance._initialize(model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)
+
+         instance.logger.info(f"Preparing to build knowledge graph from path: {path}")
+
+         prepared_path = prepare_input_path(path)
+         instance.logger.debug(f"Prepared input path: {prepared_path}")
+
+         # Handle a running event loop (e.g., in Jupyter)
+         try:
+             loop = asyncio.get_running_loop()
+         except RuntimeError:
+             loop = None
+
+         if loop and loop.is_running():
+             instance.logger.debug("Detected running event loop, applying nest_asyncio.")
+             nest_asyncio.apply()
+             task = instance._initial_parse_path_async(prepared_path, skip_dirs=skip_dirs, index_nodes=index_nodes,
+                                                       describe_nodes=describe_nodes, extract_entities=extract_entities)
+             loop.run_until_complete(task)
+         else:
+             instance.logger.debug("No running event loop, using asyncio.run.")
+             asyncio.run(instance._initial_parse_path_async(prepared_path, skip_dirs=skip_dirs, index_nodes=index_nodes,
+                                                            describe_nodes=describe_nodes,
+                                                            extract_entities=extract_entities))
+
+         instance.logger.info("Initial parse and node creation complete. Building relationships between nodes...")
+         instance._build_relationships()
+
+         if index_nodes:
+             instance.logger.info("Building code index for all nodes in the graph...")
+             instance.code_index = CodeIndex(list(instance), model_service=instance.model_service, **(code_index_kwargs or {}))
+
+         instance.logger.info("Knowledge graph construction from path completed successfully.")
+         return instance
+
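+     # Usage sketch (hedged): a minimal, hypothetical call; the local path and the
+     # skip list below are illustrative assumptions, not part of the library API.
+     #
+     #     kg = RepoKnowledgeGraph.from_path('./my_repo', skip_dirs=['tests'],
+     #                                       extract_entities=True)
+     #     kg.print_tree(max_depth=2)
+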
+     @classmethod
+     def from_repo(
+             cls,
+             repo_url: str,
+             skip_dirs: Optional[list] = None,
+             index_nodes: bool = True,
+             describe_nodes: bool = False,
+             extract_entities: bool = False,
+             model_service_kwargs: Optional[dict] = None,
+             code_index_kwargs: Optional[dict] = None,
+             github_token: Optional[str] = None,
+             allow_unauthenticated_clone: bool = True,
+     ):
+         """
+         Alternative constructor to build a RepoKnowledgeGraph from a remote git repository URL.
+
+         Args:
+             repo_url (str): Git repository URL (SSH or HTTPS).
+             skip_dirs (list): List of directory names to skip.
+             index_nodes (bool): Whether to build a code index.
+             describe_nodes (bool): Whether to generate descriptions for code chunks.
+             extract_entities (bool): Whether to extract entities from code.
+             github_token (str, optional): Personal access token to access private GitHub repos.
+                 If not provided, the method will look for the `GITHUB_OAUTH_TOKEN` environment variable.
+             allow_unauthenticated_clone (bool): If True, attempt to clone without a token when none is provided.
+                 If False, raise an error when no token is available.
+
+         Returns:
+             RepoKnowledgeGraph: The constructed knowledge graph.
+         """
+         if skip_dirs is None:
+             skip_dirs = []
+         if model_service_kwargs is None:
+             model_service_kwargs = {}
+
+         instance = cls.__new__(cls)
+         instance._initialize(model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)
+
+         instance.logger.info(f"Starting knowledge graph build from remote repository: {repo_url}")
+
+         # Determine token
+         token = github_token or os.environ.get('GITHUB_OAUTH_TOKEN')
+
+         with tempfile.TemporaryDirectory() as tmpdirname:
+             clone_url = repo_url
+             try:
+                 if repo_url.startswith('git@'):
+                     # Convert git@github.com:owner/repo.git -> https://github.com/owner/repo.git
+                     clone_url = repo_url.replace(':', '/').split('git@')[-1]
+                     clone_url = f'https://{clone_url}'
+
+                 if token and clone_url.startswith('https://'):
+                     encoded_token = urllib.parse.quote(token, safe='')
+                     clone_url = clone_url.replace('https://', f'https://{encoded_token}@')
+                 elif not token and not allow_unauthenticated_clone:
+                     raise ValueError(
+                         "GitHub token not provided and unauthenticated clone is disabled. "
+                         "Set allow_unauthenticated_clone=True or provide a token."
+                     )
+
+                 instance.logger.debug(f"Running git clone: {clone_url} -> {tmpdirname}")
+                 subprocess.run(['git', 'clone', clone_url, tmpdirname], check=True)
+
+             except Exception as e:
+                 instance.logger.error(f"Failed to clone repository {repo_url} using URL {clone_url}: {e}")
+                 raise
+
+             instance.logger.info(f"Repository successfully cloned to: {tmpdirname}")
+
+             # Build inside the `with` block so the temporary clone still exists
+             return cls.from_path(
+                 tmpdirname,
+                 skip_dirs=skip_dirs,
+                 index_nodes=index_nodes,
+                 describe_nodes=describe_nodes,
+                 extract_entities=extract_entities,
+                 model_service_kwargs=model_service_kwargs,
+                 code_index_kwargs=code_index_kwargs
+             )
+
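+     # Usage sketch (hedged): the repo URL below is a placeholder; the token is read
+     # from GITHUB_OAUTH_TOKEN when github_token is not passed explicitly.
+     #
+     #     kg = RepoKnowledgeGraph.from_repo('https://github.com/user/project.git',
+     #                                       skip_dirs=['docs'])
+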
+     async def _initial_parse_path_async(self, path: str, skip_dirs: list, index_nodes=True, describe_nodes=True,
+                                         extract_entities: bool = True):
+         """
+         Orchestrates the parsing and graph construction process:
+         1. Reads files and splits them into chunks.
+         2. Extracts entities and relationships.
+         3. Builds chunk, file, directory, and root nodes.
+         4. Aggregates entity information.
+
+         Args:
+             path (str): Root path to parse.
+             skip_dirs (list): Directories to skip.
+             index_nodes (bool): Whether to build a code index.
+             describe_nodes (bool): Whether to generate descriptions.
+             extract_entities (bool): Whether to extract entities.
+         """
+         self.logger.info(f"Beginning async parsing of repository at path: {path}")
+
+         # --- Pass 1: Create ChunkNodes ---
+         level1_node_contents = read_directory_files_recursively(
+             path, skip_dirs=skip_dirs,
+             skip_pattern=r"(?:\.log$|\.json$|(?:^|/)(?:\.git|\.idea|__pycache__|\.cache)(?:/|$)|(?:^|/)(?:changelog|ChangeLog)(?:\.[a-z0-9]+)?$|\.cache$)"
+         )
+         self.logger.debug(f"Found {len(level1_node_contents)} files to process.")
+         self.logger.info("Chunk nodes creation step started.")
+         chunk_info = await self._create_chunk_nodes(
+             level1_node_contents, extract_entities, describe_nodes, index_nodes, root_path=path
+         )
+         self.logger.info("Chunk nodes creation step finished.")
+         self.logger.info("File nodes creation step started.")
+         file_info = self._create_file_nodes(
+             chunk_info, level1_node_contents
+         )
+         self.logger.info("File nodes creation step finished.")
+         self.logger.info("Directory nodes creation step started.")
+         dir_agg = self._create_directory_nodes(
+             file_info
+         )
+         self.logger.info("Directory nodes creation step finished.")
+         self.logger.info("Aggregating all nodes to root node.")
+         self._aggregate_to_root(dir_agg)
+         self.logger.info("Async parse and node aggregation fully complete.")
+
+     async def _create_chunk_nodes(self, level1_node_contents, extract_entities, describe_nodes, index_nodes, root_path=None):
+         self.logger.info(f"Starting chunk node creation for {len(level1_node_contents)} files.")
+         accepted_extensions = {'.py', '.c', '.cpp', '.h', '.hpp', '.java', '.js', '.ts', '.jsx', '.tsx', '.rs', '.html'}
+         chunk_info = {}
+         entity_mapper = EntityChunkMapper()
+         total_chunks = 0
+
+         # Use tqdm for a progress bar over files
+         for file_path in tqdm.tqdm(level1_node_contents, desc="Processing files for chunk nodes"):
+             self.logger.debug(f"Processing file for chunk nodes: {file_path}")
+             _, ext = os.path.splitext(file_path)
+             is_code_file = ext.lower() in accepted_extensions
+
+             self.logger.debug(f"Parsing file: {file_path}")
+
+             # Parse file into chunks
+             parsed_content = self.code_parser.parse(file_name=file_path, file_content=level1_node_contents[file_path])
+             self.logger.debug(f"Parsed {len(parsed_content)} chunks from file: {file_path}")
+             total_chunks += len(parsed_content)
+
+             # Entity extraction (code files only)
+             if extract_entities and is_code_file:
+                 self.logger.debug(f"Extracting entities from code file: {file_path}")
+                 try:
+                     # Construct full path for entity extraction (needed for C/C++ include resolution)
+                     extraction_file_path = os.path.join(root_path, file_path) if root_path else file_path
+
+                     file_declared_entities, file_called_entities = self.entity_extractor.extract_entities(
+                         code=level1_node_contents[file_path], file_name=extraction_file_path)
+                     self.logger.debug(f"Extracted {len(file_declared_entities)} declared and {len(file_called_entities)} called entities from file: {file_path}")
+
+                     chunk_declared_map, chunk_called_map = entity_mapper.map_entities_to_chunks(
+                         file_declared_entities, file_called_entities, parsed_content, file_name=file_path)
+                     self.logger.debug(f"Mapped entities to {len(parsed_content)} chunks for file: {file_path}")
+                 except Exception as e:
+                     self.logger.error(f"Error extracting entities from {file_path}: {e}")
+                     file_declared_entities, file_called_entities = [], []
+                     chunk_declared_map = {i: [] for i in range(len(parsed_content))}
+                     chunk_called_map = {i: [] for i in range(len(parsed_content))}
+             else:
+                 self.logger.debug(f"Skipping entity extraction for non-code file: {file_path}")
+                 file_declared_entities, file_called_entities = [], []
+                 chunk_declared_map = {i: [] for i in range(len(parsed_content))}
+                 chunk_called_map = {i: [] for i in range(len(parsed_content))}
+
+             chunk_tasks = []
+             for i, chunk in enumerate(parsed_content):
+                 chunk_id = f'{file_path}_{i}'
+                 self.logger.debug(f"Scheduling processing for chunk {chunk_id} of file {file_path}")
+
+                 async def process_chunk(i=i, chunk=chunk, chunk_id=chunk_id):
+                     self.logger.debug(f"Creating chunk node: {chunk_id}")
+                     declared_entities = chunk_declared_map.get(i, [])
+                     called_entities = chunk_called_map.get(i, [])
+
+                     # FIRST PASS: Register all declared entities with aliases
+                     # Build temporary alias map for checking existing entities
+                     temp_alias_map = build_entity_alias_map(self.entities)
+
+                     for entity in declared_entities:
+                         name = entity.get("name")
+                         if not name:
+                             continue
+
+                         # Check if this entity already exists under any of its aliases
+                         entity_aliases = entity.get("aliases", [])
+                         canonical_name = None
+
+                         # First check if the name itself already exists or is an alias
+                         if name in temp_alias_map:
+                             canonical_name = temp_alias_map[name]
+                             self.logger.debug(f"Entity '{name}' already exists as '{canonical_name}'")
+                         else:
+                             # Check if any of the entity's aliases match existing entities
+                             for alias in entity_aliases:
+                                 if alias in temp_alias_map:
+                                     canonical_name = temp_alias_map[alias]
+                                     self.logger.debug(f"Entity '{name}' matches existing entity '{canonical_name}' via alias '{alias}'")
+                                     break
+
+                         # If we found a match, use the canonical name; otherwise use the entity name
+                         if canonical_name:
+                             entity_key = canonical_name
+                         else:
+                             entity_key = name
+                             self.logger.debug(f"Registering new declared entity '{name}' in chunk {chunk_id}")
+                             self.entities[entity_key] = {
+                                 "declaring_chunk_ids": [],
+                                 "calling_chunk_ids": [],
+                                 "type": [],
+                                 "dtype": None,
+                                 "aliases": []
+                             }
+                             # Update temp alias map with new entity
+                             temp_alias_map[entity_key] = entity_key
+
+                         if chunk_id not in self.entities[entity_key]["declaring_chunk_ids"]:
+                             self.entities[entity_key]["declaring_chunk_ids"].append(chunk_id)
+                         entity_type = entity.get("type")
+                         if entity_type and entity_type not in self.entities[entity_key]["type"]:
+                             self.entities[entity_key]["type"].append(entity_type)
+                         dtype = entity.get("dtype")
+                         if dtype:
+                             self.entities[entity_key]["dtype"] = dtype
+                         # Store aliases (add new ones, avoiding duplicates)
+                         for alias in [name] + entity_aliases:
+                             if alias and alias not in self.entities[entity_key]["aliases"]:
+                                 self.entities[entity_key]["aliases"].append(alias)
+                                 temp_alias_map[alias] = entity_key  # Update temp map
+                         self.logger.debug(f"Declared entity '{name}' registered as '{entity_key}' in chunk {chunk_id} with aliases: {self.entities[entity_key]['aliases']}")
+
+                     # Optionally generate a natural-language description for the chunk
+                     if describe_nodes:
+                         self.logger.info(f"Generating description for chunk {chunk_id}")
+                         try:
+                             description = await self.model_service.query_async(
+                                 f'Summarize this {get_language_from_filename(file_path)} code chunk in a few sentences: {chunk}')
+                         except Exception as e:
+                             self.logger.error(f"Error generating description for chunk {chunk_id}: {e}")
+                             description = ''
+                     else:
+                         self.logger.debug(f"No description requested for chunk {chunk_id}")
+                         description = ''
+
+                     chunk_node = ChunkNode(
+                         id=chunk_id,
+                         name=chunk_id,
+                         path=file_path,
+                         content=chunk,
+                         order_in_file=i,
+                         called_entities=called_entities,
+                         declared_entities=declared_entities,
+                         language=get_language_from_filename(file_path),
+                         description=description,
+                     )
+                     self.logger.debug(f"Chunk node created: {chunk_id}")
+
+                     # NOTE: Embeddings are deferred to CodeIndex for efficient batch processing.
+                     # This avoids slow one-at-a-time embedding during chunk creation.
+                     chunk_node.embedding = None
+                     return (chunk_id, chunk_node, declared_entities, called_entities)
+
+                 chunk_tasks.append(process_chunk())
+
+             chunk_results = await asyncio.gather(*chunk_tasks)
+             self.logger.debug(f"Finished processing {len(chunk_results)} chunks for file {file_path}.")
+             chunk_info[file_path] = {
+                 'chunk_results': chunk_results,
+                 'file_declared_entities': file_declared_entities,
+                 'file_called_entities': file_called_entities
+             }
+
+         # Log summary
+         self.logger.info(f"Created {total_chunks} chunk nodes from {len(level1_node_contents)} files")
+
+         # SECOND PASS: Now that all declared entities are registered, resolve called entities
+         self.logger.info("Starting second pass: resolving called entities using alias map...")
+         alias_map = build_entity_alias_map(self.entities)
+         self.logger.info(f"Built alias map with {len(alias_map)} entries for resolution")
+
+         resolved_count = 0
+         for file_path, file_data in tqdm.tqdm(chunk_info.items(), desc="Resolving called entities"):
+             chunk_results = file_data['chunk_results']
+             for chunk_id, chunk_node, declared_entities, called_entities in chunk_results:
+                 for called_name in called_entities:
+                     # Skip empty or whitespace-only names
+                     if not called_name or not called_name.strip():
+                         continue
+
+                     # Try to resolve this called entity to an existing declared entity using aliases
+                     resolved_name = resolve_entity_call(called_name, alias_map)
+
+                     # Use the resolved name if found, otherwise check if called_name is already an alias
+                     if resolved_name:
+                         entity_key = resolved_name
+                     elif called_name in alias_map:
+                         # The called_name itself is an alias of an existing entity
+                         entity_key = alias_map[called_name]
+                     else:
+                         # No match found, use the original called name
+                         entity_key = called_name
+
+                     if entity_key not in self.entities:
+                         self.logger.debug(f"Registering new called entity '{entity_key}' (called as '{called_name}') in chunk {chunk_id}")
+                         self.entities[entity_key] = {
+                             "declaring_chunk_ids": [],
+                             "calling_chunk_ids": [],
+                             "type": [],
+                             "dtype": None,
+                             "aliases": []
+                         }
+                         # Add called_name as an alias if it's different from entity_key
+                         if called_name != entity_key:
+                             self.entities[entity_key]["aliases"].append(called_name)
+                             alias_map[called_name] = entity_key  # Update alias map
+
+                     if chunk_id not in self.entities[entity_key]["calling_chunk_ids"]:
+                         self.entities[entity_key]["calling_chunk_ids"].append(chunk_id)
+
+                     if resolved_name and resolved_name != called_name:
+                         resolved_count += 1
+                         self.logger.debug(f"Called entity '{called_name}' resolved to '{entity_key}' in chunk {chunk_id}")
+
+         self.logger.info(f"Resolved {resolved_count} entity calls to existing declarations via aliases")
+         self.logger.info("All chunk nodes have been created for all files.")
+         return chunk_info
+
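+     # Illustration of the two-pass alias resolution above (hedged; entity names are
+     # hypothetical): after pass 1 registers `module.MyClass` with alias `MyClass`, the
+     # alias map contains {'module.MyClass': 'module.MyClass', 'MyClass': 'module.MyClass'},
+     # so a call recorded as `MyClass` in pass 2 resolves to the canonical `module.MyClass`.
+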
+     def _create_file_nodes(self, chunk_info, level1_node_contents):
+         """
+         For each file, aggregate chunk information and create FileNode objects.
+         """
+         self.logger.info("Starting file node creation.")
+
+         file_info = {}
+         for file_path, file_data in tqdm.tqdm(chunk_info.items(), desc="Creating file nodes"):
+             self.logger.info(f"Creating file node for: {file_path}")
+             parts = os.path.normpath(file_path).split(os.sep)
+
+             # Extract file-level entities and chunk results from the stored data
+             chunk_results = file_data['chunk_results']
+             file_declared_entities = list(file_data['file_declared_entities'])  # Use file-level entities directly
+             file_called_entities = list(file_data['file_called_entities'])  # Use file-level entities directly
+             chunk_ids = []
+
+             for chunk_id, chunk_node, declared_entities, called_entities in chunk_results:
+                 self.logger.debug(f"Adding chunk node {chunk_id} to graph for file {file_path}")
+                 self.graph.add_node(chunk_id, data=chunk_node, level=2)
+                 chunk_ids.append(chunk_id)
+                 # Note: The FileNode uses file-level entities, so there is no need to merge
+                 # entities up from chunks; the chunks already have their entities set correctly.
+
+             file_node = FileNode(
+                 id=file_path,
+                 name=parts[-1],
+                 path=file_path,
+                 node_type='file',
+                 content=level1_node_contents[file_path],
+                 declared_entities=file_declared_entities,
+                 called_entities=file_called_entities,
+                 language=get_language_from_filename(file_path),
+             )
+
+             self.logger.debug(f"Adding file node {file_path} to graph.")
+             self.graph.add_node(file_path, data=file_node, level=1)
+             for chunk_id in chunk_ids:
+                 self.graph.add_edge(file_path, chunk_id, relation='contains')
+
+             file_info[file_path] = {
+                 'declared_entities': file_declared_entities,
+                 'called_entities': file_called_entities,
+                 'chunk_ids': chunk_ids,
+                 'parts': parts,
+             }
+             self.logger.info(f"File node {file_path} added to graph with {len(chunk_ids)} chunks.")
+
+         self.logger.info("All file nodes have been created.")
+         return file_info
+
+     def _create_directory_nodes(self, file_info):
+         """
+         For each directory, aggregate file information and create DirectoryNode objects.
+
+         Args:
+             file_info (dict): Mapping file_path -> file info dict.
+
+         Returns:
+             dict: Mapping dir_path -> aggregated entity info.
+         """
+         self.logger.info("Starting directory node creation.")
+
+         def merge_entities(target, source):
+             # Merge entity lists, avoiding duplicates by (name, type)
+             existing = set((e.get('name'), e.get('type')) for e in target)
+             for e in source:
+                 k = (e.get('name'), e.get('type'))
+                 if k not in existing:
+                     target.append(e)
+                     existing.add(k)
+
+         def merge_called_entities(target, source):
+             # Merge called entity lists, avoiding duplicates
+             existing = set(target)
+             for e in source:
+                 if e not in existing:
+                     target.append(e)
+                     existing.add(e)
+
+         dir_agg = {}
+         for file_path, info in tqdm.tqdm(file_info.items(), desc="Creating directory nodes"):
+             self.logger.info(f"Processing directory nodes for file: {file_path}")
+             parts = os.path.normpath(file_path).split(os.sep)
+             file_declared_entities = info['declared_entities']
+             file_called_entities = info['called_entities']
+             current_parent = 'root'
+             path_accum = ''
+             for part in parts[:-1]:  # Skip the file itself
+                 path_accum = os.path.join(path_accum, part) if path_accum else part
+                 if path_accum not in self.graph:
+                     self.logger.info(f"Adding new directory node: {path_accum}")
+                     dir_node = DirectoryNode(id=path_accum, name=part, path=path_accum)
+                     self.graph.add_node(path_accum, data=dir_node, level=1)
+                     self.graph.add_edge(current_parent, path_accum, relation='contains')
+                 if path_accum not in dir_agg:
+                     dir_agg[path_accum] = {'declared_entities': [], 'called_entities': []}
+                 merge_entities(dir_agg[path_accum]['declared_entities'], file_declared_entities)
+                 merge_called_entities(dir_agg[path_accum]['called_entities'], file_called_entities)
+                 current_parent = path_accum
+             # Connect file to its parent directory
+             self.graph.add_edge(current_parent, file_path, relation='contains')
+         self.logger.info("All directory nodes created.")
+         return dir_agg
+
+     def _aggregate_to_root(self, dir_agg):
+         """
+         Aggregate all directory entity information to the root node.
+
+         Args:
+             dir_agg (dict): Mapping dir_path -> aggregated entity info.
+         """
+         self.logger.info("Aggregating directory information to root node.")
+
+         def merge_entities(target, source):
+             # Merge entity lists, avoiding duplicates by (name, type)
+             existing = set((e.get('name'), e.get('type')) for e in target)
+             for e in source:
+                 k = (e.get('name'), e.get('type'))
+                 if k not in existing:
+                     target.append(e)
+                     existing.add(k)
+
+         def merge_called_entities(target, source):
+             # Merge called entity lists, avoiding duplicates
+             existing = set(target)
+             for e in source:
+                 if e not in existing:
+                     target.append(e)
+                     existing.add(e)
+
+         root_node = Node(id='root', name='root', node_type='root')
+         self.graph.add_node('root', data=root_node, level=0)
+         root_declared_entities = []
+         root_called_entities = []
+         for dir_path, agg in tqdm.tqdm(dir_agg.items(), desc="Aggregating to root"):
+             node = self.graph.nodes[dir_path]['data']
+             if not hasattr(node, 'declared_entities'):
+                 node.declared_entities = []
+             if not hasattr(node, 'called_entities'):
+                 node.called_entities = []
+             merge_entities(node.declared_entities, agg['declared_entities'])
+             merge_called_entities(node.called_entities, agg['called_entities'])
+             merge_entities(root_declared_entities, agg['declared_entities'])
+             merge_called_entities(root_called_entities, agg['called_entities'])
+         if not hasattr(root_node, 'declared_entities'):
+             root_node.declared_entities = []
+         if not hasattr(root_node, 'called_entities'):
+             root_node.called_entities = []
+         merge_entities(root_node.declared_entities, root_declared_entities)
+         merge_called_entities(root_node.called_entities, root_called_entities)
+         self.logger.info("Aggregation to root node complete.")
+
+     def _build_relationships(self):
+         """
+         Build relationships between chunk nodes and entity nodes based on self.entities.
+         For each entity in self.entities:
+         1. Create an EntityNode with entity_name as the id.
+         2. Create edges from declaring chunks to the entity node ('declares' relationship).
+         3. Create edges from the entity node to calling chunks ('called_by' relationship).
+         4. Resolve called entity names using aliases for better matching.
+         """
+         self.logger.info("Building relationships between chunk nodes based on entities.")
+         edges_created = 0
+         entity_nodes_created = 0
+
+         # Build alias map for quick lookups
+         self.logger.info("Building entity alias map for call resolution...")
+         alias_map = build_entity_alias_map(self.entities)
+         self.logger.info(f"Built alias map with {len(alias_map)} entries")
+
+         # First pass: Create all entity nodes
+         for entity_name, info in tqdm.tqdm(self.entities.items(), desc="Creating entity nodes"):
+             # Entity type is stored as a list under 'type'; take the first type or empty string
+             entity_types = info.get('type', [])
+             entity_type = entity_types[0] if entity_types else ''
+             declaring_chunks = info.get('declaring_chunk_ids', [])
+             calling_chunks = info.get('calling_chunk_ids', [])
+             aliases = info.get('aliases', [])
+
+             # Create EntityNode with entity_name as id
+             entity_node = EntityNode(
+                 id=entity_name,
+                 name=entity_name,
+                 entity_type=entity_type,
+                 declaring_chunk_ids=declaring_chunks,
+                 calling_chunk_ids=calling_chunks,
+                 aliases=aliases
+             )
+
+             # Add entity node to graph
+             self.graph.add_node(entity_name, data=entity_node, level=3)
+             entity_nodes_created += 1
+
+             # Log aliases for debugging
+             if aliases:
+                 self.logger.debug(f"Created EntityNode '{entity_name}' with aliases: {aliases}")
+
+             # Create edges from declaring chunks to entity node
+             for declarer_id in declaring_chunks:
+                 if declarer_id in self.graph:
+                     self.graph.add_edge(declarer_id, entity_name, relation='declares')
+                     edges_created += 1
+
+             # Create edges from entity node to calling chunks
+             for caller_id in calling_chunks:
+                 if caller_id in self.graph and caller_id not in declaring_chunks:
+                     self.graph.add_edge(entity_name, caller_id, relation='called_by')
+                     edges_created += 1
+
+         # Second pass: Resolve unmatched entity calls using alias matching
+         self.logger.info("Resolving entity calls using alias matching...")
+         resolved_calls = 0
+
+         for entity_name, info in tqdm.tqdm(self.entities.items(), desc="Resolving entity calls"):
+             # Skip entities that already have declarations (they were matched directly)
+             if info.get('declaring_chunk_ids'):
+                 continue
+
+             # Try to resolve this called entity to a declared entity using aliases
+             resolved_name = resolve_entity_call(entity_name, alias_map)
+
+             if resolved_name and resolved_name != entity_name:
+                 # Found a match! Update the calling_chunk_ids of the resolved entity
+                 calling_chunks = info.get('calling_chunk_ids', [])
+
+                 if resolved_name in self.entities:
+                     for caller_id in calling_chunks:
+                         if caller_id in self.graph:
+                             # Add edge from resolved entity to calling chunk
+                             if not self.graph.has_edge(resolved_name, caller_id):
+                                 self.graph.add_edge(resolved_name, caller_id, relation='called_by')
+                                 edges_created += 1
+                                 resolved_calls += 1
+                                 self.logger.debug(f"Resolved call: '{entity_name}' -> '{resolved_name}' in chunk {caller_id}")
+
+         self.logger.info(f"_build_relationships: Created {entity_nodes_created} entity nodes, "
+                          f"{edges_created} edges, and resolved {resolved_calls} entity calls using aliases.")
+
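+     # Resulting edge pattern (illustrative, with hypothetical ids):
+     #     'utils.py_0' --declares--> 'parse_config' --called_by--> 'main.py_2'
+     # i.e. declaring chunks point at the entity node, which points at its callers.
+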
+     def get_entity_by_alias(self, alias: str) -> Optional[str]:
+         """
+         Get the canonical entity name for a given alias.
+
+         Args:
+             alias: An alias of an entity (e.g., 'MyClass' or 'module.MyClass')
+
+         Returns:
+             Canonical entity name if found, None otherwise
+         """
+         alias_map = build_entity_alias_map(self.entities)
+         return alias_map.get(alias)
+
+     def resolve_entity_references(self) -> Dict[str, str]:
+         """
+         Resolve all entity references in the knowledge graph using aliases.
+
+         Returns:
+             Dictionary mapping each unresolved called entity name to its resolved
+             canonical entity name.
+         """
+         alias_map = build_entity_alias_map(self.entities)
+         resolutions = {}
+
+         for entity_name, info in self.entities.items():
+             # Only look at entities that are called but not declared
+             if not info.get('declaring_chunk_ids') and info.get('calling_chunk_ids'):
+                 resolved = resolve_entity_call(entity_name, alias_map)
+                 if resolved:
+                     resolutions[entity_name] = resolved
+
+         return resolutions
+
+     def print_tree(self, max_depth=None, start_node_id='root', level=0, prefix=""):
+         """
+         Print the repository tree structure using the graph with 'contains' edges.
+
+         Args:
+             max_depth (int, optional): Maximum depth to print. None = unlimited.
+             start_node_id (str): ID of the node to start from. Default is 'root'.
+             level (int): Internal use only (used for recursion).
+             prefix (str): Internal use only (used for formatting output).
+         """
+         if max_depth is not None and level > max_depth:
+             self.logger.debug(f"Max depth {max_depth} reached at node {start_node_id}.")
+             return
+
+         if start_node_id not in self.graph:
+             self.logger.warning(f"Start node '{start_node_id}' not found in graph.")
+             return
+
+         try:
+             node_data = self[start_node_id]
+         except KeyError as e:
+             self.logger.error(f"KeyError when accessing node {start_node_id}: {e}")
+             self.logger.error(f"Available node attributes: {list(self.graph.nodes[start_node_id].keys())}")
+             # Fall back to a synthesized node when the 'data' attribute is missing
+             if 'data' not in self.graph.nodes[start_node_id]:
+                 self.logger.warning(f"Node {start_node_id} has no 'data' attribute, creating a fallback node")
+                 if start_node_id == 'root':
+                     # Create a default root node
+                     node_data = Node(id='root', name='root', node_type='root')
+                     # Update the graph node with the fallback data
+                     self.graph.nodes[start_node_id]['data'] = node_data
+                 else:
+                     # Try to infer the node type from the ID or structure
+                     name = start_node_id.split('/')[-1] if '/' in start_node_id else start_node_id
+                     if '_' in start_node_id and start_node_id.split('_')[-1].isdigit():
+                         # Looks like a chunk ID
+                         node_data = ChunkNode(id=start_node_id, name=name, node_type='chunk')
+                     elif '.' in name:
+                         # Looks like a file
+                         node_data = FileNode(id=start_node_id, name=name, node_type='file', path=start_node_id)
+                     else:
+                         # Fall back to a directory node
+                         node_data = DirectoryNode(id=start_node_id, name=name, node_type='directory',
+                                                   path=start_node_id)
+                     # Update the graph node with the fallback data
+                     self.graph.nodes[start_node_id]['data'] = node_data
+                     return
+
+         # Choose icon based on node type
+         if node_data.node_type == 'file':
+             node_symbol = "πŸ“„"
+         elif node_data.node_type == 'chunk':
+             node_symbol = "πŸ“"
+         elif node_data.node_type == 'root':
+             node_symbol = "πŸ“"
+         elif node_data.node_type == 'directory':
+             node_symbol = "πŸ“‚"
+         else:
+             node_symbol = "πŸ“¦"
+
+         if level == 0:
+             print(f"{node_symbol} {node_data.name} ({node_data.node_type})")
+         else:
+             print(f"{prefix}└── {node_symbol} {node_data.name} ({node_data.node_type})")
+
+         # Get children via 'contains' edges
+         children = [
+             child for child in self.graph.successors(start_node_id)
+             if self.graph.edges[start_node_id, child].get('relation') == 'contains'
+         ]
+
+         child_count = len(children)
+         for i, child_id in enumerate(children):
+             is_last = i == child_count - 1
+             new_prefix = prefix + ("    " if is_last else "β”‚   ")
+             self.print_tree(max_depth, start_node_id=child_id, level=level + 1, prefix=new_prefix)
+
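+     # Example output shape (illustrative):
+     #     πŸ“ root (root)
+     #     └── πŸ“‚ src (directory)
+     #         └── πŸ“„ main.py (file)
+     #             └── πŸ“ main.py_0 (chunk)
+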
+     def to_dict(self):
+         self.logger.info("Serializing graph to dictionary.")
+         graph_data = {
+             'nodes': [],
+             'edges': []
+         }
+
+         for node_id, node_attrs in tqdm.tqdm(self.graph.nodes(data=True), desc="Serializing nodes"):
+             if 'data' not in node_attrs:
+                 self.logger.warning(f"Node {node_id} has no 'data' attribute, skipping in serialization")
+                 continue
+
+             node = node_attrs['data']
+             node_dict = {
+                 'id': node.id or node_id,
+                 'class': node.__class__.__name__,
+                 'data': {
+                     'id': node.id or node_id,
+                     'name': node.name,
+                     'node_type': node.node_type,
+                     'description': getattr(node, 'description', ''),
+                     'declared_entities': list(getattr(node, 'declared_entities', [])),
+                     'called_entities': list(getattr(node, 'called_entities', [])),
+                 }
+             }
+
+             # FileNode-specific
+             if isinstance(node, FileNode):
+                 node_dict['data']['path'] = node.path
+                 node_dict['data']['content'] = node.content
+                 node_dict['data']['language'] = getattr(node, 'language', '')
+
+             # ChunkNode-specific
+             if isinstance(node, ChunkNode):
+                 node_dict['data']['order_in_file'] = getattr(node, 'order_in_file', 0)
+                 node_dict['data']['embedding'] = getattr(node, 'embedding', None)
+
+             # EntityNode-specific
+             if isinstance(node, EntityNode):
+                 node_dict['data']['entity_type'] = getattr(node, 'entity_type', '')
+                 node_dict['data']['declaring_chunk_ids'] = list(getattr(node, 'declaring_chunk_ids', []))
+                 node_dict['data']['calling_chunk_ids'] = list(getattr(node, 'calling_chunk_ids', []))
+                 node_dict['data']['aliases'] = list(getattr(node, 'aliases', []))
+
+             graph_data['nodes'].append(node_dict)
+
+         for u, v, attrs in tqdm.tqdm(self.graph.edges(data=True), desc="Serializing edges"):
+             edge_data = {
+                 'source': u,
+                 'target': v,
+                 'relation': attrs.get('relation', '')
+             }
+             if 'entities' in attrs:
+                 edge_data['entities'] = list(attrs['entities'])
+             graph_data['edges'].append(edge_data)
+
+         self.logger.info("Serialization complete.")
+         return graph_data
+
+     @classmethod
+     def from_dict(cls, data_dict, index_nodes: bool = True, use_embed: bool = True,
+                   model_service_kwargs: Optional[dict] = None, code_index_kwargs: Optional[dict] = None):
+         """
+         Reconstruct a RepoKnowledgeGraph from a dictionary produced by `to_dict()`.
+         """
+         if model_service_kwargs is None:
+             model_service_kwargs = {}
+         instance = cls.__new__(cls)  # bypass __init__
+         instance._initialize(model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)
+
+         instance.logger.info("Deserializing graph from dictionary.")
+
+         node_classes = {
+             'Node': Node,
+             'FileNode': FileNode,
+             'ChunkNode': ChunkNode,
+             'DirectoryNode': DirectoryNode,
+             'EntityNode': EntityNode,
+         }
+
+         # Create a root node if not present in the data
+         root_found = any(node_data['id'] == 'root' for node_data in data_dict['nodes'])
+         if not root_found:
+             instance.logger.warning("Root node not found in the data, creating one")
+             root_node = Node(id='root', name='root', node_type='root')
+             instance.graph.add_node('root', data=root_node, level=0)
+
+         # --- Rebuild nodes ---
+         for node_data in tqdm.tqdm(data_dict['nodes'], desc="Rebuilding nodes"):
+             cls_name = node_data['class']
+             node_cls = node_classes.get(cls_name, Node)
+             kwargs = node_data['data']
+
+             # Ensure ID is properly set
+             if not kwargs.get('id'):
+                 kwargs['id'] = node_data['id']
+
+             # Always use lists for declared_entities and called_entities
+             kwargs['declared_entities'] = list(kwargs.get('declared_entities', []))
+             kwargs['called_entities'] = list(kwargs.get('called_entities', []))
+
+             # FileNode/ChunkNode-specific
+             if node_cls in (FileNode, ChunkNode):
+                 kwargs.setdefault('path', '')
+                 kwargs.setdefault('content', '')
+                 kwargs.setdefault('language', '')
+             if node_cls == ChunkNode:
+                 kwargs.setdefault('order_in_file', 0)
+                 kwargs.setdefault('embedding', [])
+             # EntityNode-specific
+             if node_cls == EntityNode:
+                 kwargs.setdefault('entity_type', '')
+                 kwargs.setdefault('declaring_chunk_ids', [])
+                 kwargs.setdefault('calling_chunk_ids', [])
+                 kwargs.setdefault('aliases', [])
+
+             node_instance = node_cls(**kwargs)
+             instance.graph.add_node(node_data['id'], data=node_instance, level=instance._infer_level(node_instance))
+
+         # --- Rebuild edges ---
+         for edge in tqdm.tqdm(data_dict['edges'], desc="Rebuilding edges"):
+             source = edge['source']
+             target = edge['target']
+             if source in instance.graph and target in instance.graph:
+                 edge_kwargs = {'relation': edge.get('relation', '')}
+                 if 'entities' in edge:
+                     edge_kwargs['entities'] = list(edge['entities'])
+                 instance.graph.add_edge(source, target, **edge_kwargs)
+             else:
+                 instance.logger.warning(f"Cannot add edge {source} -> {target}, nodes don't exist")
+
+         # --- Rebuild instance.entities ---
+         instance.entities = {}
+         for node_id, node_attrs in tqdm.tqdm(instance.graph.nodes(data=True), desc="Rebuilding entities"):
+             node = node_attrs['data']
+             declared_entities = getattr(node, 'declared_entities', [])
+             called_entities = getattr(node, 'called_entities', [])
+             for entity in declared_entities:
+                 if isinstance(entity, dict):
+                     name = entity.get('name')
+                 else:
+                     name = entity
+                 if not name:
+                     continue
+                 if name not in instance.entities:
+                     instance.entities[name] = {
+                         "declaring_chunk_ids": [],
+                         "calling_chunk_ids": [],
+                         "type": [],
+                         "dtype": None
+                     }
+                 # Only add node_id if it is a ChunkNode
+                 if node_id not in instance.entities[name]["declaring_chunk_ids"]:
+                     if node_id in instance.graph and isinstance(instance.graph.nodes[node_id]["data"], ChunkNode):
+                         instance.entities[name]["declaring_chunk_ids"].append(node_id)
+                 if isinstance(entity, dict):
+                     entity_type = entity.get("type")
+                     if entity_type and entity_type not in instance.entities[name]["type"]:
+                         instance.entities[name]["type"].append(entity_type)
+                     dtype = entity.get("dtype")
+                     if dtype:
+                         instance.entities[name]["dtype"] = dtype
+             for called_name in called_entities:
+                 if not called_name:
+                     continue
+                 if called_name not in instance.entities:
+                     instance.entities[called_name] = {
+                         "declaring_chunk_ids": [],
+                         "calling_chunk_ids": [],
+                         "type": [],
+                         "dtype": None
+                     }
+                 if node_id not in instance.entities[called_name]["calling_chunk_ids"]:
+                     if node_id in instance.graph and isinstance(instance.graph.nodes[node_id]["data"], ChunkNode):
+                         instance.entities[called_name]["calling_chunk_ids"].append(node_id)
+
+         if index_nodes:
+             instance.logger.info("Building code index after deserialization.")
+             # Merge use_embed with code_index_kwargs, avoiding duplicate keyword arguments
+             code_idx_kwargs = code_index_kwargs or {}
+             if 'use_embed' not in code_idx_kwargs:
+                 code_idx_kwargs['use_embed'] = use_embed
+             instance.code_index = CodeIndex(list(instance), model_service=instance.model_service, **code_idx_kwargs)
+
+         instance.logger.info("Deserialization complete.")
+         return instance
+
+     def _infer_level(self, node):
+         """Infer the level of a node based on its type."""
+         if node.node_type == 'root':
+             return 0
+         elif node.node_type in ('file', 'directory'):
+             return 1
+         elif node.node_type == 'chunk':
+             return 2
+         elif isinstance(node, EntityNode):
+             return 3  # Entity nodes are added at level 3 in _build_relationships
+         return 1  # Default level
+
+     def save_graph_to_file(self, filepath: str):
+         self.logger.info(f"Saving graph to file: {filepath}")
+         with open(filepath, 'w') as f:
+             json.dump(self.to_dict(), f, indent=2)
+         self.logger.info("Graph saved successfully.")
+
+     @classmethod
+     def load_graph_from_file(cls, filepath: str, index_nodes=True, use_embed: bool = True,
+                              model_service_kwargs: Optional[dict] = None, code_index_kwargs: Optional[dict] = None):
+         if model_service_kwargs is None:
+             model_service_kwargs = {}
+         with open(filepath, 'r') as f:
+             data = json.load(f)
+         logging.getLogger(LOGGER_NAME).info(f"Loaded graph data from file: {filepath}")
+         return cls.from_dict(data, use_embed=use_embed, index_nodes=index_nodes,
+                              model_service_kwargs=model_service_kwargs, code_index_kwargs=code_index_kwargs)
+
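+     # Round-trip sketch (hedged; the filename is illustrative):
+     #
+     #     kg.save_graph_to_file('graph.json')
+     #     kg2 = RepoKnowledgeGraph.load_graph_from_file('graph.json', index_nodes=False)
+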
+     def to_hf_dataset(
+             self,
+             repo_id: str,
+             save_embeddings: bool = True,
+             private: bool = False,
+             token: Optional[str] = None,
+             commit_message: Optional[str] = None,
+     ):
+         """
+         Save the knowledge graph to a HuggingFace dataset on the Hub.
+
+         The graph is serialized into two configs:
+         - 'nodes': Contains all node data
+         - 'edges': Contains all edge relationships
+
+         Args:
+             repo_id (str): The HuggingFace dataset repository ID (e.g., 'username/dataset-name')
+             save_embeddings (bool): If True, saves embedding vectors for chunk nodes.
+                 If False, embeddings are excluded to reduce dataset size.
+             private (bool): Whether the dataset should be private. Defaults to False.
+             token (str, optional): HuggingFace API token. If not provided, uses the token
+                 from huggingface_hub login or the HF_TOKEN environment variable.
+             commit_message (str, optional): Custom commit message for the upload.
+
+         Returns:
+             str: URL of the uploaded dataset
+         """
+         try:
+             from datasets import Dataset, DatasetDict
+             from huggingface_hub import HfApi
+         except ImportError:
+             raise ImportError(
+                 "huggingface_hub and datasets are required for HuggingFace integration. "
+                 "Install them with: pip install huggingface_hub datasets"
+             )
+
+         self.logger.info(f"Preparing to save knowledge graph to HuggingFace dataset: {repo_id}")
+         self.logger.info(f"save_embeddings={save_embeddings}")
+
+         # Serialize nodes
+         nodes_data = []
+         for node_id, node_attrs in tqdm.tqdm(self.graph.nodes(data=True), desc="Serializing nodes for HF dataset"):
+             if 'data' not in node_attrs:
+                 self.logger.warning(f"Node {node_id} has no 'data' attribute, skipping")
+                 continue
+
+             node = node_attrs['data']
+             node_record = {
+                 'node_id': node.id or node_id,
+                 'node_class': node.__class__.__name__,
+                 'name': node.name,
+                 'node_type': node.node_type,
+                 'description': getattr(node, 'description', '') or '',
+                 'declared_entities': json.dumps(list(getattr(node, 'declared_entities', []))),
+                 'called_entities': json.dumps(list(getattr(node, 'called_entities', []))),
+             }
+
+             # FileNode-specific fields
+             if isinstance(node, FileNode):
+                 node_record['path'] = node.path
+                 node_record['content'] = node.content
+                 node_record['language'] = getattr(node, 'language', '')
+             else:
+                 node_record['path'] = ''
+                 node_record['content'] = ''
+                 node_record['language'] = ''
+
+             # ChunkNode-specific fields
+             if isinstance(node, ChunkNode):
+                 node_record['order_in_file'] = getattr(node, 'order_in_file', 0)
+                 if save_embeddings:
+                     embedding = getattr(node, 'embedding', None)
+                     node_record['embedding'] = json.dumps(embedding if embedding is not None else [])
+                 else:
+                     node_record['embedding'] = json.dumps([])
+             else:
+                 node_record['order_in_file'] = -1
+                 node_record['embedding'] = json.dumps([])
+
+             # EntityNode-specific fields
+             if isinstance(node, EntityNode):
+                 node_record['entity_type'] = getattr(node, 'entity_type', '')
+                 node_record['declaring_chunk_ids'] = json.dumps(list(getattr(node, 'declaring_chunk_ids', [])))
+                 node_record['calling_chunk_ids'] = json.dumps(list(getattr(node, 'calling_chunk_ids', [])))
+                 node_record['aliases'] = json.dumps(list(getattr(node, 'aliases', [])))
+             else:
+                 node_record['entity_type'] = ''
+                 node_record['declaring_chunk_ids'] = json.dumps([])
+                 node_record['calling_chunk_ids'] = json.dumps([])
+                 node_record['aliases'] = json.dumps([])
+
+             nodes_data.append(node_record)
+
+         # Serialize edges
+         edges_data = []
+         for source, target, attrs in tqdm.tqdm(self.graph.edges(data=True), desc="Serializing edges for HF dataset"):
+             edge_record = {
+                 'source': source,
+                 'target': target,
+                 'relation': attrs.get('relation', ''),
+                 'entities': json.dumps(list(attrs.get('entities', []))) if 'entities' in attrs else json.dumps([])
+             }
+             edges_data.append(edge_record)
+
+         # Create datasets
+         nodes_dataset = Dataset.from_list(nodes_data)
+         edges_dataset = Dataset.from_list(edges_data)
+
+         self.logger.info(f"Created dataset with {len(nodes_data)} nodes and {len(edges_data)} edges")
+
+         # Push to Hub - nodes and edges are pushed separately as different configs
+         # because they have different schemas
+         if commit_message is None:
+             base_commit_message = f"Upload knowledge graph ({len(nodes_data)} nodes, {len(edges_data)} edges)"
+             if not save_embeddings:
+                 base_commit_message += " [embeddings excluded]"
+         else:
+             base_commit_message = commit_message
+
+         self.logger.info(f"Pushing nodes dataset to HuggingFace Hub: {repo_id}")
+         nodes_dataset.push_to_hub(
+             repo_id=repo_id,
+             config_name="nodes",
+             private=private,
+             token=token,
+             commit_message=f"{base_commit_message} - nodes"
+         )
+
+         self.logger.info(f"Pushing edges dataset to HuggingFace Hub: {repo_id}")
+         edges_dataset.push_to_hub(
+             repo_id=repo_id,
+             config_name="edges",
+             private=private,
+             token=token,
+             commit_message=f"{base_commit_message} - edges"
+         )
+
+         url = f"https://huggingface.co/datasets/{repo_id}"
+         self.logger.info(f"Dataset successfully uploaded to: {url}")
+         return url
+
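+     # Usage sketch (hedged; the repo id is a placeholder):
+     #
+     #     url = kg.to_hf_dataset('username/my-repo-graph', save_embeddings=False,
+     #                            private=True)
+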
+     @classmethod
+     def from_hf_dataset(
+             cls,
+             repo_id: str,
+             index_nodes: bool = True,
+             use_embed: bool = True,
+             model_service_kwargs: Optional[dict] = None,
+             code_index_kwargs: Optional[dict] = None,
+             token: Optional[str] = None,
+             revision: Optional[str] = None,
+     ):
+         """
+         Load a knowledge graph from a HuggingFace dataset on the Hub.
+
+         Args:
+             repo_id (str): The HuggingFace dataset repository ID (e.g., 'username/dataset-name')
+             index_nodes (bool): Whether to build a code index after loading. Defaults to True.
+             use_embed (bool): Whether to use existing embeddings from the dataset. Defaults to True.
+             model_service_kwargs (dict, optional): Arguments for the model service.
+             code_index_kwargs (dict, optional): Arguments for the code index.
+             token (str, optional): HuggingFace API token for private datasets.
+             revision (str, optional): Git revision (branch, tag, or commit) to load from.
+
+         Returns:
+             RepoKnowledgeGraph: The loaded knowledge graph instance.
+         """
+         try:
+             from datasets import load_dataset
+         except ImportError:
+             raise ImportError(
+                 "The datasets library is required for HuggingFace integration. "
+                 "Install it with: pip install datasets"
+             )
+
+         if model_service_kwargs is None:
+             model_service_kwargs = {}
+
+         logger = logging.getLogger(LOGGER_NAME)
+         logger.info(f"Loading knowledge graph from HuggingFace dataset: {repo_id}")
+
+         # Load dataset from Hub - nodes and edges are stored as separate configs
+         logger.info("Loading nodes config...")
+         nodes_dataset = load_dataset(repo_id, name="nodes", token=token, revision=revision)
+         logger.info("Loading edges config...")
+         edges_dataset = load_dataset(repo_id, name="edges", token=token, revision=revision)
+
+         # Get the train split (default split when pushing with config_name)
+         nodes_data = nodes_dataset['train']
+         edges_data = edges_dataset['train']
+
+         logger.info(f"Loaded {len(nodes_data)} nodes and {len(edges_data)} edges from dataset")
+
+         # Convert to the dict format expected by from_dict
+         graph_data = {
+             'nodes': [],
+             'edges': []
+         }
+
+         # Reconstruct nodes
+         for record in tqdm.tqdm(nodes_data, desc="Reconstructing nodes from HF dataset"):
+             node_dict = {
+                 'id': record['node_id'],
+                 'class': record['node_class'],
+                 'data': {
+                     'id': record['node_id'],
+                     'name': record['name'],
+                     'node_type': record['node_type'],
+                     'description': record['description'],
+                     'declared_entities': json.loads(record['declared_entities']),
+                     'called_entities': json.loads(record['called_entities']),
+                 }
+             }
+
+             # FileNode/ChunkNode-specific fields
+             if record['node_class'] in ('FileNode', 'ChunkNode'):
+                 node_dict['data']['path'] = record['path']
+                 node_dict['data']['content'] = record['content']
+                 node_dict['data']['language'] = record['language']
+
+             # ChunkNode-specific fields
+             if record['node_class'] == 'ChunkNode':
+                 node_dict['data']['order_in_file'] = record['order_in_file']
+                 embedding = json.loads(record['embedding'])
+                 # Only use the embedding if use_embed is True and the embedding is non-empty
+                 if use_embed and embedding:
+                     node_dict['data']['embedding'] = embedding
+                 else:
+                     node_dict['data']['embedding'] = []
+
+             # EntityNode-specific fields
+             if record['node_class'] == 'EntityNode':
+                 node_dict['data']['entity_type'] = record['entity_type']
+                 node_dict['data']['declaring_chunk_ids'] = json.loads(record['declaring_chunk_ids'])
+                 node_dict['data']['calling_chunk_ids'] = json.loads(record['calling_chunk_ids'])
+                 node_dict['data']['aliases'] = json.loads(record['aliases'])
+
+             graph_data['nodes'].append(node_dict)
+
+         # Reconstruct edges
+         for record in tqdm.tqdm(edges_data, desc="Reconstructing edges from HF dataset"):
+             edge_dict = {
+                 'source': record['source'],
+                 'target': record['target'],
+                 'relation': record['relation'],
+             }
+             entities = json.loads(record['entities'])
+             if entities:
+                 edge_dict['entities'] = entities
+
+             graph_data['edges'].append(edge_dict)
+
+         logger.info("Dataset reconstruction complete, building graph...")
+
+         # Use from_dict to build the graph
+         return cls.from_dict(
+             graph_data,
+             index_nodes=index_nodes,
+             use_embed=use_embed,
+             model_service_kwargs=model_service_kwargs,
+             code_index_kwargs=code_index_kwargs
+         )
+
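+     # Usage sketch (hedged; the repo id is a placeholder):
+     #
+     #     kg = RepoKnowledgeGraph.from_hf_dataset('username/my-repo-graph',
+     #                                             index_nodes=False)
+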
+     def get_neighbors(self, node_id):
+         self.logger.debug(f"Getting neighbors for node: {node_id}")
+         # Return all nodes directly connected to node_id, for any edge type.
+         # In a DiGraph, successors and predecessors together cover every incident edge.
+         neighbors = set(self.graph.successors(node_id)) | set(self.graph.predecessors(node_id))
+         return [self.graph.nodes[n]['data'] for n in neighbors if 'data' in self.graph.nodes[n]]
+
+     def get_previous_chunk(self, node_id: str) -> Optional[ChunkNode]:
+         self.logger.debug(f"Getting previous chunk for node: {node_id}")
+         node = self[node_id]
+         # Only ChunkNodes have an order within a file
+         if not isinstance(node, ChunkNode):
+             raise Exception(f'Cannot get previous chunk on node of type {type(node)}')
+
+         if node.order_in_file == 0:
+             self.logger.warning(f'Cannot get previous chunk for first chunk {node_id}')
+             return None
+
+         file_path = node.path
+         previous_chunk_id = f'{file_path}_{node.order_in_file - 1}'
+
+         if previous_chunk_id not in self.graph:
+             raise Exception(f'Previous chunk {previous_chunk_id} not found in graph')
+
+         return self[previous_chunk_id]
+
+     def get_next_chunk(self, node_id: str) -> Optional[ChunkNode]:
+         self.logger.debug(f"Getting next chunk for node: {node_id}")
+         node = self[node_id]
+         # Only ChunkNodes have an order within a file
+         if not isinstance(node, ChunkNode):
+             raise Exception(f'Cannot get next chunk on node of type {type(node)}')
+
+         file_path = node.path
+         next_chunk_id = f'{file_path}_{node.order_in_file + 1}'
+
+         if next_chunk_id not in self.graph:
+             self.logger.warning(f'Next chunk {next_chunk_id} not found in graph, it might be the last chunk')
+             return None
+         return self[next_chunk_id]
+
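+     # Navigation sketch (hedged; the chunk ids are hypothetical):
+     #
+     #     nxt = kg.get_next_chunk('src/main.py_0')       # ChunkNode, or None at end of file
+     #     prev = kg.get_previous_chunk('src/main.py_1')  # ChunkNode, or None for the first chunk
+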
+     def get_all_chunks(self) -> List[ChunkNode]:
+         self.logger.debug("Getting all chunk nodes.")
+         chunk_nodes = []
+         for node in self:
+             if isinstance(node, ChunkNode):
+                 chunk_nodes.append(node)
+         return chunk_nodes
+
+     def get_all_files(self) -> List[FileNode]:
+         """
+         Get all FileNodes in the knowledge graph.
+
+         Returns:
+             List[FileNode]: A list of FileNodes in the graph.
+         """
+         self.logger.debug("Getting all file nodes.")
+         file_nodes = []
+         for _, node_attrs in self.graph.nodes(data=True):
+             node_data = node_attrs.get('data')
+             # Exclude ChunkNodes (which inherit from FileNode) by also checking node_type
+             if isinstance(node_data, FileNode) and node_data.node_type == 'file':
+                 file_nodes.append(node_data)
+         return file_nodes
+
+     def get_chunks_of_file(self, file_node_id: str) -> List[ChunkNode]:
+         """
+         Get all ChunkNodes associated with a specific FileNode.
+
+         Args:
+             file_node_id (str): The ID of the file node to get chunks for.
+
+         Returns:
+             List[ChunkNode]: A list of ChunkNodes associated with the file.
+         """
+         self.logger.debug(f"Getting chunks for file node: {file_node_id}")
+         chunk_nodes = []
+         for node in self.graph.neighbors(file_node_id):
+             # Only include ChunkNodes that are connected by a 'contains' edge
+             edge_data = self.graph.get_edge_data(file_node_id, node)
+             node_data = self.graph.nodes[node]['data']
+             if (
+                     isinstance(node_data, ChunkNode)
+                     and node_data.node_type == 'chunk'
+                     and edge_data is not None
+                     and edge_data.get('relation') == 'contains'
+             ):
+                 chunk_nodes.append(node_data)
+         return chunk_nodes
+
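+     # Usage sketch (hedged; the file id is hypothetical):
+     #
+     #     chunks = kg.get_chunks_of_file('src/main.py')
+     #     chunks.sort(key=lambda c: c.order_in_file)
+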
+     def find_path(self, source_id: str, target_id: str, max_depth: int = 5) -> dict:
+         """
+         Find the shortest path between two nodes in the knowledge graph.
+
+         Args:
+             source_id (str): The ID of the source node.
+             target_id (str): The ID of the target node.
+             max_depth (int): Maximum depth to search for a path. Defaults to 5.
+
+         Returns:
+             dict: A dictionary containing path information or an error message.
+         """
+         self.logger.debug(f"Finding path from {source_id} to {target_id} with max_depth={max_depth}")
+         g = self.graph
+
+         if source_id not in g:
+             return {"error": f"Source node '{source_id}' not found."}
+         if target_id not in g:
+             return {"error": f"Target node '{target_id}' not found."}
+
+         try:
+             path = nx.shortest_path(g, source=source_id, target=target_id)
+
+             if len(path) - 1 > max_depth:
+                 return {
+                     "source_id": source_id,
+                     "target_id": target_id,
+                     "path": [],
+                     "length": len(path) - 1,
+                     "text": f"Path exists but exceeds max_depth of {max_depth} (actual length: {len(path) - 1})"
+                 }
+
+             # Build detailed path information
+             path_details = []
+             for i, node_id in enumerate(path):
+                 node = g.nodes[node_id]['data']
+                 node_info = {
+                     "node_id": node_id,
+                     "name": getattr(node, 'name', 'Unknown'),
+                     "type": getattr(node, 'node_type', 'Unknown'),
+                     "step": i
+                 }
+
+                 # Add edge information for all but the last node
+                 if i < len(path) - 1:
+                     next_node_id = path[i + 1]
+                     edge_data = g.get_edge_data(node_id, next_node_id)
+                     node_info["edge_to_next"] = edge_data.get('relation', 'Unknown') if edge_data else 'Unknown'
+
+                 path_details.append(node_info)
+
+             # Format text output
+             text = f"Path from '{source_id}' to '{target_id}' (length: {len(path) - 1}):\n\n"
+             for i, node_info in enumerate(path_details):
+                 text += f"{i}. {node_info['name']} ({node_info['type']})\n"
+                 text += f"   Node ID: {node_info['node_id']}\n"
+                 if 'edge_to_next' in node_info:
+                     text += f"   --[{node_info['edge_to_next']}]--> \n"
+
+             return {
+                 "source_id": source_id,
+                 "target_id": target_id,
+                 "path": path_details,
+                 "length": len(path) - 1,
+                 "text": text
+             }
+
+         except nx.NetworkXNoPath:
+             return {
+                 "source_id": source_id,
+                 "target_id": target_id,
+                 "path": [],
+                 "length": -1,
+                 "text": f"No path found between '{source_id}' and '{target_id}'"
+             }
+         except Exception as e:
+             self.logger.error(f"Error finding path: {str(e)}")
+             return {"error": f"Error finding path: {str(e)}"}
+
+ def get_subgraph(self, node_id: str, depth: int = 2, edge_types: Optional[List[str]] = None) -> dict:
1497
+ """
1498
+ Extract a subgraph around a node up to a specified depth.
1499
+
1500
+ Args:
1501
+ node_id (str): The ID of the central node.
1502
+ depth (int): The depth/radius of the subgraph to extract. Defaults to 2.
1503
+ edge_types (Optional[List[str]]): Optional list of edge types to include (e.g., ['calls', 'contains']).
1504
+
1505
+ Returns:
1506
+ dict: A dictionary containing subgraph information or error message.
1507
+ """
1508
+ self.logger.debug(f"Getting subgraph for node {node_id} with depth={depth}, edge_types={edge_types}")
1509
+ g = self.graph
1510
+
1511
+ if node_id not in g:
1512
+ return {"error": f"Node '{node_id}' not found."}
1513
+
1514
+ # Collect nodes within specified depth
1515
+ nodes_at_depth = {node_id}
1516
+ all_nodes = {node_id}
1517
+
1518
+ for d in range(depth):
1519
+ next_level = set()
1520
+ for n in nodes_at_depth:
1521
+ # Get all neighbors (both incoming and outgoing)
1522
+ for neighbor in g.successors(n):
1523
+ if edge_types is None:
1524
+ next_level.add(neighbor)
1525
+ else:
1526
+ edge_data = g.get_edge_data(n, neighbor)
1527
+ if edge_data and edge_data.get('relation') in edge_types:
1528
+ next_level.add(neighbor)
1529
+
1530
+ for neighbor in g.predecessors(n):
1531
+ if edge_types is None:
1532
+ next_level.add(neighbor)
1533
+ else:
1534
+ edge_data = g.get_edge_data(neighbor, n)
1535
+ if edge_data and edge_data.get('relation') in edge_types:
1536
+ next_level.add(neighbor)
1537
+
1538
+ nodes_at_depth = next_level - all_nodes
1539
+ all_nodes.update(next_level)
1540
+
1541
+ # Extract subgraph
1542
+ subgraph = g.subgraph(all_nodes).copy()
1543
+
1544
+ # Build node details
1545
+ nodes = []
1546
+ for n in subgraph.nodes():
1547
+ node = subgraph.nodes[n]['data']
1548
+ nodes.append({
1549
+ "node_id": n,
1550
+ "name": getattr(node, 'name', 'Unknown'),
1551
+ "type": getattr(node, 'node_type', 'Unknown')
1552
+ })
1553
+
1554
+ # Build edge details
1555
+ edges = []
1556
+ for source, target, data in subgraph.edges(data=True):
1557
+ edges.append({
1558
+ "source": source,
1559
+ "target": target,
1560
+ "relation": data.get('relation', 'Unknown')
1561
+ })
1562
+
1563
+ # Format text output
1564
+ text = f"Subgraph around '{node_id}' (depth: {depth}):\n"
1565
+ if edge_types:
1566
+ text += f"Edge types filter: {', '.join(edge_types)}\n"
1567
+ text += f"\nNodes: {len(nodes)}\n"
1568
+ text += f"Edges: {len(edges)}\n\n"
1569
+
1570
+ # Group nodes by type
1571
+ nodes_by_type = {}
1572
+ for node in nodes:
1573
+ node_type = node['type']
1574
+ if node_type not in nodes_by_type:
1575
+ nodes_by_type[node_type] = []
1576
+ nodes_by_type[node_type].append(node)
1577
+
1578
+ for node_type, type_nodes in nodes_by_type.items():
1579
+ text += f"{node_type} ({len(type_nodes)}):\n"
1580
+ for node in type_nodes[:5]:
1581
+ text += f" - {node['name']} ({node['node_id']})\n"
1582
+ if len(type_nodes) > 5:
1583
+ text += f" ... and {len(type_nodes) - 5} more\n"
1584
+ text += "\n"
1585
+
1586
+ # Show edge statistics
1587
+ edge_by_relation = {}
1588
+ for edge in edges:
1589
+ relation = edge['relation']
1590
+ edge_by_relation[relation] = edge_by_relation.get(relation, 0) + 1
1591
+
1592
+ if edge_by_relation:
1593
+ text += "Edge types:\n"
1594
+ for relation, count in edge_by_relation.items():
1595
+ text += f" - {relation}: {count}\n"
1596
+
1597
+ return {
1598
+ "center_node_id": node_id,
1599
+ "depth": depth,
1600
+ "edge_types_filter": edge_types,
1601
+ "node_count": len(nodes),
1602
+ "edge_count": len(edges),
1603
+ "nodes": nodes,
1604
+ "edges": edges,
1605
+ "nodes_by_type": nodes_by_type,
1606
+ "edge_by_relation": edge_by_relation,
1607
+ "text": text
1608
+ }
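A minimal usage sketch for the traversal methods above. The `RepoKnowledgeGraph` class name, its constructor, and the node ids are assumptions for illustration (chunk ids follow the `{file_path}_{order}` scheme used in the code above); only `find_path` and `get_subgraph` mirror the methods defined in this file.

    # Hypothetical sketch: class name, constructor, and node ids are assumed
    kg = RepoKnowledgeGraph('./my_repo')

    # Shortest path between two nodes, capped at 4 hops
    result = kg.find_path('src/app.py', 'src/utils.py_3', max_depth=4)
    if 'error' not in result:
        print(result['text'])  # human-readable hop-by-hop path

    # 2-hop neighborhood around a node, restricted to 'contains' edges
    sub = kg.get_subgraph('src/app.py', depth=2, edge_types=['contains'])
    print(sub['node_count'], sub['edge_count'])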
RepoKnowledgeGraphLib/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """
+ RepoKnowledgeGraphLib - Knowledge Graph Library for Code Repositories
+
+ This library provides tools for creating and querying knowledge graphs from code repositories.
+ """
RepoKnowledgeGraphLib/utils/__init__.py ADDED
File without changes
RepoKnowledgeGraphLib/utils/chunk_utils.py ADDED
@@ -0,0 +1,88 @@
+ from ..Node import ChunkNode
+ from typing import List, Dict
+
+ def dict_to_chunknode(d: dict) -> ChunkNode:
+     """
+     Converts a dictionary to a ChunkNode instance.
+     """
+     return ChunkNode(**d)
+
+ def extract_filename_from_chunk(chunk: ChunkNode) -> str:
+     """
+     Extracts the file name from a chunk.
+
+     Args:
+         chunk (ChunkNode): The chunk from which to extract the file name (a dict is converted first).
+
+     Returns:
+         str: The extracted file name.
+     """
+     if isinstance(chunk, dict):
+         chunk = dict_to_chunknode(chunk)
+     return '_'.join(chunk.id.split('_')[:-1])
+
+
+ def order_chunks_by_order_in_file(chunks: List[ChunkNode]) -> list:
+     """
+     Orders a list of chunks by their order in the file.
+
+     Args:
+         chunks (list): The list of chunks to order.
+
+     Returns:
+         list: The ordered list of chunks.
+     """
+     # Convert dicts to ChunkNode if needed
+     chunks = [dict_to_chunknode(c) if isinstance(c, dict) else c for c in chunks]
+     return sorted(chunks, key=lambda x: int(x.order_in_file))
+
+ def organize_chunks_by_file_name(chunks: List[ChunkNode]) -> Dict[str, List[ChunkNode]]:
+     """
+     Organizes a list of chunks by their file names.
+
+     Args:
+         chunks (list): The list of chunks to organize.
+
+     Returns:
+         dict: A dictionary mapping file names to lists of chunks, ordered by position in the file.
+     """
+     # Convert dicts to ChunkNode if needed
+     chunks = [dict_to_chunknode(c) if isinstance(c, dict) else c for c in chunks]
+     organized_chunks = {}
+     for chunk in chunks:
+         file_name = extract_filename_from_chunk(chunk)
+         if file_name not in organized_chunks:
+             organized_chunks[file_name] = []
+         organized_chunks[file_name].append(chunk)
+     for file_name in organized_chunks:
+         organized_chunks[file_name] = order_chunks_by_order_in_file(organized_chunks[file_name])
+     return organized_chunks
+
+ def join_organized_chunks(organized_chunks: Dict[str, List[ChunkNode]]) -> str:
+     """
+     Joins organized chunks into a single string, inserting "[...]" markers where chunks are missing.
+
+     Args:
+         organized_chunks (dict): The dictionary of organized chunks.
+
+     Returns:
+         str: The joined string of organized chunks.
+     """
+     joined_chunks_list = []
+     separator = "=" * 48
+     for filename in organized_chunks:
+         # Convert dicts to ChunkNode if needed; skip empty files before emitting their header
+         chunks = [dict_to_chunknode(c) if isinstance(c, dict) else c for c in organized_chunks[filename]]
+         if len(chunks) == 0:
+             continue
+         joined_chunks_list.append(separator)
+         joined_chunks_list.append(f"File: {filename}")
+         joined_chunks_list.append(separator)
+         if int(chunks[0].order_in_file) > 0:
+             joined_chunks_list.append("\n[...]")
+         for i, chunk in enumerate(chunks):
+             joined_chunks_list.append(chunk.content)
+             if i < len(chunks) - 1:
+                 if int(chunks[i+1].order_in_file) - int(chunk.order_in_file) > 1:
+                     joined_chunks_list.append("\n[...]")
+     return "\n".join(joined_chunks_list)
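A short sketch of the chunk helpers above. It assumes `ChunkNode` accepts at least `id`, `order_in_file`, and `content` as keyword arguments; the real constructor lives in RepoKnowledgeGraphLib/Node.py, so treat these fields as illustrative.

    chunks = [
        {'id': 'src/app.py_1', 'order_in_file': 1, 'content': 'def main(): ...'},
        {'id': 'src/app.py_0', 'order_in_file': 0, 'content': 'import os'},
    ]
    organized = organize_chunks_by_file_name(chunks)  # {'src/app.py': [chunk 0, chunk 1]}
    print(join_organized_chunks(organized))
    # ================================================
    # File: src/app.py
    # ================================================
    # import os
    # def main(): ...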
RepoKnowledgeGraphLib/utils/data_utils.py ADDED
@@ -0,0 +1,18 @@
+
+ def flatten_list(my_list: list) -> list:
+     """
+     Args:
+         my_list: list composed of lists (of lists of lists...)
+
+     Returns: flattened list
+     """
+     flattened_list = []
+     for item in my_list:
+         if isinstance(item, list) and len(item) > 0:
+             flattened_list += flatten_list(item)
+         elif not isinstance(item, list):
+             flattened_list.append(item)
+
+     return flattened_list
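Note that empty sublists fall through both branches above and are silently dropped. For example:

    flatten_list([1, [2, [3, 4]], [], 'a'])  # -> [1, 2, 3, 4, 'a']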
RepoKnowledgeGraphLib/utils/logger_utils.py ADDED
@@ -0,0 +1,74 @@
+ import logging
+ import os
+ import sys
+ import atexit
+ from typing import Optional
+
+ # Global registry to track initialized loggers
+ _initialized_loggers = set()
+
+ # Get log level from environment variable (default to INFO for visibility in docker logs)
+ DEFAULT_LOG_LEVEL = os.getenv('LOG_LEVEL', 'INFO').upper()
+ LOG_TO_FILE = os.getenv('LOG_TO_FILE', 'false').lower() == 'true'
+
+ def setup_logger(logger_name: str, log_file: str = '',
+                  level: Optional[int] = None) -> None:
+     """
+     :param logger_name: name to give to the logger
+     :param log_file: file to save the log to
+     :param level: base importance level to set the logger to (defaults to the LOG_LEVEL env var)
+     :return: *None*
+     """
+     # Skip loggers that have already been set up
+     if logger_name in _initialized_loggers:
+         return
+
+     log = logging.getLogger(logger_name)
+
+     # Determine log level from parameter, env var, or default
+     if level is None:
+         level = getattr(logging, DEFAULT_LOG_LEVEL, logging.INFO)
+
+     formatter = logging.Formatter(
+         fmt="%(name)s - %(levelname)s: %(asctime)-15s %(message)s")
+
+     # Always add a stream handler for stdout (docker logs visibility)
+     stream_handler = logging.StreamHandler(sys.stdout)
+     stream_handler.setFormatter(formatter)
+     stream_handler.setLevel(level)
+
+     log.setLevel(level)
+     if not log.hasHandlers():
+         log.addHandler(stream_handler)
+
+     # Optionally add a file handler if LOG_TO_FILE is enabled
+     if LOG_TO_FILE:
+         os.makedirs('logs', exist_ok=True)
+         if log_file == '':
+             log_file = f"{logger_name}.log"
+         log_file_path = os.path.join('logs', log_file)
+         file_handler = logging.FileHandler(log_file_path, mode='w')
+         file_handler.setFormatter(formatter)
+         file_handler.setLevel(level)
+         log.addHandler(file_handler)
+
+     # Prevent log propagation to avoid duplicate logs
+     log.propagate = False
+
+     # Mark this logger as initialized
+     _initialized_loggers.add(logger_name)
+
+     # Register a cleanup function to close handlers on exit
+     atexit.register(_cleanup_logger, logger_name)
+
+ def _cleanup_logger(logger_name: str) -> None:
+     """
+     Clean up logger handlers on program exit.
+
+     :param logger_name: name of the logger to clean up
+     """
+     log = logging.getLogger(logger_name)
+     handlers = log.handlers[:]
+     for handler in handlers:
+         handler.close()
+         log.removeHandler(handler)
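Typical usage of the logger setup above; the logger name is arbitrary, and the environment settings mentioned are the ones read by this module:

    import logging
    from RepoKnowledgeGraphLib.utils.logger_utils import setup_logger

    setup_logger('MY_LOGGER')  # stdout handler; level taken from LOG_LEVEL
    logging.getLogger('MY_LOGGER').info('knowledge graph build started')
    # With LOG_TO_FILE=true in the environment, the same call also writes to logs/MY_LOGGER.log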
RepoKnowledgeGraphLib/utils/parsing_utils.py ADDED
@@ -0,0 +1,65 @@
+ import os
+ import re
+ from typing import Optional
+
+ def read_directory_files_recursively(directory_path: str, skip_dirs: list, skip_pattern: Optional[str] = None) -> dict:
+     """
+     Recursively reads all files in a directory and its subdirectories.
+     Skips files and directories that match the given regex pattern or are in skip_dirs.
+
+     Args:
+         directory_path (str): The path to start reading files from.
+         skip_dirs (list): List of directory names to skip.
+         skip_pattern (str, optional): Regex pattern for files/directories to skip.
+
+     Returns:
+         dict: A dictionary where keys are relative file paths and values are file contents.
+     """
+     file_contents = {}
+     compiled_pattern = re.compile(skip_pattern) if skip_pattern else None
+
+     for root, dirs, files in os.walk(directory_path):
+         # Skip directories listed in skip_dirs or matching the skip pattern
+         dirs[:] = [d for d in dirs if d not in skip_dirs and not (compiled_pattern and compiled_pattern.search(os.path.join(root, d)))]
+
+         for file in files:
+             full_path = os.path.join(root, file)
+             relative_path = os.path.relpath(full_path, directory_path)
+
+             # Skip matching files
+             if compiled_pattern and compiled_pattern.search(relative_path):
+                 continue
+
+             try:
+                 with open(full_path, 'r', encoding='utf-8') as f:
+                     file_contents[relative_path] = f.read()
+             except (UnicodeDecodeError, OSError) as e:
+                 # Binary or unreadable files are skipped rather than stored
+                 print(f'Failed to read {relative_path}: {e}')
+                 continue
+
+     return file_contents
+
+
+ def get_language_from_filename(file_name: str) -> str:
+     file_extension = file_name.split('.')[-1]
+     extension_mapping = {
+         'c': 'c',
+         'h': 'c',
+         'cpp': 'c++',
+         'cc': 'c++',
+         'cxx': 'c++',
+         'hpp': 'c++',
+         'hh': 'c++',
+         'hxx': 'c++',
+         'go': 'go',
+         'java': 'java',
+         'py': 'python',
+         'pyc': 'python',
+         'pyw': 'python',
+         'js': 'javascript',
+         'mjs': 'javascript',
+         'cjs': 'javascript',
+     }
+     # Falls back to the raw extension when the language is not in the mapping
+     return extension_mapping.get(file_extension, file_extension)
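A minimal sketch of the directory reader and language lookup above; the repository path and skip values are hypothetical:

    files = read_directory_files_recursively(
        './my_repo',
        skip_dirs=['.git', 'node_modules'],
        skip_pattern=r'\.(png|jpg|lock)$',
    )
    for rel_path in files:
        print(rel_path, get_language_from_filename(rel_path))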
RepoKnowledgeGraphLib/utils/path_utils.py ADDED
@@ -0,0 +1,308 @@
+ import os
+ import tempfile
+ import shutil
+ import zipfile
+ import tarfile
+ from pathlib import Path
+ from typing import Dict, List, Optional
+
+
+ def _extract_zip(path: Path) -> str:
+     temp_dir = tempfile.mkdtemp()
+     with zipfile.ZipFile(path, 'r') as zip_ref:
+         zip_ref.extractall(temp_dir)
+     return temp_dir
+
+
+ def _extract_tgz(path: Path) -> str:
+     temp_dir = tempfile.mkdtemp()
+     with tarfile.open(path, 'r:gz') as tar_ref:
+         tar_ref.extractall(temp_dir)
+     return temp_dir
+
+
+ def prepare_input_path(path: str) -> str:
+     """Handles different input types: directories, files, zip or tgz archives."""
+     path_obj = Path(path)
+     if path_obj.is_dir():
+         return str(path_obj)
+
+     if path_obj.suffix == '.zip':
+         return _extract_zip(path_obj)
+     # Path.suffix only holds the final extension ('.gz' for 'x.tar.gz'),
+     # so '.tar.gz' archives must be detected from the full file name
+     elif path_obj.suffix == '.tgz' or path_obj.name.endswith('.tar.gz'):
+         return _extract_tgz(path_obj)
+     elif path_obj.is_file():
+         # Copy single file to a temporary directory
+         temp_dir = tempfile.mkdtemp()
+         shutil.copy(path_obj, temp_dir)
+         return temp_dir
+     else:
+         raise ValueError(f"Unsupported path type or extension: {path}")
+
+
+ def file_path_to_module_path(file_path: str) -> str:
+     """
+     Convert a file path to a module path by replacing path separators with dots
+     and removing the file extension.
+
+     Examples:
+         path/to/repo/python_script.py -> path.to.repo.python_script
+         src/utils/helper.py -> src.utils.helper
+         module.py -> module
+
+     Args:
+         file_path: File path string
+
+     Returns:
+         Module path with dots instead of slashes
+     """
+     # Normalize path separators
+     normalized = file_path.replace('\\', '/').replace(os.sep, '/')
+
+     # Remove file extension
+     without_ext = os.path.splitext(normalized)[0]
+
+     # Replace / with .
+     module_path = without_ext.replace('/', '.')
+
+     return module_path
+
+
+ def generate_entity_aliases(entity_name: str, file_path: str) -> list:
+     """
+     Generate all possible aliases for an entity based on its name and file path.
+
+     For example, if a file 'path/to/repo/python_script.py' defines 'Class_1',
+     the aliases would be:
+     - Class_1 (simple name)
+     - path.to.repo.python_script.Class_1 (fully qualified from file path)
+
+     For C++ namespaced entities like 'math::Calculator':
+     - math::Calculator (fully qualified name)
+     - Calculator (unqualified name, for use with 'using namespace')
+     - math.calculator.math::Calculator (module-based fully qualified)
+
+     For temporary paths like '.tmp.tmptqky4yk4..pyinstaller.run_astropy_tests.pos':
+     - pos (simple name)
+     - .run_astropy_tests.pos (progressive path removal)
+     - pyinstaller.run_astropy_tests.pos (further removal)
+     - .tmp.tmptqky4yk4..pyinstaller.run_astropy_tests.pos (full path)
+
+     Args:
+         entity_name: The name of the entity (e.g., 'Class_1', 'my_function', 'math::Calculator')
+         file_path: The file path where the entity is defined
+
+     Returns:
+         List of alias strings
+     """
+     aliases = []
+
+     # Always include the simple entity name
+     aliases.append(entity_name)
+
+     # For C++/C-style namespaced entities (using ::), add the unqualified name
+     if '::' in entity_name:
+         # Extract the unqualified name (last part after ::)
+         unqualified_name = entity_name.split('::')[-1]
+         if unqualified_name != entity_name:
+             aliases.append(unqualified_name)
+
+     # Generate module-based alias
+     module_path = file_path_to_module_path(file_path)
+
+     # If entity_name already contains scope separators (., ::),
+     # it might be a nested entity (e.g., 'MyClass.my_method').
+     # In this case, add the module path before the entire qualified name
+     fully_qualified = f"{module_path}.{entity_name}"
+
+     # Generate progressive path aliases by removing temporary/noise components.
+     # Split the module path into components
+     components = module_path.split('.')
+
+     # Filter out components that look like temporary directories or UUIDs
+     def is_temp_component(component: str) -> bool:
+         """Check if a path component looks like a temporary directory."""
+         if not component:
+             return True
+         # Check for common temp directory patterns
+         if component.startswith('tmp') and len(component) > 3:
+             return True
+         if component.startswith('.tmp'):
+             return True
+         # Check for UUID-like patterns (long alphanumeric strings)
+         if len(component) > 8 and component.replace('_', '').replace('-', '').isalnum():
+             # If it's mostly lowercase with a mix of letters and numbers, it is likely a temp ID
+             if sum(c.islower() for c in component) > len(component) / 2:
+                 if sum(c.isdigit() for c in component) > 2:
+                     return True
+         return False
+
+     # Generate aliases by progressively including more path components,
+     # starting from the rightmost meaningful components and working backwards
+     clean_components = []
+     for component in components:
+         if not is_temp_component(component):
+             clean_components.append(component)
+
+     # Generate aliases with increasing path depth from meaningful components
+     if clean_components:
+         for i in range(1, len(clean_components) + 1):
+             # Take the last i components
+             partial_path = '.'.join(clean_components[-i:])
+             partial_alias = f".{partial_path}.{entity_name}"
+             if partial_alias != entity_name and partial_alias not in aliases:
+                 aliases.append(partial_alias)
+
+             # Also add the full clean path without a leading dot
+             if i == len(clean_components):
+                 no_dot_alias = f"{partial_path}.{entity_name}"
+                 if no_dot_alias != entity_name and no_dot_alias not in aliases:
+                     aliases.append(no_dot_alias)
+
+     # Always add the fully qualified path at the end (even if it contains temp components)
+     if fully_qualified != entity_name and fully_qualified not in aliases:
+         aliases.append(fully_qualified)
+
+     return aliases
+
+
+ def normalize_include_path(include_path: str) -> str:
+     """
+     Normalize an include path from an #include directive to a module-like path.
+
+     Examples:
+         <vector> -> vector
+         <iostream> -> iostream
+         "myheader.h" -> myheader
+         "utils/helper.h" -> utils.helper
+         <boost/algorithm/string.hpp> -> boost.algorithm.string
+
+     Args:
+         include_path: The include path from the #include directive
+
+     Returns:
+         Normalized module-like path
+     """
+     # Remove angle brackets and quotes
+     path = include_path.strip('<>"')
+
+     # Convert to module path
+     module_path = file_path_to_module_path(path)
+
+     return module_path
+
+
+ def build_entity_alias_map(entities: Dict[str, Dict]) -> Dict[str, str]:
+     """
+     Build a mapping from all entity aliases to their canonical entity names.
+     This allows quick lookup when matching called entities to their definitions.
+
+     Args:
+         entities: Dictionary of entity info keyed by canonical entity name
+
+     Returns:
+         Dictionary mapping alias -> canonical entity name
+     """
+     alias_map = {}
+
+     for entity_name, info in entities.items():
+         # Map the canonical name to itself
+         alias_map[entity_name] = entity_name
+
+         # Map all aliases to the canonical name
+         aliases = info.get('aliases', [])
+         for alias in aliases:
+             if alias and alias not in alias_map:
+                 alias_map[alias] = entity_name
+
+     return alias_map
+
+
+ def resolve_entity_call(called_name: str, alias_map: Dict[str, str],
+                         imports: Optional[List[str]] = None) -> Optional[str]:
+     """
+     Resolve a called entity name to its canonical definition using aliases.
+
+     This handles cases like:
+     - Direct call: 'MyClass' -> 'MyClass'
+     - Qualified call: 'module.MyClass' -> 'MyClass' (if alias exists)
+     - Imported call: 'helper' -> 'utils.helper' (if imported)
+     - Simple name to qualified: 'Calculator' -> 'utils::Calculator'
+
+     Args:
+         called_name: The name of the called entity
+         alias_map: Mapping from aliases to canonical entity names
+         imports: List of import paths (optional, for context)
+
+     Returns:
+         Canonical entity name if found, None otherwise
+     """
+     # Don't try to resolve empty strings
+     if not called_name or not called_name.strip():
+         return None
+
+     # Direct match
+     if called_name in alias_map:
+         return alias_map[called_name]
+
+     # Try partial matches if imports are provided
+     if imports:
+         for import_path in imports:
+             # Try combining the import path with the called name
+             qualified = f"{import_path}.{called_name}"
+             if qualified in alias_map:
+                 return alias_map[qualified]
+
+             # Try with the :: separator (C++/Rust style)
+             qualified_cpp = f"{import_path}::{called_name}"
+             if qualified_cpp in alias_map:
+                 return alias_map[qualified_cpp]
+
+     # Try fuzzy matching - look for canonical names that end with the called name.
+     # This helps match 'Calculator' to 'utils::Calculator' or 'MyClass' to 'module.MyClass'
+     simple_name = extract_simple_name(called_name)
+     candidates = []
+
+     for alias, canonical in alias_map.items():
+         alias_simple = extract_simple_name(alias)
+         # If the simple names match, this could be a match
+         if alias_simple == simple_name:
+             candidates.append(canonical)
+
+     # If we found exactly one candidate, return it
+     if len(candidates) == 1:
+         return candidates[0]
+
+     # If we have multiple candidates, prefer the shortest qualified name
+     # (most likely to be the direct definition rather than an alias)
+     if len(candidates) > 1:
+         return min(candidates, key=len)
+
+     return None
+
+
+ def extract_simple_name(qualified_name: str) -> str:
+     """
+     Extract the simple name from a qualified name.
+
+     Examples:
+         'namespace::MyClass' -> 'MyClass'
+         'module.MyClass' -> 'MyClass'
+         'MyClass' -> 'MyClass'
+
+     Args:
+         qualified_name: Fully or partially qualified name
+
+     Returns:
+         Simple name without namespace/module prefix
+     """
+     # Handle C++ style namespace separator
+     if '::' in qualified_name:
+         return qualified_name.split('::')[-1]
+
+     # Handle Python/JS style module separator
+     if '.' in qualified_name:
+         return qualified_name.split('.')[-1]
+
+     return qualified_name
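To tie the alias machinery above together, here is a hedged end-to-end sketch. The entity registry shape only assumes the 'aliases' key that build_entity_alias_map reads; the entity name and file path are illustrative.

    aliases = generate_entity_aliases('Calculator', 'src/utils/calculator.py')
    # ['Calculator', '.calculator.Calculator', '.utils.calculator.Calculator', ...]

    entities = {'Calculator': {'aliases': aliases}}  # hypothetical registry
    alias_map = build_entity_alias_map(entities)

    resolve_entity_call('Calculator', alias_map)           # direct match -> 'Calculator'
    resolve_entity_call('pkg.Calculator', alias_map)       # fuzzy match on the simple name -> 'Calculator'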