# Source: Hugging Face Space
#   lailaelkoussy/code-knowledge-graph-explorer-transformers-library
# Space status when captured: Sleeping
# File size: 21,066 bytes
# Revision: a100cc5

import logging
import re
from typing import List, Tuple, Dict, Any, Set, Optional
from enum import Enum


class Language(Enum):
    """Programming languages for which entity matching rules exist.

    Each member's value is the lowercase name of the language.
    """

    PYTHON = "python"
    C = "c"
    CPP = "cpp"
    JAVA = "java"


class EntityChunkMapper:
    """Maps entities from file-level extraction back to their respective chunks"""

    def __init__(self):
        self.logger = logging.getLogger("ENTITY_CHUNK_MAPPER")
        self.extension_to_language = {
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'java': Language.JAVA,
        }

    def _detect_language(self, file_name: Optional[str] = None) -> Language:
        """
        Detect the programming language from file extension

        Args:
            file_name: Name of the file (optional)

        Returns:
            Language enum value, defaults to PYTHON if not detected
        """
        if file_name:
            extension = file_name.split('.')[-1].lower()
            return self.extension_to_language.get(extension, Language.PYTHON)
        return Language.PYTHON

    def _is_comment_or_docstring(self, line: str, in_docstring: bool, language: Language) -> Tuple[bool, bool]:
        """
        Check if a line is a comment or part of a docstring/multi-line comment

        Args:
            line: The line to check
            in_docstring: Whether we're currently inside a docstring/multi-line comment
            language: The programming language

        Returns:
            Tuple of (is_comment_or_docstring, new_in_docstring_state)
        """
        stripped = line.strip()

        if language == Language.PYTHON:
            # Check for single-line comments
            if stripped.startswith('#'):
                return True, in_docstring

            # Check for docstring delimiters (""" or ''')
            triple_double = '"""'
            triple_single = "'''"

            # Count occurrences of triple quotes
            if triple_double in stripped or triple_single in stripped:
                # Check if it's a single-line docstring
                if (stripped.count(triple_double) >= 2 or
                    stripped.count(triple_single) >= 2):
                    # Single-line docstring
                    return True, in_docstring
                else:
                    # Toggle docstring state
                    return True, not in_docstring

            # If we're in a docstring, this line is part of it
            if in_docstring:
                return True, in_docstring

        elif language in [Language.C, Language.CPP, Language.JAVA]:
            # Check for single-line comments
            if stripped.startswith('//'):
                return True, in_docstring

            # Check for multi-line comment delimiters /* */
            if '/*' in line and '*/' in line:
                # Single-line multi-line comment
                return True, in_docstring
            elif '/*' in line:
                # Start of multi-line comment
                return True, True
            elif '*/' in line:
                # End of multi-line comment
                return True, False

            # If we're in a multi-line comment
            if in_docstring:
                return True, in_docstring

        return False, in_docstring

    def _get_code_lines(self, chunk_lines: List[str], language: Language) -> List[str]:
        """
        Filter out comments and docstrings from chunk lines

        Args:
            chunk_lines: List of lines in the chunk
            language: The programming language

        Returns:
            List of lines that are actual code (not comments or docstrings)
        """
        code_lines = []
        in_docstring = False

        for line in chunk_lines:
            is_doc, in_docstring = self._is_comment_or_docstring(line, in_docstring, language)
            if not is_doc:
                code_lines.append(line)

        return code_lines

    def _is_valid_identifier_match(self, text: str, identifier: str, position: int) -> bool:
        """
        Check if an identifier match at a position is valid (not part of another word)

        Args:
            text: The text containing the identifier
            identifier: The identifier to check
            position: The position where the identifier was found

        Returns:
            True if this is a valid standalone identifier match
        """
        # Check character before (if exists)
        if position > 0:
            char_before = text[position - 1]
            if char_before.isalnum() or char_before == '_':
                return False

        # Check character after (if exists)
        end_pos = position + len(identifier)
        if end_pos < len(text):
            char_after = text[end_pos]
            if char_after.isalnum() or char_after == '_':
                return False

        return True

    def _contains_identifier(self, line: str, identifier: str) -> bool:
        """
        Check if a line contains an identifier as a standalone word (not part of another word)

        Args:
            line: The line to check
            identifier: The identifier to find

        Returns:
            True if the identifier appears as a standalone word
        """
        # Use word boundary regex for precise matching
        pattern = r'\b' + re.escape(identifier) + r'\b'
        return bool(re.search(pattern, line))


    def find_entity_in_chunks(self, entity_name: str, chunks: List[str], entity_type: str = None,
                            file_name: Optional[str] = None) -> Set[int]:
        """
        Find which chunks contain a specific entity declaration or call

        Args:
            entity_name: Name of the entity to find
            chunks: List of code chunks
            entity_type: Type of entity (class, function, method, variable)
            file_name: Name of the file to detect language (optional)

        Returns:
            Set of chunk indices that contain this entity
        """
        matching_chunks = set()
        language = self._detect_language(file_name)

        # Split the entity name to handle nested entities like "ClassName.method"
        # For Java/C++, also handle :: separator
        if '::' in entity_name:
            parts = entity_name.split('::')
        else:
            parts = entity_name.split('.')
        base_name = parts[-1]  # The actual identifier

        for chunk_idx, chunk in enumerate(chunks):
            chunk_lines = chunk.strip().split('\n')

            # Look for different patterns based on entity type
            if self._entity_appears_in_chunk(entity_name, base_name, chunk, chunk_lines, entity_type, language):
                matching_chunks.add(chunk_idx)

        return matching_chunks

    def _entity_appears_in_chunk(self, full_name: str, base_name: str, chunk: str, chunk_lines: List[str],
                                 entity_type: str, language: Language) -> bool:
        """Check if an entity appears in a specific chunk (excluding comments and docstrings)"""

        # Filter out comments and docstrings
        code_lines = self._get_code_lines(chunk_lines, language)

        # If no code lines remain, entity doesn't appear in actual code
        if not code_lines:
            return False

        # Language-specific entity matching
        if language == Language.PYTHON:
            return self._entity_appears_in_python(full_name, base_name, code_lines, entity_type)
        elif language in [Language.C, Language.CPP]:
            return self._entity_appears_in_c_cpp(full_name, base_name, code_lines, entity_type)
        elif language == Language.JAVA:
            return self._entity_appears_in_java(full_name, base_name, code_lines, entity_type)

        return False

    def _entity_appears_in_python(self, full_name: str, base_name: str, code_lines: List[str],
                                  entity_type: str) -> bool:
        """Check if entity appears in Python code"""

        if entity_type == "class":
            # Look for class definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'class\s+{re.escape(base_name)}[\s:(]', stripped):
                    return True

        elif entity_type == "api_endpoint":
            # Look for API endpoint definition - the function decorated with @app.get, @app.post, etc.
            # We look for the function definition itself
            for line in code_lines:
                stripped = line.strip()
                # Match the function definition with the endpoint name
                if re.match(rf'(async\s+)?def\s+{re.escape(base_name)}\s*\(', stripped):
                    return True
                # Also check for decorators that might reference the endpoint
                if re.search(rf'@\w+\.(get|post|put|delete|patch|options|head)\s*\(', stripped):
                    return True

        elif entity_type == "function":
            # Look for function definition (not method)
            for line in code_lines:
                stripped = line.strip()
                # Check it's not indented (not a method)
                if not line.startswith("    ") and not line.startswith("\t"):
                    if re.match(rf'(async\s+)?def\s+{re.escape(base_name)}\s*\(', stripped):
                        return True

        elif entity_type == "method":
            # Look for method definition (indented def)
            method_name = full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                # Check it's indented (is a method)
                if line.startswith("    ") or line.startswith("\t"):
                    if re.match(rf'(async\s+)?def\s+{re.escape(method_name)}\s*\(', stripped):
                        return True

        elif entity_type == "variable":
            # Look for variable assignment or usage
            if "." in full_name:
                parts = full_name.split('.')
                attr_name = parts[-1]
                for line in code_lines:
                    if re.search(rf'\.\s*{re.escape(attr_name)}\b', line):
                        return True
            else:
                for line in code_lines:
                    stripped = line.strip()
                    if re.match(rf'{re.escape(base_name)}\s*[=:]', stripped):
                        return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        # General usage as identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    return True

        return False

    def _extract_using_namespace_directives(self, code_lines: List[str]) -> List[str]:
        """
        Extract using namespace directives from C++ code.
        Returns a list of namespace names that are being imported.
        """
        namespaces = []
        for line in code_lines:
            stripped = line.strip()
            # Match "using namespace <name>;"
            match = re.match(r'using\s+namespace\s+([a-zA-Z_][a-zA-Z0-9_:]*)\s*;', stripped)
            if match:
                namespaces.append(match.group(1))
        return namespaces

    def _entity_appears_in_c_cpp(self, full_name: str, base_name: str, code_lines: List[str],
                                 entity_type: str) -> bool:
        """Check if entity appears in C/C++ code"""

        # Extract using namespace directives
        using_namespaces = self._extract_using_namespace_directives(code_lines)
        
        # Check if the full_name matches any imported namespace + base_name
        # e.g., if full_name is "math::Calculator" and we have "using namespace math",
        # then "Calculator" in code should match
        namespace_match = False
        if '::' in full_name:
            for ns in using_namespaces:
                # Check if full_name starts with this namespace
                if full_name.startswith(ns + '::'):
                    namespace_match = True
                    break

        if entity_type == "class":
            # Look for class/struct definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'(class|struct)\s+{re.escape(base_name)}[\s:{{]', stripped):
                    return True

        elif entity_type == "function":
            # Look for function definition or declaration
            for line in code_lines:
                stripped = line.strip()
                # Match function patterns: return_type function_name(
                # Also handle constructors and destructors
                if (re.search(rf'\b{re.escape(base_name)}\s*\(', stripped) and
                    not stripped.startswith('//')):
                    # Additional check: likely a function if followed by parameters
                    return True

        elif entity_type == "method":
            # Look for method definition (with class scope)
            method_name = full_name.split('::')[-1] if '::' in full_name else full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                # Match ClassName::methodName( or just methodName( inside class
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True

        elif entity_type == "variable":
            # Look for variable declaration or usage
            for line in code_lines:
                stripped = line.strip()
                # Match variable declarations and assignments
                if re.search(rf'\b{re.escape(base_name)}\b', stripped):
                    return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            # Look for instantiation or usage
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\b', line):
                    # If we found base_name and there's a namespace match, this is a match
                    if namespace_match:
                        return True
                    # If full_name doesn't have a namespace, it's a direct match
                    if '::' not in full_name:
                        return True

        # General usage as identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    # If we found base_name and there's a namespace match, this is a match
                    if namespace_match:
                        return True
                    # If full_name doesn't have a namespace, it's a direct match
                    if '::' not in full_name:
                        return True

        return False

    def _entity_appears_in_java(self, full_name: str, base_name: str, code_lines: List[str],
                                entity_type: str) -> bool:
        """Check if entity appears in Java code"""

        if entity_type == "class":
            # Look for class/interface/enum definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'(public|private|protected)?\s*(class|interface|enum)\s+{re.escape(base_name)}[\s<{{]', stripped):
                    return True
                # Without modifier
                if re.match(rf'(class|interface|enum)\s+{re.escape(base_name)}[\s<{{]', stripped):
                    return True

        elif entity_type == "api_endpoint":
            # Look for API endpoint definition - the method with Spring annotations
            # Extract just the method name from the full qualified name (e.g., "com.example.Controller::method" -> "method")
            method_name = base_name.split('::')[-1] if '::' in base_name else base_name
            for line in code_lines:
                stripped = line.strip()
                # Match the method definition
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True
                # Also check for Spring annotations
                if re.search(r'@(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)', stripped):
                    return True

        elif entity_type == "function":
            # In Java, functions are methods
            for line in code_lines:
                stripped = line.strip()
                # Match method signature patterns
                if re.search(rf'\b{re.escape(base_name)}\s*\(', stripped):
                    return True

        elif entity_type == "method":
            # Look for method definition
            method_name = full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True

        elif entity_type == "variable":
            # Look for variable declaration or usage
            for line in code_lines:
                stripped = line.strip()
                if re.search(rf'\b{re.escape(base_name)}\b', stripped):
                    return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            # Look for instantiation (new ClassName) or usage
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\b', line):
                    return True

        # General usage as identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    return True

        return False

    def map_entities_to_chunks(self, declared_entities: List[Dict[str, Any]],
                               called_entities: List[str],
                               chunks: List[str],
                               file_name: Optional[str] = None) -> Tuple[Dict[int, List[Dict[str, Any]]],
    Dict[int, List[str]]]:
        """
        Map file-level entities back to their respective chunks

        Args:
            declared_entities: List of declared entities from file-level extraction
            called_entities: List of called entities from file-level extraction
            chunks: List of code chunks
            file_name: Name of the file to detect language (optional)

        Returns:
            Tuple of (chunk_declared_entities, chunk_called_entities)
            - chunk_declared_entities: Dict mapping chunk_index -> list of declared entities
            - chunk_called_entities: Dict mapping chunk_index -> list of called entities
        """
        chunk_declared = {}
        chunk_called = {}

        # Initialize empty lists for all chunks
        for i in range(len(chunks)):
            chunk_declared[i] = []
            chunk_called[i] = []

        # Map declared entities to chunks
        for entity in declared_entities:
            entity_name = entity.get("name", "")
            entity_type = entity.get("type", "")

            matching_chunks = self.find_entity_in_chunks(entity_name, chunks, entity_type, file_name)

            # Add entity to matching chunks
            for chunk_idx in matching_chunks:
                chunk_declared[chunk_idx].append(entity)

        # Map called entities to chunks
        for called_entity in called_entities:
            matching_chunks = self.find_entity_in_chunks(called_entity, chunks, None, file_name)

            # Add called entity to matching chunks
            for chunk_idx in matching_chunks:
                if called_entity not in chunk_called[chunk_idx]:
                    chunk_called[chunk_idx].append(called_entity)

        return chunk_declared, chunk_called