# Provenance: uploaded by lailaelkoussy -- commit a100cc5 ("upload repoknowledgegraphlib", verified)
import logging
import re
from typing import List, Tuple, Dict, Any, Set, Optional
from enum import Enum
class Language(Enum):
    """Programming languages the entity/chunk mapper can distinguish."""

    # Each member's value is the lower-case language name.
    PYTHON = "python"
    C = "c"
    CPP = "cpp"
    JAVA = "java"
class EntityChunkMapper:
    """Map entities from file-level extraction back to the chunks they occur in.

    Matching is heuristic and line based: every chunk is split into lines,
    comment/docstring lines are dropped, and the remaining lines are searched
    with language-specific regular expressions.
    """

    # A stripped line that begins with an (optionally prefixed) triple-quoted
    # string, i.e. a docstring-style line rather than code followed by a string.
    _PY_TRIPLE_QUOTE_START = re.compile('[rRbBuUfF]{0,2}(?:"{3}|\'{3})')

    def __init__(self):
        """Set up the logger and the file-extension -> language table."""
        self.logger = logging.getLogger("ENTITY_CHUNK_MAPPER")
        # Keys are lower-case extensions without the leading dot.
        self.extension_to_language = {
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'java': Language.JAVA,
        }

    # ------------------------------------------------------------------
    # Small regex helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _standalone_regex(name: str) -> "re.Pattern":
        """Compile a pattern matching ``name`` when it is not part of a longer word.

        Look-around assertions are used instead of word-boundary anchors so that
        names beginning or ending with a non-word character (for example a C++
        destructor such as ``~Widget``) can still match. For ordinary
        identifiers the result is the same as with word-boundary anchors.
        """
        return re.compile(rf'(?<!\w){re.escape(name)}(?!\w)')

    @staticmethod
    def _call_regex(name: str) -> "re.Pattern":
        """Compile a pattern matching ``name`` followed by an opening parenthesis."""
        return re.compile(rf'(?<!\w){re.escape(name)}\s*\(')

    @staticmethod
    def _any_line_matches(regex: "re.Pattern", lines: List[str]) -> bool:
        """Return True if ``regex`` is found anywhere in at least one line."""
        return any(regex.search(line) for line in lines)

    # ------------------------------------------------------------------
    # Language detection and comment filtering
    # ------------------------------------------------------------------
    def _detect_language(self, file_name: Optional[str] = None) -> "Language":
        """
        Detect the programming language from the file extension

        Args:
            file_name: Name of the file (optional)

        Returns:
            Language enum value; PYTHON when the name is missing, has no
            extension, or has an extension that is not in the table
        """
        if not file_name:
            return Language.PYTHON
        _, dot, extension = file_name.rpartition('.')
        if not dot:
            # No dot at all: a file literally named "c" or "java" has no extension.
            return Language.PYTHON
        return self.extension_to_language.get(extension.lower(), Language.PYTHON)

    def _is_comment_or_docstring(self, line: str, in_docstring: bool,
                                 language: "Language") -> Tuple[bool, bool]:
        """
        Check if a line is a comment or part of a docstring/multi-line comment

        This is a line-level heuristic, not a parser: delimiters inside string
        literals are not recognised as such, and for Python the kind of
        triple quote that opened a block is not tracked.

        Args:
            line: The line to check
            in_docstring: Whether we're currently inside a docstring/multi-line comment
            language: The programming language

        Returns:
            Tuple of (is_comment_or_docstring, new_in_docstring_state). A line
            that has code in front of the opening delimiter is reported as
            code (False) while the state is still updated.
        """
        stripped = line.strip()

        if language == Language.PYTHON:
            if stripped.startswith('#'):
                return True, in_docstring

            triple_double = '"""'
            triple_single = "'''"
            if triple_double in stripped or triple_single in stripped:
                # Two delimiters of one kind: opened and closed on this line.
                opened_and_closed = (stripped.count(triple_double) >= 2
                                     or stripped.count(triple_single) >= 2)
                next_state = in_docstring if opened_and_closed else not in_docstring
                # Outside a docstring, only a line that starts with the string
                # is documentation; `x = <triple-quoted string>` is code.
                starts_with_string = self._PY_TRIPLE_QUOTE_START.match(stripped) is not None
                return (in_docstring or starts_with_string), next_state

            # No delimiter: the line is documentation only while inside a block.
            return in_docstring, in_docstring

        if language in (Language.C, Language.CPP, Language.JAVA):
            if stripped.startswith('//'):
                return True, in_docstring

            opens = '/*' in line
            closes = '*/' in line
            if not (opens or closes):
                return in_docstring, in_docstring

            if opens and closes:
                next_state = in_docstring      # comment begins and ends on this line
            else:
                next_state = opens             # opener only -> inside; terminator only -> outside

            if in_docstring or not opens:
                # Continuation of a block comment, or a lone terminator.
                return True, next_state
            if closes:
                # Inline block comment: a pure comment line starts and ends with it.
                is_comment = stripped.startswith('/*') and stripped.endswith('*/')
            else:
                # Comment opens here; it is code if something precedes the opener.
                is_comment = stripped.startswith('/*')
            return is_comment, next_state

        return False, in_docstring

    def _get_code_lines(self, chunk_lines: List[str], language: "Language") -> List[str]:
        """
        Filter out comments and docstrings from chunk lines

        The docstring/block-comment state starts as "outside" for every call,
        so it is not carried over from one chunk to the next.

        Args:
            chunk_lines: List of lines in the chunk
            language: The programming language

        Returns:
            List of lines that are actual code (not comments or docstrings)
        """
        code_lines = []
        inside_block = False
        for line in chunk_lines:
            is_doc, inside_block = self._is_comment_or_docstring(line, inside_block, language)
            if not is_doc:
                code_lines.append(line)
        return code_lines

    def _prepare_chunks(self, chunks: List[str], language: "Language") -> List[List[str]]:
        """Split every chunk into lines and drop its comment/docstring lines.

        Done once per chunk so that callers matching many entities do not
        repeat the filtering for each entity.
        """
        return [self._get_code_lines(chunk.splitlines(), language) for chunk in chunks]

    # ------------------------------------------------------------------
    # Identifier helpers
    # ------------------------------------------------------------------
    def _is_valid_identifier_match(self, text: str, identifier: str, position: int) -> bool:
        """
        Check if an identifier match at a position is valid (not part of another word)

        Args:
            text: The text containing the identifier
            identifier: The identifier to check
            position: The position where the identifier was found

        Returns:
            True if neither the character before nor the character after the
            match is alphanumeric or an underscore
        """
        end_pos = position + len(identifier)
        # An empty string stands in for "no neighbour" at either end of the text.
        before = text[position - 1] if position > 0 else ''
        after = text[end_pos] if end_pos < len(text) else ''
        return not any(ch.isalnum() or ch == '_' for ch in (before, after))

    def _contains_identifier(self, line: str, identifier: str) -> bool:
        """
        Check if a line contains an identifier as a standalone word (not part of another word)

        Args:
            line: The line to check
            identifier: The identifier to find

        Returns:
            True if the identifier appears as a standalone word; always False
            for an empty identifier
        """
        if not identifier:
            return False
        return self._standalone_regex(identifier).search(line) is not None

    # ------------------------------------------------------------------
    # Chunk matching
    # ------------------------------------------------------------------
    def find_entity_in_chunks(self, entity_name: str, chunks: List[str], entity_type: Optional[str] = None,
                              file_name: Optional[str] = None) -> Set[int]:
        """
        Find which chunks contain a specific entity declaration or call

        Args:
            entity_name: Name of the entity to find; may be qualified with
                "." or "::" (e.g. "ClassName.method")
            chunks: List of code chunks
            entity_type: Type of entity (class, function, method, variable,
                api_endpoint); None or an empty string means "any usage"
            file_name: Name of the file to detect language (optional)

        Returns:
            Set of chunk indices that contain this entity (empty for an empty name)
        """
        if not entity_name:
            return set()
        language = self._detect_language(file_name)
        prepared = self._prepare_chunks(chunks, language)
        return self._match_prepared_chunks(entity_name, entity_type, prepared, language)

    def _match_prepared_chunks(self, entity_name: str, entity_type: Optional[str],
                               prepared: List[List[str]], language: "Language") -> Set[int]:
        """Return the indices of the pre-filtered chunks in which the entity appears."""
        if not entity_name:
            return set()
        # A missing type ("" or None) is searched as a plain identifier instead
        # of falling through every type-specific branch and never matching.
        entity_type = entity_type or None
        # "::" takes precedence as the scope separator; otherwise split on ".".
        separator = '::' if '::' in entity_name else '.'
        base_name = entity_name.split(separator)[-1]
        if not base_name:
            # e.g. "Foo." -- an empty identifier would match almost any line.
            return set()
        return {
            idx
            for idx, code_lines in enumerate(prepared)
            if self._entity_appears_in_code(entity_name, base_name, code_lines, entity_type, language)
        }

    def _entity_appears_in_chunk(self, full_name: str, base_name: str, chunk: str, chunk_lines: List[str],
                                 entity_type: Optional[str], language: "Language") -> bool:
        """Check if an entity appears in a specific chunk (excluding comments and docstrings)

        ``chunk`` is accepted for interface compatibility; only ``chunk_lines``
        is inspected.
        """
        code_lines = self._get_code_lines(chunk_lines, language)
        return self._entity_appears_in_code(full_name, base_name, code_lines, entity_type, language)

    def _entity_appears_in_code(self, full_name: str, base_name: str, code_lines: List[str],
                                entity_type: Optional[str], language: "Language") -> bool:
        """Dispatch already-filtered code lines to the language-specific matcher."""
        if not code_lines:
            # Nothing but comments/docstrings: the entity is not in actual code.
            return False
        if language == Language.PYTHON:
            return self._entity_appears_in_python(full_name, base_name, code_lines, entity_type)
        if language in (Language.C, Language.CPP):
            return self._entity_appears_in_c_cpp(full_name, base_name, code_lines, entity_type)
        if language == Language.JAVA:
            return self._entity_appears_in_java(full_name, base_name, code_lines, entity_type)
        return False

    # ------------------------------------------------------------------
    # Language-specific matchers
    # ------------------------------------------------------------------
    def _entity_appears_in_python(self, full_name: str, base_name: str, code_lines: List[str],
                                  entity_type: Optional[str]) -> bool:
        """Check if entity appears in Python code

        Args:
            full_name: Entity name as extracted (possibly dotted)
            base_name: Last segment of the name, the identifier searched for
            code_lines: Lines of the chunk with comments/docstrings removed
            entity_type: "class", "api_endpoint", "function", "method",
                "variable" or None (any standalone use of the identifier)

        Returns:
            True if a definition or use is found; False otherwise, including
            for an entity type not listed above
        """
        if not base_name:
            return False
        name = re.escape(base_name)
        call_rx = self._call_regex(base_name)

        if entity_type == "class":
            # Definition (`class Name:`, `class Name(Base):`, `class Name[T]:`) ...
            class_rx = re.compile(rf'class\s+{name}(?!\w)')
            if any(class_rx.match(line.strip()) for line in code_lines):
                return True
            # ... or instantiation `Name(`.
            return self._any_line_matches(call_rx, code_lines)

        if entity_type == "api_endpoint":
            handler_rx = re.compile(rf'(async\s+)?def\s+{name}\s*\(')
            # NOTE(review): any route decorator makes the chunk match, whatever
            # the endpoint name is -- confirm this breadth is intended.
            route_rx = re.compile(r'@\w+\.(get|post|put|delete|patch|options|head)\s*\(')
            for line in code_lines:
                stripped = line.strip()
                if handler_rx.match(stripped) or route_rx.search(stripped):
                    return True
            return False

        if entity_type == "function":
            # `def name(` also contains `name(`, so one search covers both the
            # definition and every call.
            return self._any_line_matches(call_rx, code_lines)

        if entity_type == "method":
            method_name = full_name.split('.')[-1]
            method_rx = re.compile(rf'(async\s+)?def\s+{re.escape(method_name)}\s*\(')
            for line in code_lines:
                # An indented `def` is taken to be a method.
                if line.startswith((" ", "\t")) and method_rx.match(line.strip()):
                    return True
            return self._any_line_matches(call_rx, code_lines)

        if entity_type == "variable":
            if "." in full_name:
                # Attribute access such as `self.attr`.
                attr = re.escape(full_name.split('.')[-1])
                attr_rx = re.compile(rf'\.\s*{attr}(?!\w)')
                if self._any_line_matches(attr_rx, code_lines):
                    return True
            # An assignment `name = ...` contains the bare name, so the
            # identifier search covers assignments and plain uses alike.
            return any(self._contains_identifier(line, base_name) for line in code_lines)

        if entity_type is None:
            # Any standalone use of the identifier (this includes calls).
            return any(self._contains_identifier(line, base_name) for line in code_lines)

        return False

    def _extract_using_namespace_directives(self, code_lines: List[str]) -> List[str]:
        """
        Extract using namespace directives from C++ code.

        Only lines of the form ``using namespace <name>;`` are recognised,
        where the name may contain ``::``.

        Returns a list of namespace names that are being imported, in the
        order they appear.
        """
        directive_rx = re.compile(r'using\s+namespace\s+([a-zA-Z_][a-zA-Z0-9_:]*)\s*;')
        matches = (directive_rx.match(line.strip()) for line in code_lines)
        return [match.group(1) for match in matches if match]

    def _entity_appears_in_c_cpp(self, full_name: str, base_name: str, code_lines: List[str],
                                 entity_type: Optional[str]) -> bool:
        """Check if entity appears in C/C++ code

        For a namespace-qualified ``full_name`` (e.g. "math::Calculator") a
        bare use of ``base_name`` only counts when the chunk also contains a
        matching ``using namespace`` directive; the fully qualified spelling
        always counts.

        Args:
            full_name: Entity name as extracted (possibly qualified with "::")
            base_name: Last segment of the name, the identifier searched for
            code_lines: Lines of the chunk with comments removed
            entity_type: "class", "function", "method", "variable" or None

        Returns:
            True if a definition or use is found; False otherwise, including
            for an entity type not listed above
        """
        if not base_name:
            return False
        ident_rx = self._standalone_regex(base_name)
        call_rx = self._call_regex(base_name)

        if entity_type == "function":
            # Declarations, definitions and calls all look like `name(`.
            return self._any_line_matches(call_rx, code_lines)

        if entity_type == "method":
            method_name = full_name.split('::')[-1] if '::' in full_name else full_name.split('.')[-1]
            if method_name and self._any_line_matches(self._call_regex(method_name), code_lines):
                return True
            return self._any_line_matches(call_rx, code_lines)

        if entity_type == "variable":
            return self._any_line_matches(ident_rx, code_lines)

        if entity_type == "class":
            # `class Name {`, `struct Name` with the brace on the next line,
            # forward declarations and `template <...> class Name` all count.
            class_rx = re.compile(rf'(?<!\w)(?:class|struct)\s+{re.escape(base_name)}(?!\w)')
            if self._any_line_matches(class_rx, code_lines):
                return True
        elif entity_type is None:
            if self._any_line_matches(call_rx, code_lines):
                return True
        else:
            return False

        # Usage as a type or plain identifier ("class" and untyped entities).
        if '::' not in full_name:
            return self._any_line_matches(ident_rx, code_lines)
        imported = self._extract_using_namespace_directives(code_lines)
        if any(full_name.startswith(namespace + '::') for namespace in imported):
            # e.g. "math::Calculator" with `using namespace math;` in this chunk.
            return self._any_line_matches(ident_rx, code_lines)
        # No directive in this chunk: require the qualified spelling itself.
        return self._any_line_matches(self._standalone_regex(full_name), code_lines)

    def _entity_appears_in_java(self, full_name: str, base_name: str, code_lines: List[str],
                                entity_type: Optional[str]) -> bool:
        """Check if entity appears in Java code

        Args:
            full_name: Entity name as extracted (possibly dotted)
            base_name: Last segment of the name, the identifier searched for
            code_lines: Lines of the chunk with comments removed
            entity_type: "class", "api_endpoint", "function", "method",
                "variable" or None

        Returns:
            True if a definition or use is found; False otherwise, including
            for an entity type not listed above
        """
        if not base_name:
            return False
        call_rx = self._call_regex(base_name)

        if entity_type == "api_endpoint":
            method_name = base_name.split('::')[-1]
            handler_rx = self._call_regex(method_name) if method_name else None
            # NOTE(review): any Spring mapping annotation makes the chunk match,
            # whatever the endpoint name is -- confirm this breadth is intended.
            mapping_rx = re.compile(
                r'@(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)')
            for line in code_lines:
                stripped = line.strip()
                if (handler_rx and handler_rx.search(stripped)) or mapping_rx.search(stripped):
                    return True
            return False

        if entity_type == "function":
            # In Java, functions are methods: look for `name(`.
            return self._any_line_matches(call_rx, code_lines)

        if entity_type == "method":
            method_name = full_name.split('.')[-1]
            if method_name and self._any_line_matches(self._call_regex(method_name), code_lines):
                return True
            return self._any_line_matches(call_rx, code_lines)

        if entity_type in ("class", "variable") or entity_type is None:
            # A class/interface/enum declaration, `new Name(...)`, a type use and
            # a variable use all contain the bare name, so one search covers them.
            return any(self._contains_identifier(line, base_name) for line in code_lines)

        return False

    # ------------------------------------------------------------------
    # Public mapping entry point
    # ------------------------------------------------------------------
    def map_entities_to_chunks(self, declared_entities: List[Dict[str, Any]],
                               called_entities: List[str],
                               chunks: List[str],
                               file_name: Optional[str] = None) -> Tuple[Dict[int, List[Dict[str, Any]]],
                                                                         Dict[int, List[str]]]:
        """
        Map file-level entities back to their respective chunks

        Args:
            declared_entities: List of declared entities from file-level
                extraction; each dict is read through its "name" and "type" keys
            called_entities: List of called entities from file-level extraction
            chunks: List of code chunks
            file_name: Name of the file to detect language (optional)

        Returns:
            Tuple of (chunk_declared_entities, chunk_called_entities)
            - chunk_declared_entities: Dict mapping chunk_index -> list of declared entities
            - chunk_called_entities: Dict mapping chunk_index -> list of called
              entities (each name at most once per chunk)
            Every chunk index is present in both dicts, possibly with an empty list.
        """
        language = self._detect_language(file_name)
        # Filter comments out of every chunk once instead of once per entity.
        prepared = self._prepare_chunks(chunks, language)

        chunk_declared: Dict[int, List[Dict[str, Any]]] = {idx: [] for idx in range(len(chunks))}
        chunk_called: Dict[int, List[str]] = {idx: [] for idx in range(len(chunks))}

        for entity in declared_entities:
            hits = self._match_prepared_chunks(entity.get("name", ""), entity.get("type"),
                                               prepared, language)
            for chunk_idx in hits:
                chunk_declared[chunk_idx].append(entity)

        # dict.fromkeys drops repeated names while keeping first-seen order, so
        # each called entity is searched for, and recorded, only once.
        for called_entity in dict.fromkeys(called_entities):
            hits = self._match_prepared_chunks(called_entity, None, prepared, language)
            for chunk_idx in hits:
                chunk_called[chunk_idx].append(called_entity)

        return chunk_declared, chunk_called