|
|
import logging |
|
|
import re |
|
|
from typing import List, Tuple, Dict, Any, Set, Optional |
|
|
from enum import Enum |
|
|
|
|
|
|
|
|
class Language(Enum):
    """Programming languages whose chunks the mapper knows how to scan."""

    PYTHON = "python"
    C = "c"
    CPP = "cpp"
    JAVA = "java"


class EntityChunkMapper:
    """Map entities from file-level extraction back to the chunks they appear in.

    Matching is heuristic and line based: each chunk is split into lines,
    comment/docstring lines are dropped, and the remaining lines are searched
    with regular expressions chosen by language and entity type.
    """

    # Languages that share ``//`` and ``/* ... */`` comment syntax.
    _C_STYLE_COMMENT_LANGUAGES = (Language.C, Language.CPP, Language.JAVA)
    # Comment markers of those languages, tokenised left to right.
    _C_COMMENT_TOKEN = re.compile(r'/\*|\*/|//')
    # ``@app.get(`` / ``@router.post(`` style Python route decorators.
    _PY_ROUTE_DECORATOR = r'@\w+\.(get|post|put|delete|patch|options|head)\s*\('
    # Spring-style request-mapping annotations.
    _JAVA_MAPPING_ANNOTATION = (
        r'@(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)'
    )

    def __init__(self):
        self.logger = logging.getLogger("ENTITY_CHUNK_MAPPER")
        # File extension (lower case, without the dot) -> language.
        self.extension_to_language = {
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'java': Language.JAVA,
        }

    # ------------------------------------------------------------------
    # Small pattern helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _identifier_pattern(name: str) -> str:
        """Return a regex matching ``name`` as a standalone token.

        Look-arounds are used instead of ``\\b`` so that names whose first or
        last character is not a word character (e.g. ``~Widget``) can still
        match; for ordinary identifiers the two forms are equivalent.
        """
        return r'(?<!\w)' + re.escape(name) + r'(?!\w)'

    @staticmethod
    def _call_pattern(name: str) -> str:
        """Return a regex matching ``name`` followed by an opening parenthesis.

        This matches calls as well as definitions such as ``def name(`` or
        ``void name(``.
        """
        return r'(?<!\w)' + re.escape(name) + r'\s*\('

    @staticmethod
    def _any_line(pattern: str, lines: List[str], anchored: bool = False) -> bool:
        """Tell whether ``pattern`` matches at least one of ``lines``.

        Args:
            pattern: Regular expression source
            lines: Lines to test
            anchored: When True the pattern must match at the start of the
                whitespace-stripped line; otherwise it may match anywhere

        Returns:
            True if any line matches
        """
        regex = re.compile(pattern)
        if anchored:
            return any(regex.match(line.strip()) for line in lines)
        return any(regex.search(line) for line in lines)

    # ------------------------------------------------------------------
    # Language detection and comment filtering
    # ------------------------------------------------------------------

    def _detect_language(self, file_name: Optional[str] = None) -> Language:
        """
        Detect the programming language from file extension

        Args:
            file_name: Name of the file (optional)

        Returns:
            Language enum value, defaults to PYTHON if not detected
        """
        if not file_name:
            return Language.PYTHON
        # Require an actual dot so a bare name such as "c" or "java" is not
        # mistaken for an extension.
        _, dot, extension = file_name.rpartition('.')
        if not dot:
            return Language.PYTHON
        return self.extension_to_language.get(extension.lower(), Language.PYTHON)

    def _is_comment_or_docstring(self, line: str, in_docstring: bool,
                                 language: Language) -> Tuple[bool, bool]:
        """
        Check if a line is a comment or part of a docstring/multi-line comment

        Args:
            line: The line to check
            in_docstring: Whether we're currently inside a docstring/multi-line comment
            language: The programming language

        Returns:
            Tuple of (is_comment_or_docstring, new_in_docstring_state)
        """
        stripped = line.strip()
        if language == Language.PYTHON:
            return self._scan_python_line(stripped, in_docstring)
        if language in self._C_STYLE_COMMENT_LANGUAGES:
            return self._scan_c_style_line(stripped, in_docstring)
        return False, in_docstring

    @staticmethod
    def _scan_python_line(stripped: str, in_docstring: bool) -> Tuple[bool, bool]:
        """Classify one stripped Python line; see ``_is_comment_or_docstring``."""
        if stripped.startswith('#'):
            return True, in_docstring

        double_quotes = stripped.count('"""')
        single_quotes = stripped.count("'''")
        if double_quotes or single_quotes:
            # Two delimiters of the same kind open and close on this line, so
            # the state is unchanged; a lone delimiter toggles it.
            if double_quotes >= 2 or single_quotes >= 2:
                return True, in_docstring
            return True, not in_docstring

        # Any other line is a docstring line exactly when we are inside one.
        return in_docstring, in_docstring

    def _scan_c_style_line(self, stripped: str, in_docstring: bool) -> Tuple[bool, bool]:
        """Classify one stripped C/C++/Java line; see ``_is_comment_or_docstring``.

        The line is tokenised on ``/*``, ``*/`` and ``//``.  It counts as a
        comment only when no text is left outside the comments, so code that
        shares a line with a comment (``x = f(); /* note */``) is kept.
        """
        if not stripped:
            # Blank lines belong to the comment only while inside one.
            return in_docstring, in_docstring

        has_code = False
        cursor = 0  # start of the text not yet attributed to code or comment
        for token in self._C_COMMENT_TOKEN.finditer(stripped):
            marker = token.group()
            if in_docstring:
                if marker == '*/':
                    in_docstring = False
                    cursor = token.end()
                continue
            if marker == '*/':
                # Closer without an opener on this line: treat the text up to
                # it as comment (presumably the comment began earlier).
                cursor = token.end()
                continue
            if stripped[cursor:token.start()].strip():
                has_code = True
            if marker == '//':
                # The rest of the line is a line comment.
                cursor = len(stripped)
                break
            in_docstring = True  # marker is '/*'

        if not in_docstring and stripped[cursor:].strip():
            has_code = True
        return not has_code, in_docstring

    def _get_code_lines(self, chunk_lines: List[str], language: Language) -> List[str]:
        """
        Filter out comments and docstrings from chunk lines

        Args:
            chunk_lines: List of lines in the chunk
            language: The programming language

        Returns:
            List of lines that are actual code (not comments or docstrings)
        """
        kept = []
        inside_comment = False  # every chunk is scanned starting outside a comment
        for line in chunk_lines:
            skip, inside_comment = self._is_comment_or_docstring(line, inside_comment, language)
            if not skip:
                kept.append(line)
        return kept

    # ------------------------------------------------------------------
    # Identifier helpers
    # ------------------------------------------------------------------

    def _is_valid_identifier_match(self, text: str, identifier: str, position: int) -> bool:
        """
        Check if an identifier match at a position is valid (not part of another word)

        Args:
            text: The text containing the identifier
            identifier: The identifier to check
            position: The position where the identifier was found

        Returns:
            True if this is a valid standalone identifier match
        """
        def is_word_char(char: str) -> bool:
            return char.isalnum() or char == '_'

        if position > 0 and is_word_char(text[position - 1]):
            return False
        end_pos = position + len(identifier)
        if end_pos < len(text) and is_word_char(text[end_pos]):
            return False
        return True

    def _contains_identifier(self, line: str, identifier: str) -> bool:
        """
        Check if a line contains an identifier as a standalone word (not part of another word)

        Args:
            line: The line to check
            identifier: The identifier to find

        Returns:
            True if the identifier appears as a standalone word; always False
            for an empty identifier
        """
        if not identifier:
            # An empty pattern would match every line.
            return False
        return re.search(self._identifier_pattern(identifier), line) is not None

    # ------------------------------------------------------------------
    # Chunk search
    # ------------------------------------------------------------------

    def find_entity_in_chunks(self, entity_name: str, chunks: List[str],
                              entity_type: Optional[str] = None,
                              file_name: Optional[str] = None) -> Set[int]:
        """
        Find which chunks contain a specific entity declaration or call

        Args:
            entity_name: Name of the entity to find
            chunks: List of code chunks
            entity_type: Type of entity (class, function, method, variable)
            file_name: Name of the file to detect language (optional)

        Returns:
            Set of chunk indices that contain this entity; empty when the
            name (or its last component) is empty
        """
        if not entity_name:
            return set()

        # The last component of a ``a::b`` or ``a.b`` name is what is searched.
        separator = '::' if '::' in entity_name else '.'
        base_name = entity_name.split(separator)[-1]
        if not base_name:
            return set()

        language = self._detect_language(file_name)
        matching_chunks = set()
        for chunk_idx, chunk in enumerate(chunks):
            # Strip only newlines: the indentation of the first line matters
            # to the Python method check.
            chunk_lines = chunk.strip('\r\n').split('\n')
            if self._entity_appears_in_chunk(entity_name, base_name, chunk, chunk_lines,
                                             entity_type, language):
                matching_chunks.add(chunk_idx)
        return matching_chunks

    def _entity_appears_in_chunk(self, full_name: str, base_name: str, chunk: str,
                                 chunk_lines: List[str], entity_type: Optional[str],
                                 language: Language) -> bool:
        """Check if an entity appears in a specific chunk (excluding comments and docstrings).

        ``chunk`` itself is not inspected; only ``chunk_lines`` is.
        """
        code_lines = self._get_code_lines(chunk_lines, language)
        if not code_lines:
            return False

        if language == Language.PYTHON:
            return self._entity_appears_in_python(full_name, base_name, code_lines, entity_type)
        if language in (Language.C, Language.CPP):
            return self._entity_appears_in_c_cpp(full_name, base_name, code_lines, entity_type)
        if language == Language.JAVA:
            return self._entity_appears_in_java(full_name, base_name, code_lines, entity_type)
        return False

    def _entity_appears_in_python(self, full_name: str, base_name: str, code_lines: List[str],
                                  entity_type: Optional[str]) -> bool:
        """Check if entity appears in Python code.

        Entity types other than the ones handled below never match.
        """
        name = re.escape(base_name)
        call = self._call_pattern(base_name)

        if entity_type == "api_endpoint":
            handler_def = rf'(async\s+)?def\s+{name}\s*\('
            # NOTE(review): the decorator check does not look at the entity
            # name, so any route decorator in the chunk counts - confirm that
            # this is intended.
            return (self._any_line(handler_def, code_lines, anchored=True)
                    or self._any_line(self._PY_ROUTE_DECORATOR, code_lines))

        if entity_type == "class":
            declaration = rf'class\s+{name}[\s:(]'
            # Declaration, or instantiation ``Name(``.
            return (self._any_line(declaration, code_lines, anchored=True)
                    or self._any_line(call, code_lines))

        if entity_type == "function":
            # ``def name(`` also satisfies the call pattern, so one search
            # covers both definitions and calls.
            return self._any_line(call, code_lines)

        if entity_type == "method":
            method_name = re.escape(full_name.split('.')[-1])
            method_def = re.compile(rf'(async\s+)?def\s+{method_name}\s*\(')
            # A method definition is an indented ``def``.
            defined_here = any(
                line.startswith((" ", "\t")) and method_def.match(line.strip())
                for line in code_lines
            )
            return defined_here or self._any_line(call, code_lines)

        if entity_type == "variable":
            if "." in full_name:
                attribute = re.escape(full_name.split('.')[-1])
                if self._any_line(rf'\.\s*{attribute}\b', code_lines):
                    return True
            return any(self._contains_identifier(line, base_name) for line in code_lines)

        if entity_type is None:
            # Generic search: any standalone occurrence (this includes calls).
            return any(self._contains_identifier(line, base_name) for line in code_lines)

        return False

    def _extract_using_namespace_directives(self, code_lines: List[str]) -> List[str]:
        """
        Extract using namespace directives from C++ code.
        Returns a list of namespace names that are being imported.
        """
        directive = re.compile(r'using\s+namespace\s+([a-zA-Z_][a-zA-Z0-9_:]*)\s*;')
        matches = (directive.match(line.strip()) for line in code_lines)
        return [match.group(1) for match in matches if match]

    def _referenced_in_cpp_scope(self, full_name: str, base_name: str,
                                 code_lines: List[str]) -> bool:
        """Tell whether ``base_name`` in these lines can refer to ``full_name``.

        An occurrence of ``base_name`` counts when the name is unqualified,
        when a ``using namespace`` directive in the same lines is a prefix of
        ``full_name``, or when ``full_name`` itself is written out in the code.
        """
        if not any(self._contains_identifier(line, base_name) for line in code_lines):
            return False
        if '::' not in full_name:
            return True
        imported = self._extract_using_namespace_directives(code_lines)
        if any(full_name.startswith(namespace + '::') for namespace in imported):
            return True
        return any(self._contains_identifier(line, full_name) for line in code_lines)

    def _entity_appears_in_c_cpp(self, full_name: str, base_name: str, code_lines: List[str],
                                 entity_type: Optional[str]) -> bool:
        """Check if entity appears in C/C++ code.

        Entity types other than the ones handled below never match.
        """
        call = self._call_pattern(base_name)

        if entity_type == "function":
            return self._any_line(call, code_lines)

        if entity_type == "method":
            separator = '::' if '::' in full_name else '.'
            method_call = self._call_pattern(full_name.split(separator)[-1])
            return (self._any_line(method_call, code_lines)
                    or self._any_line(call, code_lines))

        if entity_type == "variable":
            return any(self._contains_identifier(line, base_name) for line in code_lines)

        if entity_type == "class":
            # ``$`` accepts declarations whose opening brace is on the next line.
            declaration = rf'(class|struct)\s+{re.escape(base_name)}(?:[\s:{{]|$)'
            return (self._any_line(declaration, code_lines, anchored=True)
                    or self._referenced_in_cpp_scope(full_name, base_name, code_lines))

        if entity_type is None:
            return (self._any_line(call, code_lines)
                    or self._referenced_in_cpp_scope(full_name, base_name, code_lines))

        return False

    def _entity_appears_in_java(self, full_name: str, base_name: str, code_lines: List[str],
                                entity_type: Optional[str]) -> bool:
        """Check if entity appears in Java code.

        Entity types other than the ones handled below never match.
        """
        call = self._call_pattern(base_name)

        if entity_type == "api_endpoint":
            handler_call = self._call_pattern(base_name.split('::')[-1])
            # NOTE(review): the annotation check does not look at the entity
            # name, so any mapping annotation in the chunk counts - confirm
            # that this is intended.
            return (self._any_line(handler_call, code_lines)
                    or self._any_line(self._JAVA_MAPPING_ANNOTATION, code_lines))

        if entity_type == "function":
            return self._any_line(call, code_lines)

        if entity_type == "method":
            method_call = self._call_pattern(full_name.split('.')[-1])
            return (self._any_line(method_call, code_lines)
                    or self._any_line(call, code_lines))

        if entity_type in ("class", "variable") or entity_type is None:
            # Any standalone occurrence counts; this also covers
            # ``class``/``interface``/``enum`` declarations and calls.
            return any(self._contains_identifier(line, base_name) for line in code_lines)

        return False

    # ------------------------------------------------------------------
    # Public mapping entry point
    # ------------------------------------------------------------------

    def map_entities_to_chunks(self, declared_entities: List[Dict[str, Any]],
                               called_entities: List[str],
                               chunks: List[str],
                               file_name: Optional[str] = None) -> Tuple[Dict[int, List[Dict[str, Any]]],
                                                                         Dict[int, List[str]]]:
        """
        Map file-level entities back to their respective chunks

        Args:
            declared_entities: List of declared entities from file-level extraction;
                each is read through its "name" and "type" keys
            called_entities: List of called entities from file-level extraction
            chunks: List of code chunks
            file_name: Name of the file to detect language (optional)

        Returns:
            Tuple of (chunk_declared_entities, chunk_called_entities)
            - chunk_declared_entities: Dict mapping chunk_index -> list of declared entities
            - chunk_called_entities: Dict mapping chunk_index -> list of called entities
            Every chunk index is present in both dicts, possibly with an empty list.
        """
        chunk_declared = {index: [] for index in range(len(chunks))}
        chunk_called = {index: [] for index in range(len(chunks))}

        for entity in declared_entities:
            entity_name = entity.get("name") or ""
            # A missing/empty type means "unspecified", which the search
            # expresses as None (generic identifier search).
            entity_type = entity.get("type") or None
            for chunk_idx in self.find_entity_in_chunks(entity_name, chunks, entity_type, file_name):
                chunk_declared[chunk_idx].append(entity)

        # dict.fromkeys drops repeated names while keeping first-seen order,
        # so each name is searched once and listed once per chunk.
        for called_entity in dict.fromkeys(called_entities):
            for chunk_idx in self.find_entity_in_chunks(called_entity, chunks, None, file_name):
                chunk_called[chunk_idx].append(called_entity)

        return chunk_declared, chunk_called
|
|
|