File size: 21,066 Bytes
a100cc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
import logging
import re
from typing import List, Tuple, Dict, Any, Set, Optional
from enum import Enum


class Language(Enum):
    """Programming languages the entity-to-chunk mapper can scan.

    Each member's value is the lowercase language name as a string.
    """
    PYTHON = "python"
    C = "c"
    CPP = "cpp"
    JAVA = "java"


class EntityChunkMapper:
    """Maps entities from file-level extraction back to their respective chunks"""

    def __init__(self):
        self.logger = logging.getLogger("ENTITY_CHUNK_MAPPER")
        self.extension_to_language = {
            'py': Language.PYTHON,
            'pyw': Language.PYTHON,
            'c': Language.C,
            'h': Language.C,
            'cpp': Language.CPP,
            'cc': Language.CPP,
            'cxx': Language.CPP,
            'hpp': Language.CPP,
            'hh': Language.CPP,
            'hxx': Language.CPP,
            'java': Language.JAVA,
        }

    def _detect_language(self, file_name: Optional[str] = None) -> Language:
        """
        Detect the programming language from file extension

        Args:
            file_name: Name of the file (optional)

        Returns:
            Language enum value, defaults to PYTHON if not detected
        """
        if file_name:
            extension = file_name.split('.')[-1].lower()
            return self.extension_to_language.get(extension, Language.PYTHON)
        return Language.PYTHON

    def _is_comment_or_docstring(self, line: str, in_docstring: bool, language: Language) -> Tuple[bool, bool]:
        """
        Check if a line is a comment or part of a docstring/multi-line comment

        Args:
            line: The line to check
            in_docstring: Whether we're currently inside a docstring/multi-line comment
            language: The programming language

        Returns:
            Tuple of (is_comment_or_docstring, new_in_docstring_state)
        """
        stripped = line.strip()

        if language == Language.PYTHON:
            # Check for single-line comments
            if stripped.startswith('#'):
                return True, in_docstring

            # Check for docstring delimiters (""" or ''')
            triple_double = '"""'
            triple_single = "'''"

            # Count occurrences of triple quotes
            if triple_double in stripped or triple_single in stripped:
                # Check if it's a single-line docstring
                if (stripped.count(triple_double) >= 2 or
                    stripped.count(triple_single) >= 2):
                    # Single-line docstring
                    return True, in_docstring
                else:
                    # Toggle docstring state
                    return True, not in_docstring

            # If we're in a docstring, this line is part of it
            if in_docstring:
                return True, in_docstring

        elif language in [Language.C, Language.CPP, Language.JAVA]:
            # Check for single-line comments
            if stripped.startswith('//'):
                return True, in_docstring

            # Check for multi-line comment delimiters /* */
            if '/*' in line and '*/' in line:
                # Single-line multi-line comment
                return True, in_docstring
            elif '/*' in line:
                # Start of multi-line comment
                return True, True
            elif '*/' in line:
                # End of multi-line comment
                return True, False

            # If we're in a multi-line comment
            if in_docstring:
                return True, in_docstring

        return False, in_docstring

    def _get_code_lines(self, chunk_lines: List[str], language: Language) -> List[str]:
        """
        Filter out comments and docstrings from chunk lines

        Args:
            chunk_lines: List of lines in the chunk
            language: The programming language

        Returns:
            List of lines that are actual code (not comments or docstrings)
        """
        code_lines = []
        in_docstring = False

        for line in chunk_lines:
            is_doc, in_docstring = self._is_comment_or_docstring(line, in_docstring, language)
            if not is_doc:
                code_lines.append(line)

        return code_lines

    def _is_valid_identifier_match(self, text: str, identifier: str, position: int) -> bool:
        """
        Check if an identifier match at a position is valid (not part of another word)

        Args:
            text: The text containing the identifier
            identifier: The identifier to check
            position: The position where the identifier was found

        Returns:
            True if this is a valid standalone identifier match
        """
        # Check character before (if exists)
        if position > 0:
            char_before = text[position - 1]
            if char_before.isalnum() or char_before == '_':
                return False

        # Check character after (if exists)
        end_pos = position + len(identifier)
        if end_pos < len(text):
            char_after = text[end_pos]
            if char_after.isalnum() or char_after == '_':
                return False

        return True

    def _contains_identifier(self, line: str, identifier: str) -> bool:
        """
        Check if a line contains an identifier as a standalone word (not part of another word)

        Args:
            line: The line to check
            identifier: The identifier to find

        Returns:
            True if the identifier appears as a standalone word
        """
        # Use word boundary regex for precise matching
        pattern = r'\b' + re.escape(identifier) + r'\b'
        return bool(re.search(pattern, line))


    def find_entity_in_chunks(self, entity_name: str, chunks: List[str], entity_type: str = None,
                            file_name: Optional[str] = None) -> Set[int]:
        """
        Find which chunks contain a specific entity declaration or call

        Args:
            entity_name: Name of the entity to find
            chunks: List of code chunks
            entity_type: Type of entity (class, function, method, variable)
            file_name: Name of the file to detect language (optional)

        Returns:
            Set of chunk indices that contain this entity
        """
        matching_chunks = set()
        language = self._detect_language(file_name)

        # Split the entity name to handle nested entities like "ClassName.method"
        # For Java/C++, also handle :: separator
        if '::' in entity_name:
            parts = entity_name.split('::')
        else:
            parts = entity_name.split('.')
        base_name = parts[-1]  # The actual identifier

        for chunk_idx, chunk in enumerate(chunks):
            chunk_lines = chunk.strip().split('\n')

            # Look for different patterns based on entity type
            if self._entity_appears_in_chunk(entity_name, base_name, chunk, chunk_lines, entity_type, language):
                matching_chunks.add(chunk_idx)

        return matching_chunks

    def _entity_appears_in_chunk(self, full_name: str, base_name: str, chunk: str, chunk_lines: List[str],
                                 entity_type: str, language: Language) -> bool:
        """Check if an entity appears in a specific chunk (excluding comments and docstrings)"""

        # Filter out comments and docstrings
        code_lines = self._get_code_lines(chunk_lines, language)

        # If no code lines remain, entity doesn't appear in actual code
        if not code_lines:
            return False

        # Language-specific entity matching
        if language == Language.PYTHON:
            return self._entity_appears_in_python(full_name, base_name, code_lines, entity_type)
        elif language in [Language.C, Language.CPP]:
            return self._entity_appears_in_c_cpp(full_name, base_name, code_lines, entity_type)
        elif language == Language.JAVA:
            return self._entity_appears_in_java(full_name, base_name, code_lines, entity_type)

        return False

    def _entity_appears_in_python(self, full_name: str, base_name: str, code_lines: List[str],
                                  entity_type: str) -> bool:
        """Check if entity appears in Python code"""

        if entity_type == "class":
            # Look for class definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'class\s+{re.escape(base_name)}[\s:(]', stripped):
                    return True

        elif entity_type == "api_endpoint":
            # Look for API endpoint definition - the function decorated with @app.get, @app.post, etc.
            # We look for the function definition itself
            for line in code_lines:
                stripped = line.strip()
                # Match the function definition with the endpoint name
                if re.match(rf'(async\s+)?def\s+{re.escape(base_name)}\s*\(', stripped):
                    return True
                # Also check for decorators that might reference the endpoint
                if re.search(rf'@\w+\.(get|post|put|delete|patch|options|head)\s*\(', stripped):
                    return True

        elif entity_type == "function":
            # Look for function definition (not method)
            for line in code_lines:
                stripped = line.strip()
                # Check it's not indented (not a method)
                if not line.startswith("    ") and not line.startswith("\t"):
                    if re.match(rf'(async\s+)?def\s+{re.escape(base_name)}\s*\(', stripped):
                        return True

        elif entity_type == "method":
            # Look for method definition (indented def)
            method_name = full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                # Check it's indented (is a method)
                if line.startswith("    ") or line.startswith("\t"):
                    if re.match(rf'(async\s+)?def\s+{re.escape(method_name)}\s*\(', stripped):
                        return True

        elif entity_type == "variable":
            # Look for variable assignment or usage
            if "." in full_name:
                parts = full_name.split('.')
                attr_name = parts[-1]
                for line in code_lines:
                    if re.search(rf'\.\s*{re.escape(attr_name)}\b', line):
                        return True
            else:
                for line in code_lines:
                    stripped = line.strip()
                    if re.match(rf'{re.escape(base_name)}\s*[=:]', stripped):
                        return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        # General usage as identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    return True

        return False

    def _extract_using_namespace_directives(self, code_lines: List[str]) -> List[str]:
        """
        Extract using namespace directives from C++ code.
        Returns a list of namespace names that are being imported.
        """
        namespaces = []
        for line in code_lines:
            stripped = line.strip()
            # Match "using namespace <name>;"
            match = re.match(r'using\s+namespace\s+([a-zA-Z_][a-zA-Z0-9_:]*)\s*;', stripped)
            if match:
                namespaces.append(match.group(1))
        return namespaces

    def _entity_appears_in_c_cpp(self, full_name: str, base_name: str, code_lines: List[str],
                                 entity_type: str) -> bool:
        """Check if entity appears in C/C++ code"""

        # Extract using namespace directives
        using_namespaces = self._extract_using_namespace_directives(code_lines)
        
        # Check if the full_name matches any imported namespace + base_name
        # e.g., if full_name is "math::Calculator" and we have "using namespace math",
        # then "Calculator" in code should match
        namespace_match = False
        if '::' in full_name:
            for ns in using_namespaces:
                # Check if full_name starts with this namespace
                if full_name.startswith(ns + '::'):
                    namespace_match = True
                    break

        if entity_type == "class":
            # Look for class/struct definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'(class|struct)\s+{re.escape(base_name)}[\s:{{]', stripped):
                    return True

        elif entity_type == "function":
            # Look for function definition or declaration
            for line in code_lines:
                stripped = line.strip()
                # Match function patterns: return_type function_name(
                # Also handle constructors and destructors
                if (re.search(rf'\b{re.escape(base_name)}\s*\(', stripped) and
                    not stripped.startswith('//')):
                    # Additional check: likely a function if followed by parameters
                    return True

        elif entity_type == "method":
            # Look for method definition (with class scope)
            method_name = full_name.split('::')[-1] if '::' in full_name else full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                # Match ClassName::methodName( or just methodName( inside class
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True

        elif entity_type == "variable":
            # Look for variable declaration or usage
            for line in code_lines:
                stripped = line.strip()
                # Match variable declarations and assignments
                if re.search(rf'\b{re.escape(base_name)}\b', stripped):
                    return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            # Look for instantiation or usage
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\b', line):
                    # If we found base_name and there's a namespace match, this is a match
                    if namespace_match:
                        return True
                    # If full_name doesn't have a namespace, it's a direct match
                    if '::' not in full_name:
                        return True

        # General usage as identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    # If we found base_name and there's a namespace match, this is a match
                    if namespace_match:
                        return True
                    # If full_name doesn't have a namespace, it's a direct match
                    if '::' not in full_name:
                        return True

        return False

    def _entity_appears_in_java(self, full_name: str, base_name: str, code_lines: List[str],
                                entity_type: str) -> bool:
        """Check if entity appears in Java code"""

        if entity_type == "class":
            # Look for class/interface/enum definition
            for line in code_lines:
                stripped = line.strip()
                if re.match(rf'(public|private|protected)?\s*(class|interface|enum)\s+{re.escape(base_name)}[\s<{{]', stripped):
                    return True
                # Without modifier
                if re.match(rf'(class|interface|enum)\s+{re.escape(base_name)}[\s<{{]', stripped):
                    return True

        elif entity_type == "api_endpoint":
            # Look for API endpoint definition - the method with Spring annotations
            # Extract just the method name from the full qualified name (e.g., "com.example.Controller::method" -> "method")
            method_name = base_name.split('::')[-1] if '::' in base_name else base_name
            for line in code_lines:
                stripped = line.strip()
                # Match the method definition
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True
                # Also check for Spring annotations
                if re.search(r'@(GetMapping|PostMapping|PutMapping|DeleteMapping|PatchMapping|RequestMapping)', stripped):
                    return True

        elif entity_type == "function":
            # In Java, functions are methods
            for line in code_lines:
                stripped = line.strip()
                # Match method signature patterns
                if re.search(rf'\b{re.escape(base_name)}\s*\(', stripped):
                    return True

        elif entity_type == "method":
            # Look for method definition
            method_name = full_name.split('.')[-1]
            for line in code_lines:
                stripped = line.strip()
                if re.search(rf'\b{re.escape(method_name)}\s*\(', stripped):
                    return True

        elif entity_type == "variable":
            # Look for variable declaration or usage
            for line in code_lines:
                stripped = line.strip()
                if re.search(rf'\b{re.escape(base_name)}\b', stripped):
                    return True

        # For called entities, look for usage patterns
        if entity_type in ["function", "method"] or entity_type is None:
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\s*\(', line):
                    return True

        if entity_type == "class" or entity_type is None:
            # Look for instantiation (new ClassName) or usage
            for line in code_lines:
                if re.search(rf'\b{re.escape(base_name)}\b', line):
                    return True

        # General usage as identifier
        if entity_type is None or entity_type == "variable":
            for line in code_lines:
                if self._contains_identifier(line, base_name):
                    return True

        return False

    def map_entities_to_chunks(self, declared_entities: List[Dict[str, Any]],
                               called_entities: List[str],
                               chunks: List[str],
                               file_name: Optional[str] = None) -> Tuple[Dict[int, List[Dict[str, Any]]],
    Dict[int, List[str]]]:
        """
        Map file-level entities back to their respective chunks

        Args:
            declared_entities: List of declared entities from file-level extraction
            called_entities: List of called entities from file-level extraction
            chunks: List of code chunks
            file_name: Name of the file to detect language (optional)

        Returns:
            Tuple of (chunk_declared_entities, chunk_called_entities)
            - chunk_declared_entities: Dict mapping chunk_index -> list of declared entities
            - chunk_called_entities: Dict mapping chunk_index -> list of called entities
        """
        chunk_declared = {}
        chunk_called = {}

        # Initialize empty lists for all chunks
        for i in range(len(chunks)):
            chunk_declared[i] = []
            chunk_called[i] = []

        # Map declared entities to chunks
        for entity in declared_entities:
            entity_name = entity.get("name", "")
            entity_type = entity.get("type", "")

            matching_chunks = self.find_entity_in_chunks(entity_name, chunks, entity_type, file_name)

            # Add entity to matching chunks
            for chunk_idx in matching_chunks:
                chunk_declared[chunk_idx].append(entity)

        # Map called entities to chunks
        for called_entity in called_entities:
            matching_chunks = self.find_entity_in_chunks(called_entity, chunks, None, file_name)

            # Add called entity to matching chunks
            for chunk_idx in matching_chunks:
                if called_entity not in chunk_called[chunk_idx]:
                    chunk_called[chunk_idx].append(called_entity)

        return chunk_declared, chunk_called