File size: 6,581 Bytes
d122c3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
"""
Utilities for chunking large code content for AI evaluation
"""

def chunk_text(text, max_chunk_size=3000, overlap=200):
    """
    Split text into overlapping chunks.

    Args:
        text (str): Text to chunk
        max_chunk_size (int): Maximum characters per chunk
        overlap (int): Number of characters to overlap between chunks

    Returns:
        list: List of text chunks (each chunk is stripped; empty chunks
        are dropped)

    Raises:
        ValueError: If overlap >= max_chunk_size, which would prevent the
        start position from ever advancing (infinite loop in the original).
    """
    if len(text) <= max_chunk_size:
        return [text]

    if overlap >= max_chunk_size:
        # With overlap >= max_chunk_size, `end - overlap` never moves past
        # `start`, so the loop would emit the same chunk forever.
        raise ValueError("overlap must be smaller than max_chunk_size")

    chunks = []
    start = 0

    while start < len(text):
        # Tentative end position for this chunk
        end = start + max_chunk_size

        # If this is not the last chunk, try to break at a natural boundary.
        if end < len(text):
            # Look back up to 100 chars for a newline. Start at i=1 so we
            # only inspect characters *inside* the chunk (i=0 would read
            # text[end], the first character of the next chunk).
            for i in range(1, min(100, max_chunk_size // 10) + 1):
                if text[end - i] == '\n':
                    end = end - i + 1  # Include the newline
                    break

        # Extract chunk; skip whitespace-only chunks
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)

        # Move start position (with overlap); jump to `end` on the final
        # chunk so the while condition terminates the loop.
        start = end - overlap if end < len(text) else end

    return chunks

def chunk_code_content(code_content, max_chunk_size=3000):
    """
    Intelligently chunk code content, trying to preserve file boundaries.

    Content is first split on file-separator lines ('===', '---', or any
    line containing 'File:'); sections that still exceed max_chunk_size
    are further split with chunk_text().

    Args:
        code_content (str): Code content to chunk
        max_chunk_size (int): Maximum characters per chunk

    Returns:
        list: List of chunk dicts with keys 'content', 'chunk_id',
        'total_chunks', and 'size'
    """
    if len(code_content) <= max_chunk_size:
        return [{
            'content': code_content,
            'chunk_id': 1,
            'total_chunks': 1,
            'size': len(code_content)
        }]

    # Split by files first (if multiple files are concatenated)
    file_sections = []
    current_section = ""

    for line in code_content.split('\n'):
        # Look for file separators or headers
        if line.startswith('===') or line.startswith('---') or 'File:' in line:
            if current_section.strip():
                file_sections.append(current_section.strip())
            current_section = line + '\n'
        else:
            current_section += line + '\n'

    # Add the last section
    if current_section.strip():
        file_sections.append(current_section.strip())

    # If no file sections found, treat as single content
    if len(file_sections) <= 1:
        file_sections = [code_content]

    # Chunk each file section
    all_chunks = []
    chunk_counter = 1

    for section in file_sections:
        if len(section) <= max_chunk_size:
            all_chunks.append({
                'content': section,
                'chunk_id': chunk_counter,
                'size': len(section)
            })
            chunk_counter += 1
        else:
            # Split large sections into smaller chunks.
            # BUG FIX: the loop variable was previously named `chunk_text`,
            # shadowing the module-level chunk_text() function and raising
            # UnboundLocalError on the call above whenever this branch ran.
            for piece in chunk_text(section, max_chunk_size, overlap=300):
                all_chunks.append({
                    'content': piece,
                    'chunk_id': chunk_counter,
                    'size': len(piece)
                })
                chunk_counter += 1

    # Add total_chunks to all chunks (known only after all splitting is done)
    total_chunks = len(all_chunks)
    for chunk in all_chunks:
        chunk['total_chunks'] = total_chunks

    return all_chunks

def create_chunk_summary(chunks):
    """
    Create a summary of all chunks for context.

    Args:
        chunks (list): List of chunk dictionaries (each must have
        'size' and 'content' keys)

    Returns:
        str: Human-readable summary of chunks
    """
    # Guard the empty case: the average below divides by len(chunks),
    # which would raise ZeroDivisionError on an empty list.
    if not chunks:
        return "Code Analysis Summary: no chunks to summarize."

    total_size = sum(chunk['size'] for chunk in chunks)

    summary = f"""
Code Analysis Summary:
- Total chunks: {len(chunks)}
- Total content size: {total_size:,} characters
- Average chunk size: {total_size // len(chunks):,} characters

Chunk breakdown:
"""

    # Join the per-chunk lines instead of += concatenation in a loop
    lines = []
    for i, chunk in enumerate(chunks, 1):
        preview = chunk['content'][:100].replace('\n', ' ')
        lines.append(f"  Chunk {i}: {chunk['size']:,} chars - {preview}...\n")

    return summary + "".join(lines)

def combine_chunk_evaluations(chunk_results):
    """
    Combine evaluation results from multiple chunks.

    Args:
        chunk_results (list): List of evaluation results from each chunk

    Returns:
        dict: Combined evaluation result with per-dimension scores, an
        overall score, merged feedback, and a detailed_scores placeholder
    """
    score_keys = (
        'relevance_score',
        'technical_complexity_score',
        'creativity_score',
        'documentation_score',
        'productivity_score',
    )

    # Nothing to combine: return a neutral placeholder evaluation.
    if not chunk_results:
        neutral = {key: 5.0 for key in score_keys}
        neutral['overall_score'] = 5.0
        neutral['feedback'] = 'No evaluation results to combine.'
        neutral['detailed_scores'] = '{}'
        return neutral

    # A single result needs no combining.
    if len(chunk_results) == 1:
        return chunk_results[0]

    # Each chunk contributes proportionally to its declared weight
    # ('chunk_weight', defaulting to 1); missing scores default to 5.0.
    total_weight = sum(r.get('chunk_weight', 1) for r in chunk_results)

    averaged = {
        key: sum(
            r.get(key, 5.0) * (r.get('chunk_weight', 1) / total_weight)
            for r in chunk_results
        )
        for key in score_keys
    }

    notes = [
        f"Chunk {r.get('chunk_id', '?')}: {r['feedback']}"
        for r in chunk_results
        if r.get('feedback')
    ]

    # Overall score is the unweighted mean of the five combined dimensions.
    overall = sum(averaged.values()) / len(averaged)

    combined = {key: round(averaged[key], 1) for key in score_keys}
    combined['overall_score'] = round(overall, 1)
    combined['feedback'] = f"""
Multi-chunk evaluation completed ({len(chunk_results)} chunks analyzed):

""" + "\n\n".join(notes)
    combined['detailed_scores'] = '{"note": "Combined from multiple chunks"}'
    return combined