Spaces:
Configuration error
Configuration error
| """ | |
| Utilities for chunking large code content for AI evaluation | |
| """ | |
def chunk_text(text, max_chunk_size=3000, overlap=200):
    """
    Split text into overlapping chunks.

    Chunks are cut at roughly max_chunk_size characters; when possible the
    cut point is moved back to a nearby newline so lines are not split.

    Args:
        text (str): Text to chunk
        max_chunk_size (int): Maximum characters per chunk
        overlap (int): Number of characters to overlap between chunks

    Returns:
        list: List of non-empty, stripped text chunks
    """
    if len(text) <= max_chunk_size:
        return [text]

    # Clamp overlap into [0, max_chunk_size - 1]. Previously an overlap
    # >= max_chunk_size made `start = end - overlap` stall or move
    # backwards, causing an infinite loop.
    overlap = max(0, min(overlap, max_chunk_size - 1))

    chunks = []
    start = 0
    while start < len(text):
        # Tentative end position for this chunk
        end = start + max_chunk_size
        # If this is not the last chunk, try to break at a natural boundary:
        # scan backwards a short distance looking for a newline.
        if end < len(text):
            lookback = min(100, max_chunk_size // 10)
            for i in range(lookback):
                if text[end - i] == '\n':
                    end = end - i + 1  # include the newline in this chunk
                    break
        # Extract chunk (skip whitespace-only chunks)
        chunk = text[start:end].strip()
        if chunk:
            chunks.append(chunk)
        # Stop once the text is exhausted
        if end >= len(text):
            break
        # Move start back by `overlap`, but always advance at least one
        # character so the loop is guaranteed to terminate.
        start = max(end - overlap, start + 1)
    return chunks
def chunk_code_content(code_content, max_chunk_size=3000):
    """
    Intelligently chunk code content, trying to preserve file boundaries.

    Content is first split on file-separator lines ('===', '---', or lines
    containing 'File:'); sections that still exceed max_chunk_size are then
    split further via chunk_text().

    Args:
        code_content (str): Code content to chunk
        max_chunk_size (int): Maximum characters per chunk

    Returns:
        list: List of dicts with 'content', 'chunk_id', 'total_chunks',
            and 'size' keys
    """
    if len(code_content) <= max_chunk_size:
        return [{
            'content': code_content,
            'chunk_id': 1,
            'total_chunks': 1,
            'size': len(code_content)
        }]

    # Split by files first (if multiple files are concatenated)
    file_sections = []
    current_section = ""
    lines = code_content.split('\n')
    for line in lines:
        # Look for file separators or headers
        if line.startswith('===') or line.startswith('---') or 'File:' in line:
            if current_section.strip():
                file_sections.append(current_section.strip())
            current_section = line + '\n'
        else:
            current_section += line + '\n'
    # Add the last section
    if current_section.strip():
        file_sections.append(current_section.strip())

    # If no file sections found, treat as single content
    if len(file_sections) <= 1:
        file_sections = [code_content]

    # Chunk each file section
    all_chunks = []
    chunk_counter = 1
    for section in file_sections:
        if len(section) <= max_chunk_size:
            all_chunks.append({
                'content': section,
                'chunk_id': chunk_counter,
                'size': len(section)
            })
            chunk_counter += 1
        else:
            # Split large sections into smaller chunks.
            # NOTE: the loop variable must NOT be named `chunk_text` —
            # that shadowed the module-level function and made any later
            # oversized section raise TypeError.
            text_chunks = chunk_text(section, max_chunk_size, overlap=300)
            for piece in text_chunks:
                all_chunks.append({
                    'content': piece,
                    'chunk_id': chunk_counter,
                    'size': len(piece)
                })
                chunk_counter += 1

    # Add total_chunks to all chunks
    total_chunks = len(all_chunks)
    for chunk in all_chunks:
        chunk['total_chunks'] = total_chunks
    return all_chunks
def create_chunk_summary(chunks):
    """
    Create a summary of all chunks for context.

    Args:
        chunks (list): List of chunk dictionaries (each with 'content'
            and 'size' keys)

    Returns:
        str: Summary of chunks
    """
    # Guard against an empty list: the average below would otherwise
    # raise ZeroDivisionError.
    if not chunks:
        return "Code Analysis Summary: no chunks available.\n"

    total_size = sum(chunk['size'] for chunk in chunks)
    summary = f"""
Code Analysis Summary:
- Total chunks: {len(chunks)}
- Total content size: {total_size:,} characters
- Average chunk size: {total_size // len(chunks):,} characters
Chunk breakdown:
"""
    for i, chunk in enumerate(chunks, 1):
        # First 100 chars of each chunk, flattened to one line
        preview = chunk['content'][:100].replace('\n', ' ')
        summary += f" Chunk {i}: {chunk['size']:,} chars - {preview}...\n"
    return summary
def combine_chunk_evaluations(chunk_results):
    """
    Combine evaluation results from multiple chunks.

    Scores are averaged using each result's optional 'chunk_weight'
    (defaulting to 1); feedback strings are concatenated.

    Args:
        chunk_results (list): List of evaluation results from each chunk

    Returns:
        dict: Combined evaluation result with rounded per-category scores,
            an 'overall_score', combined 'feedback', and 'detailed_scores'
    """
    # No results: return neutral defaults rather than failing downstream
    if not chunk_results:
        return {
            'relevance_score': 5.0,
            'technical_complexity_score': 5.0,
            'creativity_score': 5.0,
            'documentation_score': 5.0,
            'productivity_score': 5.0,
            'overall_score': 5.0,
            'feedback': 'No evaluation results to combine.',
            'detailed_scores': '{}'
        }
    # Single result: nothing to combine
    if len(chunk_results) == 1:
        return chunk_results[0]

    # Calculate weighted averages based on chunk sizes
    total_weight = sum(result.get('chunk_weight', 1) for result in chunk_results)
    combined_scores = {
        'relevance_score': 0,
        'technical_complexity_score': 0,
        'creativity_score': 0,
        'documentation_score': 0,
        'productivity_score': 0
    }
    feedbacks = []
    for result in chunk_results:
        # If all supplied weights were 0, fall back to equal weighting
        # instead of raising ZeroDivisionError.
        if total_weight > 0:
            weight = result.get('chunk_weight', 1) / total_weight
        else:
            weight = 1 / len(chunk_results)
        for score_key in combined_scores:
            # Missing scores default to the neutral midpoint 5.0
            combined_scores[score_key] += result.get(score_key, 5.0) * weight
        if result.get('feedback'):
            feedbacks.append(f"Chunk {result.get('chunk_id', '?')}: {result['feedback']}")

    # Calculate overall score as the mean of the category averages
    overall_score = sum(combined_scores.values()) / len(combined_scores)

    # Combine feedback
    combined_feedback = f"""
Multi-chunk evaluation completed ({len(chunk_results)} chunks analyzed):
""" + "\n\n".join(feedbacks)

    return {
        'relevance_score': round(combined_scores['relevance_score'], 1),
        'technical_complexity_score': round(combined_scores['technical_complexity_score'], 1),
        'creativity_score': round(combined_scores['creativity_score'], 1),
        'documentation_score': round(combined_scores['documentation_score'], 1),
        'productivity_score': round(combined_scores['productivity_score'], 1),
        'overall_score': round(overall_score, 1),
        'feedback': combined_feedback,
        'detailed_scores': '{"note": "Combined from multiple chunks"}'
    }