Upload 6 files

Browse files

Files changed (6) hide show

coding_expert/config/config.py +169 -0
coding_expert/config/requirements.txt +12 -0
coding_expert/data/prepare_data.py +221 -0
coding_expert/expert.py +21 -0
coding_expert/tasks/validation.py +245 -0
coding_expert/utils/data_processor.py +253 -0

coding_expert/config/config.py ADDED Viewed

	@@ -0,0 +1,169 @@

+"""
+Configuration for the Coding Expert model
+"""
+# Core programming domains.
+# Top-level keys are domain categories:
+#   "programming_languages": language name -> {"level", "focus"} where "focus"
+#       is a list of focus areas;
+#   "frameworks": framework group -> {framework name: level};
+#   "tools": tool category -> list of tool names.
+# print_config_summary() reports len() of this dict and of its
+# "programming_languages" entry, so that key must stay present.
+CODING_DOMAINS = {
+    "programming_languages": {
+        "python": {
+            "level": "expert",
+            "focus": ["data structures", "algorithms", "web development", "machine learning"]
+        },
+        "javascript": {
+            "level": "expert",
+            "focus": ["frontend", "backend", "frameworks", "performance"]
+        },
+        "java": {
+            "level": "expert",
+            "focus": ["enterprise", "concurrency", "frameworks", "design patterns"]
+        },
+        "c++": {
+            "level": "expert",
+            "focus": ["systems", "performance", "templates", "memory management"]
+        },
+        "go": {
+            "level": "expert",
+            "focus": ["concurrency", "networking", "performance", "cloud"]
+        }
+    },
+    "frameworks": {
+        "web": {
+            "django": "expert",
+            "flask": "expert",
+            "fastapi": "expert",
+            "react": "expert",
+            "vue": "expert",
+            "angular": "expert"
+        },
+        "mobile": {
+            "flutter": "expert",
+            "react_native": "expert",
+            "swift": "expert",
+            "kotlin": "expert"
+        },
+        "cloud": {
+            "aws": "expert",
+            "gcp": "expert",
+            "azure": "expert",
+            "kubernetes": "expert"
+        }
+    },
+    "tools": {
+        "ci_cd": ["github_actions", "jenkins", "circleci", "gitlab_ci"],
+        "version_control": ["git", "mercurial"],
+        "package_management": ["pip", "npm", "maven", "gradle", "cargo"],
+        "ide": ["vscode", "pycharm", "intellij", "vim", "emacs"]
+    }
+}
+# Core coding tasks.
+# Maps a task name to {"level", "subtasks"}; "subtasks" is a list of subtask
+# identifiers. Within this file only the number of entries is read (by
+# print_config_summary()).
+CODING_TASKS = {
+    "problem_solving": {
+        "level": "expert",
+        "subtasks": [
+            "algorithm_design",
+            "data_structure_selection",
+            "complexity_analysis",
+            "optimization"
+        ]
+    },
+    "code_review": {
+        "level": "expert",
+        "subtasks": [
+            "architecture_review",
+            "security_review",
+            "performance_review",
+            "code_style_review"
+        ]
+    },
+    "debugging": {
+        "level": "expert",
+        "subtasks": [
+            "memory_leaks",
+            "race_conditions",
+            "performance_bottlenecks",
+            "concurrency_issues"
+        ]
+    },
+    "testing": {
+        "level": "expert",
+        "subtasks": [
+            "unit_testing",
+            "integration_testing",
+            "performance_testing",
+            "security_testing"
+        ]
+    },
+    "architecture_design": {
+        "level": "expert",
+        "subtasks": [
+            "microservices",
+            "distributed_systems",
+            "scalability",
+            "fault_tolerance"
+        ]
+    }
+}
+# Core datasets.
+# Maps a dataset name to its "source" identifier, the "split" to use, the
+# record "fields", a human-readable "description" and the "tasks" it serves.
+# print_config_summary() prints "description", "tasks" and "fields" for every
+# entry, so each entry must define all three.
+# NOTE(review): the same source/split/fields values are repeated in
+# CodeDataPreparer.__init__ (data/prepare_data.py); keep both copies in sync.
+# NOTE(review): the "source" identifiers are not validated anywhere in this
+# file -- confirm they resolve before relying on them.
+CODING_DATASETS = {
+    "CodeSearchNet": {
+        "source": "codeium/codeium",
+        "split": "train",
+        "fields": ["code", "docstring", "language", "function_name"],
+        "description": "HuggingFace - multi-language code corpus",
+        "tasks": ["code_search", "code_completion", "documentation"]
+    },
+    "HumanEval": {
+        "source": "openai/human_eval",
+        "split": "test",
+        "fields": ["task_id", "prompt", "canonical_solution", "test", "entry_point"],
+        "description": "OpenAI's functional code evaluation dataset",
+        "tasks": ["code_generation", "function_implementation", "unit_testing"]
+    },
+    "MBPP": {
+        "source": "mbpp/mbpp",
+        "split": "train",
+        "fields": ["task_id", "text", "code", "test_list", "challenge_test_list"],
+        "description": "Mostly Basic Python Problems",
+        "tasks": ["problem_solving", "code_generation", "unit_testing"]
+    },
+    "Spider": {
+        "source": "yale-lily/spider",
+        "split": "train",
+        "fields": ["query", "question", "db_id", "sql"],
+        "description": "Text-to-SQL mapping",
+        "tasks": ["sql_generation", "text_to_sql", "database_queries"]
+    },
+    "DeepFix": {
+        "source": "deepfix/deepfix",
+        "split": "train",
+        "fields": ["code", "fixed_code", "error_type"],
+        "description": "Bug fixing dataset",
+        "tasks": ["bug_fixing", "error_detection", "code_correction"]
+    },
+    "CodeXGLUE": {
+        "source": "microsoft/CodeXGLUE",
+        "split": "train",
+        "fields": ["code", "docstring", "task", "language"],
+        "description": "Multitask code understanding/generation benchmark",
+        "tasks": ["code_translation", "code_summarization", "code_generation"]
+    }
+}
+# Print configuration summary
+def print_config_summary():
+    """Print the size of each configuration table, then one section per dataset.
+
+    Reads the module-level CODING_DOMAINS, CODING_TASKS and CODING_DATASETS
+    tables and writes to stdout; nothing is returned.
+    """
+    domain_total = len(CODING_DOMAINS)
+    language_total = len(CODING_DOMAINS['programming_languages'])
+    task_total = len(CODING_TASKS)
+    dataset_total = len(CODING_DATASETS)
+    print("\nCoding Expert Configuration Summary:")
+    print(f"Number of domains: {domain_total}")
+    print(f"Number of languages: {language_total}")
+    print(f"Number of tasks: {task_total}")
+    print(f"Number of datasets: {dataset_total}")
+    print("\nDataset Details:")
+    for dataset_name, spec in CODING_DATASETS.items():
+        task_names = ', '.join(spec['tasks'])
+        field_names = ', '.join(spec['fields'])
+        print(f"\n{dataset_name}:")
+        print(f"Description: {spec['description']}")
+        print(f"Tasks: {task_names}")
+        print(f"Fields: {field_names}")
+if __name__ == "__main__":
+    print_config_summary()

coding_expert/config/requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+transformers>=4.30.0
+sympy>=1.11.1
+torch>=2.0.0
+numpy>=1.24.0
+scipy>=1.10.0
+pandas>=2.0.0
+huggingface_hub>=0.16.0
+jsonlines>=3.0.0
+pyyaml>=5.4.1
+datasets>=2.14.0
+psutil>=5.9.0
+astroid>=2.16.0
+# tasks/validation.py shells out to the `pylint` executable.
+pylint>=3.0.0
+# utils/data_processor.py imports tqdm directly.
+tqdm>=4.65.0

coding_expert/data/prepare_data.py ADDED Viewed

	@@ -0,0 +1,221 @@

+"""
+Data preparation script for the Coding Expert model
+"""
+import os
+import json
+from pathlib import Path
+import jsonlines
+from typing import Dict, List, Any
+import sys
+import psutil
+from datasets import load_dataset
+import ast
+import numpy as np
+from data_processor import CodeDataProcessor
+class CodeDataPreparer:
+    def __init__(self, output_dir: str = "processed_data"):
+        """Create the output directory and register the datasets to prepare.
+
+        Args:
+            output_dir: Directory that receives the processed JSONL files. It is
+                created together with any missing parent directories.
+        """
+        self.output_dir = Path(output_dir)
+        # parents=True so a nested path such as "runs/2024/processed" works too.
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        # Dataset name -> source identifier, split to load and expected fields.
+        # main() passes "source" and "split" to load_dataset().
+        self.datasets = {
+            "CodeSearchNet": {
+                "source": "codeium/codeium",
+                "split": "train",
+                "fields": ["code", "docstring", "language", "function_name"]
+            },
+            "HumanEval": {
+                "source": "openai/human_eval",
+                "split": "test",
+                "fields": ["task_id", "prompt", "canonical_solution", "test", "entry_point"]
+            },
+            "MBPP": {
+                "source": "mbpp/mbpp",
+                "split": "train",
+                "fields": ["task_id", "text", "code", "test_list", "challenge_test_list"]
+            },
+            "Spider": {
+                "source": "yale-lily/spider",
+                "split": "train",
+                "fields": ["query", "question", "db_id", "sql"]
+            },
+            "DeepFix": {
+                "source": "deepfix/deepfix",
+                "split": "train",
+                "fields": ["code", "fixed_code", "error_type"]
+            },
+            "CodeXGLUE": {
+                "source": "microsoft/CodeXGLUE",
+                "split": "train",
+                "fields": ["code", "docstring", "task", "language"]
+            }
+        }
+    def process_dataset(self, dataset: List[Dict[str, Any]], dataset_name: str) -> List[Dict[str, Any]]:
+        """Convert every example of ``dataset``, dropping the ones that fail.
+
+        Args:
+            dataset: Iterable of raw examples.
+            dataset_name: Name used to pick the converter in _process_example.
+
+        Returns:
+            The converted examples, in input order. Failures are printed and
+            counted but never raised.
+        """
+        print(f"\nProcessing {dataset_name} dataset...")
+        converted = []
+        failures = 0
+        for position, raw_example in enumerate(dataset):
+            try:
+                record = self._process_example(dataset_name, raw_example)
+            except Exception as exc:
+                print(f"Error processing example {position} in {dataset_name}: {str(exc)}")
+                failures += 1
+            else:
+                converted.append(record)
+        print(f"Processed {len(converted)} examples from {dataset_name}")
+        print(f"Encountered {failures} errors during processing")
+        return converted
+    def _process_example(self, dataset_name: str, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Route one example to the converter registered for ``dataset_name``.
+
+        Raises:
+            ValueError: If ``dataset_name`` is not one of the known datasets.
+        """
+        routes = (
+            ("CodeSearchNet", self._process_code_search_net),
+            ("HumanEval", self._process_human_eval),
+            ("MBPP", self._process_mbpp),
+            ("Spider", self._process_spider),
+            ("DeepFix", self._process_deep_fix),
+            ("CodeXGLUE", self._process_codexglue),
+        )
+        for known_name, converter in routes:
+            if dataset_name == known_name:
+                return converter(example)
+        raise ValueError(f"Unknown dataset: {dataset_name}")
+    def _process_code_search_net(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Build the CodeSearchNet record: stripped text fields plus a code analysis."""
+        source = example["code"]
+        record = {"code": source.strip()}
+        record["docstring"] = example["docstring"].strip()
+        record["language"] = example["language"]
+        record["function_name"] = example["function_name"]
+        # The analysis is run on the text as given, not on the stripped copy.
+        record["code_analysis"] = self._analyze_code(source)
+        return record
+    def _process_human_eval(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Build the HumanEval record; the canonical solution is stored as ``solution``."""
+        record = {"task_id": example["task_id"]}
+        record["prompt"] = example["prompt"].strip()
+        reference = example["canonical_solution"]
+        record["solution"] = reference.strip()
+        record["test"] = example["test"].strip()
+        record["entry_point"] = example["entry_point"]
+        # Analysed as given (unstripped), so an indented body reports a parse error.
+        record["code_analysis"] = self._analyze_code(reference)
+        return record
+    def _process_mbpp(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Build the MBPP record: ``text`` becomes ``problem`` and ``code`` becomes ``solution``."""
+        record = {"task_id": example["task_id"]}
+        record["problem"] = example["text"].strip()
+        solution_source = example["code"]
+        record["solution"] = solution_source.strip()
+        # Test lists are passed through untouched.
+        record["test_list"] = example["test_list"]
+        record["challenge_test_list"] = example["challenge_test_list"]
+        record["code_analysis"] = self._analyze_code(solution_source)
+        return record
+    def _process_spider(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Build the Spider record: stripped query/question/sql plus an analysis entry.
+
+        The ``sql`` text goes through _analyze_code, which parses with Python's
+        ``ast``; text that is not valid Python yields the ``{"error": ...}`` form.
+        """
+        record = {"query": example["query"].strip()}
+        record["question"] = example["question"].strip()
+        record["db_id"] = example["db_id"]
+        statement = example["sql"]
+        record["sql"] = statement.strip()
+        record["code_analysis"] = self._analyze_code(statement)
+        return record
+    def _process_deep_fix(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Build the DeepFix record; only the fixed code is analysed."""
+        record = {"original_code": example["code"].strip()}
+        repaired = example["fixed_code"]
+        record["fixed_code"] = repaired.strip()
+        record["error_type"] = example["error_type"]
+        record["code_analysis"] = self._analyze_code(repaired)
+        return record
+    def _process_codexglue(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Build the CodeXGLUE record: stripped code/docstring, task, language, analysis."""
+        source = example["code"]
+        record = {"code": source.strip()}
+        record["docstring"] = example["docstring"].strip()
+        record["task"] = example["task"]
+        record["language"] = example["language"]
+        record["code_analysis"] = self._analyze_code(source)
+        return record
+    def _analyze_code(self, code: str) -> Dict[str, Any]:
+        """Parse ``code`` as Python and report function/class counts and complexity.
+
+        Returns:
+            ``{"num_functions", "num_classes", "complexity"}`` on success, or
+            ``{"error": message}`` when parsing or analysis raises.
+        """
+        try:
+            tree = ast.parse(code)
+            function_total = sum(1 for node in ast.walk(tree) if isinstance(node, ast.FunctionDef))
+            class_total = sum(1 for node in ast.walk(tree) if isinstance(node, ast.ClassDef))
+            summary = {
+                "num_functions": function_total,
+                "num_classes": class_total,
+                "complexity": self._calculate_complexity(tree),
+            }
+        except Exception as exc:
+            return {"error": str(exc)}
+        return summary
+    def _calculate_complexity(self, tree: ast.AST) -> int:
+        """Return 1 plus the number of if/for/while/try/except-handler nodes in ``tree``."""
+        branch_types = (ast.If, ast.For, ast.While, ast.Try, ast.ExceptHandler)
+        branch_total = sum(1 for node in ast.walk(tree) if isinstance(node, branch_types))
+        # The leading 1 stands for the straight-line path through the program.
+        return 1 + branch_total
+    def save_to_jsonl(self, data: List[Dict[str, Any]], filename: str):
+        """Write ``data`` as JSON Lines to ``filename`` inside the output directory.
+
+        Returns:
+            The path of the written file.
+        """
+        destination = self.output_dir / filename
+        with jsonlines.open(destination, mode='w') as sink:
+            sink.write_all(data)
+        return destination
+    def print_sample(self, data: List[Dict[str, Any]], count: int = 3):
+        """Pretty-print the first ``count`` records of ``data`` as indented JSON."""
+        print("\nSample data:")
+        preview = data[:count]
+        for number, record in enumerate(preview, start=1):
+            print(f"\nSample {number}:")
+            print(json.dumps(record, indent=2))
+    def print_memory_usage(self):
+        """Print the resident set size of the current process in MB."""
+        rss_bytes = psutil.Process().memory_info().rss
+        print(f"Current memory usage: {rss_bytes / 1024 / 1024:.2f} MB")
+def main():
+    """Load, convert, preview and save every dataset registered on the preparer.
+
+    A failure while handling one dataset is printed and the loop continues
+    with the next one. Memory usage is printed once at the end.
+    """
+    preparer = CodeDataPreparer()
+    for name, spec in preparer.datasets.items():
+        try:
+            print(f"\nLoading {name} dataset...")
+            raw_dataset = load_dataset(spec["source"], split=spec["split"])
+            print(f"Loaded {len(raw_dataset)} samples from {name}")
+            records = preparer.process_dataset(raw_dataset, name)
+            print(f"Processed {len(records)} samples")
+            preparer.print_sample(records)
+            # One output file per dataset, named after the lower-cased dataset name.
+            target_name = f"{name.lower()}_processed.jsonl"
+            saved_to = preparer.save_to_jsonl(records, target_name)
+            print(f"\nSaved {name} data to: {saved_to}")
+        except Exception as exc:
+            print(f"Error processing {name} dataset: {str(exc)}")
+            print("Continuing with next dataset...")
+    preparer.print_memory_usage()
+if __name__ == "__main__":
+    main()

coding_expert/expert.py ADDED Viewed

	@@ -0,0 +1,21 @@

+from .config import config
+# The classes defined in these modules are CodeDataProcessor and CodeValidator;
+# the previously imported names (DataProcessor, Validation) do not exist there.
+from .utils.data_processor import CodeDataProcessor
+from .tasks.validation import CodeValidator
+class CodingExpert:
+    """Facade bundling the configuration module, the data processor and the validator."""
+    def __init__(self):
+        """Create the processor and validator with their default directories.
+
+        Both constructors create their working directory on instantiation.
+        """
+        self.config = config
+        self.data_processor = CodeDataProcessor()
+        self.validator = CodeValidator()
+    def process_data(self, input_data):
+        """Analyse a code snippet with CodeDataProcessor.process_code and return its result dict."""
+        return self.data_processor.process_code(input_data)
+    def validate(self, code):
+        """Validate a code snippet with CodeValidator.validate_code and return its result dict."""
+        return self.validator.validate_code(code)
+    def get_config(self):
+        """Return the configuration module held by this expert."""
+        return self.config

coding_expert/tasks/validation.py ADDED Viewed

	@@ -0,0 +1,245 @@

+"""
+Validation module for the Coding Expert model
+"""
+import os
+import json
+from pathlib import Path
+import hashlib
+import datetime
+from typing import Dict, Any, List, Optional
+import subprocess
+import ast
+import sys
+import psutil
+class CodeValidator:
+    def __init__(self, checkpoint_dir: str = "checkpoints"):
+        """Create the checkpoint and validation directories and empty metric lists.
+
+        Args:
+            checkpoint_dir: Root directory for checkpoints; reports and
+                checkpoints are written to its ``validation`` subdirectory.
+                Missing parent directories are created.
+        """
+        self.checkpoint_dir = Path(checkpoint_dir)
+        self.validation_dir = self.checkpoint_dir / "validation"
+        # parents=True creates checkpoint_dir (and any missing ancestors) as well.
+        self.validation_dir.mkdir(parents=True, exist_ok=True)
+        # Initialize validation metrics
+        self.metrics = {
+            "code_quality": [],
+            "performance": [],
+            "memory_usage": [],
+            "error_count": []
+        }
+    def validate_code(self, code: str, language: str = "python") -> Dict[str, Any]:
+        """Parse ``code`` and combine metrics, static analysis and issue checks.
+
+        The code is always parsed with Python's ``ast``; ``language`` is only
+        forwarded to the static-analysis step.
+
+        Returns:
+            On success a dict with ``is_valid`` (True when no issue was found),
+            ``metrics``, ``static_analysis``, ``issues`` and ``validation_score``.
+            If any step raises, ``{"is_valid": False, "error", "validation_score": 0.0}``.
+        """
+        try:
+            tree = ast.parse(code)
+            metrics = self._calculate_code_metrics(tree)
+            static_analysis = self._run_static_analysis(code, language)
+            issues = self._check_common_issues(tree)
+            report = {
+                "is_valid": len(issues) == 0,
+                "metrics": metrics,
+                "static_analysis": static_analysis,
+                "issues": issues,
+            }
+            report["validation_score"] = self._calculate_validation_score(metrics, issues)
+        except Exception as exc:
+            return {
+                "is_valid": False,
+                "error": str(exc),
+                "validation_score": 0.0
+            }
+        return report
+    def _calculate_code_metrics(self, tree: ast.AST) -> Dict[str, Any]:
+        """Count structural elements of a parsed module in a single tree walk.
+
+        Args:
+            tree: AST of the code under validation.
+
+        Returns:
+            Dict with ``complexity``, ``num_functions`` (``def`` nodes),
+            ``num_classes``, ``num_imports`` (both ``import x`` and
+            ``from x import y`` statements) and ``num_statements``.
+        """
+        function_total = class_total = import_total = statement_total = 0
+        for node in ast.walk(tree):
+            if not isinstance(node, ast.stmt):
+                continue
+            statement_total += 1
+            if isinstance(node, ast.FunctionDef):
+                function_total += 1
+            elif isinstance(node, ast.ClassDef):
+                class_total += 1
+            elif isinstance(node, (ast.Import, ast.ImportFrom)):
+                # "from x import y" is an import statement too; it used to be missed.
+                import_total += 1
+        return {
+            "complexity": self._calculate_complexity(tree),
+            "num_functions": function_total,
+            "num_classes": class_total,
+            "num_imports": import_total,
+            "num_statements": statement_total
+        }
+    def _calculate_complexity(self, tree: ast.AST) -> int:
+        """Return 1 plus the count of if/for/while/try/except-handler nodes."""
+        counted_types = (ast.If, ast.For, ast.While, ast.Try, ast.ExceptHandler)
+        branching_nodes = [node for node in ast.walk(tree) if isinstance(node, counted_types)]
+        # Base value 1 stands for the main program path.
+        return len(branching_nodes) + 1
+    def _run_static_analysis(self, code: str, language: str) -> Dict[str, Any]:
+        """Run pylint on ``code`` and extract its overall rating.
+
+        Args:
+            code: Source text, sent to pylint on stdin.
+            language: Only ``"python"`` is analysed.
+
+        Returns:
+            ``{}`` for non-Python input. For Python,
+            ``{"pylint_score", "issues"}`` where ``issues`` counts occurrences
+            of the word "error" in pylint's output, or
+            ``{"pylint_score": 0.0, "error"}`` when pylint is missing, times
+            out (5 s) or prints no rating line.
+        """
+        if language != "python":
+            return {}
+        try:
+            # --from-stdin makes pylint lint the text on stdin; the file name is
+            # only a label used in its messages.
+            process = subprocess.run(
+                ["pylint", "--from-stdin", "snippet.py"],
+                input=code,
+                capture_output=True,
+                text=True,
+                timeout=5
+            )
+            # The rating line reads "Your code has been rated at 8.50/10 ...":
+            # keep only the part before "/", which float() can parse.
+            rating = process.stdout.split("Your code has been rated at")[1].split()[0]
+            score = float(rating.split("/")[0])
+            return {
+                "pylint_score": score,
+                "issues": process.stdout.count("error")
+            }
+        except Exception as e:
+            return {
+                "pylint_score": 0.0,
+                "error": str(e)
+            }
+    def _check_common_issues(self, tree: ast.AST) -> List[str]:
+        """List issue messages for ``global`` statements, long functions and large ifs.
+
+        A function is reported when its body holds more than 50 statements, an
+        ``if`` when its body holds more than 20. Messages are grouped by kind
+        in that order: globals, functions, ifs.
+        """
+        nodes = list(ast.walk(tree))
+        global_messages = [
+            "Global variables detected"
+            for node in nodes
+            if isinstance(node, ast.Global)
+        ]
+        function_messages = [
+            f"Function {node.name} is too long"
+            for node in nodes
+            if isinstance(node, ast.FunctionDef) and len(node.body) > 50
+        ]
+        if_messages = [
+            "Complex if statement detected"
+            for node in nodes
+            if isinstance(node, ast.If) and len(node.body) > 20
+        ]
+        return global_messages + function_messages + if_messages
+    def _calculate_validation_score(self, metrics: Dict[str, Any], issues: List[str]) -> float:
+        """Score in [0, 1]: 10% off for complexity above 10, and 10% off per issue."""
+        if metrics["complexity"] > 10:
+            complexity_factor = 0.9
+        else:
+            complexity_factor = 1.0
+        issue_factor = 0.9 ** len(issues)
+        score = 1.0 * complexity_factor * issue_factor
+        # Clamp to the unit interval.
+        return max(0.0, min(1.0, score))
+    def create_checkpoint(self, data: Dict[str, Any], name: Optional[str] = None) -> str:
+        """Write ``data`` plus a timestamp and a content hash to a checkpoint file.
+
+        Args:
+            data: JSON-serialisable payload. The caller's dict is left untouched;
+                the extra keys are added to a copy.
+            name: Checkpoint name; defaults to the current time as
+                ``YYYYmmdd_HHMMSS``.
+
+        Returns:
+            Path of the written ``checkpoint_<name>.json`` file, as a string.
+        """
+        if name is None:
+            name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        checkpoint_path = self.validation_dir / f"checkpoint_{name}.json"
+        # Work on a shallow copy so the caller's dict does not gain extra keys.
+        payload = dict(data)
+        payload["timestamp"] = str(datetime.datetime.now())
+        # The hash covers the payload including the timestamp just added.
+        payload["hash"] = hashlib.sha256(str(payload).encode()).hexdigest()
+        with open(checkpoint_path, 'w') as f:
+            json.dump(payload, f, indent=2)
+        return str(checkpoint_path)
+    def load_checkpoint(self, name: str) -> Optional[Dict[str, Any]]:
+        """Return the parsed ``checkpoint_<name>.json`` file, or None if it is absent."""
+        source = self.validation_dir / f"checkpoint_{name}.json"
+        if source.exists():
+            with open(source, 'r') as handle:
+                return json.load(handle)
+        return None
+    def validate_dataset(self, dataset: List[Dict[str, Any]]) -> Dict[str, Any]:
+        """Validate every example of a dataset and aggregate the scores.
+
+        An example yields one result for its ``"code"`` entry and another for
+        its ``"review"`` entry, so ``processed_examples`` counts results rather
+        than examples. An exception raised while handling an example (for
+        instance a ``"review"`` without ``"code"``) is stored as a zero-score
+        result and counted in ``error_count``.
+
+        Args:
+            dataset: Examples, each optionally holding ``"code"``,
+                ``"language"`` (default ``"python"``) and ``"review"``.
+
+        Returns:
+            Dict with ``total_examples``, ``processed_examples``,
+            ``error_count``, ``average_score`` (0.0 when nothing was scored)
+            and ``detailed_results``.
+        """
+        results = []
+        error_count = 0
+        for example in dataset:
+            try:
+                # Validate code
+                if "code" in example:
+                    code_result = self.validate_code(
+                        example["code"],
+                        example.get("language", "python")
+                    )
+                    results.append(code_result)
+                # Validate code review
+                if "review" in example:
+                    review_result = self._validate_code_review(
+                        example["code"],
+                        example["review"]
+                    )
+                    results.append(review_result)
+            except Exception as e:
+                error_count += 1
+                results.append({
+                    "error": str(e),
+                    "validation_score": 0.0
+                })
+        scores = [r["validation_score"] for r in results if "validation_score" in r]
+        # Plain arithmetic mean: numpy is not imported in this module, so the
+        # former np.mean call raised NameError as soon as a score existed.
+        avg_score = sum(scores) / len(scores) if scores else 0.0
+        return {
+            "total_examples": len(dataset),
+            "processed_examples": len(results),
+            "error_count": error_count,
+            "average_score": float(avg_score),
+            "detailed_results": results
+        }
+    def save_validation_report(self, report: Dict[str, Any], name: Optional[str] = None) -> str:
+        """Write a validation report with a timestamp and a summary section.
+
+        Args:
+            report: Report dict, typically the result of validate_dataset. The
+                caller's dict is left untouched; extra keys go into a copy.
+            name: Report name; defaults to the current time as
+                ``YYYYmmdd_HHMMSS``.
+
+        Returns:
+            Path of the written ``report_<name>.json`` file, as a string.
+        """
+        if name is None:
+            name = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
+        report_path = self.validation_dir / f"report_{name}.json"
+        total_examples = report.get("total_examples", 1)
+        # An empty dataset reports total_examples == 0; avoid dividing by zero.
+        error_rate = report.get("error_count", 0) / total_examples if total_examples else 0.0
+        payload = dict(report)
+        payload["timestamp"] = str(datetime.datetime.now())
+        payload["summary"] = {
+            "accuracy": report.get("average_score", 0.0),
+            "error_rate": error_rate
+        }
+        with open(report_path, 'w') as f:
+            json.dump(payload, f, indent=2)
+        return str(report_path)
+    def _validate_code_review(self, code: str, review: str) -> Dict[str, Any]:
+        """Score a review by how many detected issue messages it quotes.
+
+        An issue counts as covered when its message appears in ``review``
+        (case-insensitive). The code's validation score is multiplied by the
+        covered fraction, or by 1.0 when the code has no detected issue.
+
+        Returns:
+            ``{"is_valid", "review_issues_covered", "total_issues",
+            "validation_score"}``, or the error form with score 0.0 when any
+            step raises.
+        """
+        try:
+            code_result = self.validate_code(code)
+            detected = self._check_common_issues(ast.parse(code))
+            covered = [message for message in detected if message.lower() in review.lower()]
+            if detected:
+                coverage = len(covered) / len(detected)
+            else:
+                coverage = 1.0
+            outcome = {
+                "is_valid": len(covered) > 0,
+                "review_issues_covered": len(covered),
+                "total_issues": len(detected),
+                "validation_score": code_result["validation_score"] * coverage
+            }
+        except Exception as exc:
+            return {
+                "is_valid": False,
+                "error": str(exc),
+                "validation_score": 0.0
+            }
+        return outcome

coding_expert/utils/data_processor.py ADDED Viewed

	@@ -0,0 +1,253 @@

+"""
+Data processing utilities for the Coding Expert model
+"""
+import json
+import os
+from pathlib import Path
+import jsonlines
+from typing import Dict, List, Any, Optional, Tuple
+import hashlib
+import datetime
+import logging
+import numpy as np
+import pandas as pd
+from datasets import Dataset
+from tqdm import tqdm
+import ast
+import re
+from collections import Counter
+class CodeDataProcessor:
+    def __init__(self, output_dir: str = "processed_data"):
+        """Create the output directory and the processor's logger.
+
+        Args:
+            output_dir: Directory that receives saved JSONL files. It is created
+                together with any missing parent directories.
+        """
+        self.output_dir = Path(output_dir)
+        # parents=True so a nested output path does not raise FileNotFoundError.
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+        self.logger = self._setup_logger()
+    def _setup_logger(self) -> logging.Logger:
+        """Return this module's logger, configured for INFO output on a stream handler.
+
+        ``logging.getLogger`` returns the same logger object for every
+        instance, so the handler is attached only when the logger has none yet;
+        otherwise each new processor would add another handler and every
+        message would be emitted once per instance.
+        """
+        logger = logging.getLogger(__name__)
+        logger.setLevel(logging.INFO)
+        if not logger.handlers:
+            handler = logging.StreamHandler()
+            formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
+            handler.setFormatter(formatter)
+            logger.addHandler(handler)
+        return logger
+    def process_code(self, code: str, language: str = "python") -> Dict[str, Any]:
+        """Clean a snippet and collect its AST summary, metrics and pattern flags.
+
+        Returns:
+            ``{"code", "language", "ast_info", "metrics", "patterns"}`` where
+            ``code`` is the cleaned text, or ``{"error": message}`` when any
+            step raises (a warning is logged in that case).
+        """
+        try:
+            cleaned = self._clean_code(code)
+            structure = self._parse_ast(cleaned, language)
+            measurements = self._extract_code_metrics(cleaned, structure)
+            findings = self._identify_patterns(cleaned)
+        except Exception as exc:
+            self.logger.warning(f"Error processing code: {str(exc)}")
+            return {"error": str(exc)}
+        return {
+            "code": cleaned,
+            "language": language,
+            "ast_info": structure,
+            "metrics": measurements,
+            "patterns": findings
+        }
+    def _clean_code(self, code: str) -> str:
+        """Drop blank lines and trailing whitespace while keeping block structure.
+
+        Leading whitespace carries meaning in Python, so lines are only
+        right-stripped; indentation shared by every line is removed so that a
+        uniformly indented snippet still parses. Comments are kept.
+
+        Args:
+            code: Raw source text.
+
+        Returns:
+            The cleaned text, with lines joined by ``"\\n"`` and no trailing newline.
+        """
+        import textwrap
+        kept_lines = [line.rstrip() for line in code.split('\n') if line.strip()]
+        return textwrap.dedent('\n'.join(kept_lines))
+    def _parse_ast(self, code: str, language: str) -> Dict[str, Any]:
+        """Summarise the structure of Python code; other languages give ``{}``.
+
+        Returns:
+            ``{"num_functions", "num_classes", "complexity"}`` for Python that
+            parses, ``{"error": message}`` when parsing raises, and an empty
+            dict for any other language.
+        """
+        try:
+            if language != "python":
+                return {}
+            tree = ast.parse(code)
+            function_nodes = [node for node in ast.walk(tree) if isinstance(node, ast.FunctionDef)]
+            class_nodes = [node for node in ast.walk(tree) if isinstance(node, ast.ClassDef)]
+            return {
+                "num_functions": len(function_nodes),
+                "num_classes": len(class_nodes),
+                "complexity": self._calculate_complexity(tree)
+            }
+        except Exception as exc:
+            return {"error": str(exc)}
+    def _calculate_complexity(self, tree: ast.AST) -> int:
+        """Return 1 plus one point per if/for/while/try/except-handler node."""
+        decision_types = (ast.If, ast.For, ast.While, ast.Try, ast.ExceptHandler)
+        total = 1  # the main program path
+        for node in ast.walk(tree):
+            if not isinstance(node, decision_types):
+                continue
+            total += 1
+        return total
+    def _extract_code_metrics(self, code: str, ast_info: Dict[str, Any]) -> Dict[str, Any]:
+        """Compute size and token statistics and copy the AST counts.
+
+        Tokens are whitespace-separated chunks of ``code``. AST-derived values
+        default to 0 when ``ast_info`` lacks them. ``token_distribution`` holds
+        the five most common tokens as ``(token, count)`` pairs.
+        """
+        words = code.split()
+        frequency = Counter(words)
+        return {
+            "length": len(code),
+            "lines": len(code.split('\n')),
+            "tokens": len(words),
+            "unique_tokens": len(set(words)),
+            "ast_complexity": ast_info.get("complexity", 0),
+            "function_count": ast_info.get("num_functions", 0),
+            "class_count": ast_info.get("num_classes", 0),
+            "token_distribution": frequency.most_common(5),
+        }
+    def _identify_patterns(self, code: str) -> Dict[str, List[str]]:
+        """Flag patterns by plain substring search over ``code``.
+
+        The checks are textual, not syntactic: any occurrence of the substrings
+        "class" and "def", "global" or "eval(" triggers the matching flag.
+        """
+        design_patterns = []
+        anti_patterns = []
+        security_issues = []
+        if "class" in code and "def" in code:
+            design_patterns.append("Class-based design")
+        if "global" in code:
+            anti_patterns.append("Global variables")
+        if "eval(" in code:
+            security_issues.append("Eval usage")
+        return {
+            "design_patterns": design_patterns,
+            "anti_patterns": anti_patterns,
+            "security_issues": security_issues
+        }
+    def process_dataset(self, dataset: Dataset, dataset_name: str) -> List[Dict[str, Any]]:
+        """Convert every example of ``dataset`` under a progress bar.
+
+        Examples whose conversion raises are logged as errors and skipped.
+
+        Returns:
+            The converted examples, in input order.
+        """
+        self.logger.info(f"Processing {dataset_name} dataset with {len(dataset)} samples")
+        converted = []
+        failures = 0
+        progress = tqdm(dataset, desc=f"Processing {dataset_name}")
+        for position, raw_example in enumerate(progress):
+            try:
+                record = self._process_example(raw_example, dataset_name)
+            except Exception as exc:
+                failures += 1
+                self.logger.error(f"Error processing example {position} in {dataset_name}: {str(exc)}")
+            else:
+                converted.append(record)
+        self.logger.info(f"Processed {len(converted)} examples")
+        self.logger.info(f"Encountered {failures} errors")
+        return converted
+    def _process_example(self, example: Dict[str, Any], dataset_name: str) -> Dict[str, Any]:
+        """Hand one example to the converter that matches ``dataset_name``.
+
+        Raises:
+            ValueError: If ``dataset_name`` is not a known dataset.
+        """
+        converters = (
+            ("CodeSearchNet", self._process_code_search_net),
+            ("HumanEval", self._process_human_eval),
+            ("MBPP", self._process_mbpp),
+            ("Spider", self._process_spider),
+            ("DeepFix", self._process_deep_fix),
+            ("CodeXGLUE", self._process_codexglue),
+        )
+        for known_name, convert in converters:
+            if dataset_name == known_name:
+                return convert(example)
+        raise ValueError(f"Unknown dataset: {dataset_name}")
+    def _process_code_search_net(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert a CodeSearchNet example and attach the process_code analysis of its code."""
+        code_text = example["code"].strip()
+        doc_text = example["docstring"].strip()
+        language = example["language"]
+        function_name = example["function_name"]
+        # process_code receives the original text and does its own cleaning.
+        analysis = self.process_code(example["code"])
+        return {
+            "code": code_text,
+            "docstring": doc_text,
+            "language": language,
+            "function_name": function_name,
+            "code_analysis": analysis
+        }
+    def _process_human_eval(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert a HumanEval example; the canonical solution is stored as ``solution``."""
+        task_id = example["task_id"]
+        prompt_text = example["prompt"].strip()
+        solution_text = example["canonical_solution"].strip()
+        test_text = example["test"].strip()
+        entry_point = example["entry_point"]
+        # The analysis covers the canonical solution only, not the prompt.
+        analysis = self.process_code(example["canonical_solution"])
+        return {
+            "task_id": task_id,
+            "prompt": prompt_text,
+            "solution": solution_text,
+            "test": test_text,
+            "entry_point": entry_point,
+            "code_analysis": analysis
+        }
+    def _process_mbpp(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert an MBPP example: ``text`` becomes ``problem``, ``code`` becomes ``solution``."""
+        task_id = example["task_id"]
+        problem_text = example["text"].strip()
+        solution_text = example["code"].strip()
+        tests = example["test_list"]
+        challenge_tests = example["challenge_test_list"]
+        analysis = self.process_code(example["code"])
+        return {
+            "task_id": task_id,
+            "problem": problem_text,
+            "solution": solution_text,
+            "test_list": tests,
+            "challenge_test_list": challenge_tests,
+            "code_analysis": analysis
+        }
+    def _process_spider(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert a Spider example and attach the process_code analysis of its SQL.
+
+        process_code is called with its default language ("python"), so for
+        text that is not valid Python the ``ast_info`` part holds a parse error.
+        """
+        query_text = example["query"].strip()
+        question_text = example["question"].strip()
+        database_id = example["db_id"]
+        sql_text = example["sql"].strip()
+        analysis = self.process_code(example["sql"])
+        return {
+            "query": query_text,
+            "question": question_text,
+            "db_id": database_id,
+            "sql": sql_text,
+            "code_analysis": analysis
+        }
+    def _process_deep_fix(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert a DeepFix example; only the fixed code is analysed."""
+        broken_text = example["code"].strip()
+        repaired_text = example["fixed_code"].strip()
+        error_type = example["error_type"]
+        analysis = self.process_code(example["fixed_code"])
+        return {
+            "original_code": broken_text,
+            "fixed_code": repaired_text,
+            "error_type": error_type,
+            "code_analysis": analysis
+        }
+    def _process_codexglue(self, example: Dict[str, Any]) -> Dict[str, Any]:
+        """Convert a CodeXGLUE example: stripped code/docstring, task, language, analysis."""
+        code_text = example["code"].strip()
+        doc_text = example["docstring"].strip()
+        task = example["task"]
+        language = example["language"]
+        # The example's own language is stored but not forwarded to process_code.
+        analysis = self.process_code(example["code"])
+        return {
+            "code": code_text,
+            "docstring": doc_text,
+            "task": task,
+            "language": language,
+            "code_analysis": analysis
+        }
+    def save_to_jsonl(self, data: List[Dict[str, Any]], filename: str) -> Path:
+        """Write ``data`` as JSON Lines into the output directory and log the path.
+
+        Returns:
+            The path of the written file.
+        """
+        destination = self.output_dir / filename
+        with jsonlines.open(destination, mode='w') as sink:
+            sink.write_all(data)
+        self.logger.info(f"Saved data to {destination}")
+        return destination
+    def print_sample(self, data: List[Dict[str, Any]], count: int = 3):
+        """Log the first ``count`` records of ``data`` as indented JSON at INFO level."""
+        self.logger.info("\nSample data:")
+        preview = data[:count]
+        for number, record in enumerate(preview, start=1):
+            self.logger.info(f"\nSample {number}:")
+            self.logger.info(json.dumps(record, indent=2))
+    def print_memory_usage(self):
+        """Log the resident set size of the current process in MB."""
+        # Imported here because this module has no top-level psutil import; the
+        # bare psutil reference used to raise NameError.
+        import psutil
+        process = psutil.Process()
+        memory_info = process.memory_info()
+        self.logger.info(f"Current memory usage: {memory_info.rss / 1024 / 1024:.2f} MB")