| | """Main entry point for the distiller package.""" |
| |
|
| | from typing import Annotated |
| |
|
| | import typer |
| |
|
# Root CLI application. Every @app.command() below registers a subcommand on it.
# With no arguments the help screen is shown, and -h is accepted alongside --help.
app = typer.Typer(
    no_args_is_help=True,
    context_settings=dict(help_option_names=["-h", "--help"]),
    help="Model2Vec Code-Specialized Distillation Pipeline",
)
| |
|
| |
|
@app.command()
def distill(
    use_beam: Annotated[
        bool,
        typer.Option(help="Use Beam for distillation"),
    ] = False,
    train: Annotated[
        bool,
        typer.Option(help="Enable advanced training (CodeSearchNet fine-tuning)"),
    ] = False,
    teacher_models: Annotated[
        list[str] | None,
        typer.Option(help="Specific teacher models to distill"),
    ] = None,
    pca_dims: Annotated[
        int | None,
        typer.Option(help="PCA dimensions (uses config default if not specified)"),
    ] = None,
    clear_cache: Annotated[
        bool,
        typer.Option(help="Clear HuggingFace cache for problematic models before distillation"),
    ] = False,
    clear_checkpoints: Annotated[
        bool,
        typer.Option(help="Clear tokenlearn checkpoints to force fresh featurization and training"),
    ] = False,
    use_optimized_dataset: Annotated[
        bool,
        typer.Option(
            "--use-optimized-dataset",
            help="Use the pre-created optimized dataset from code_model2vec/dataset",
        ),
    ] = False,
    dataset_path: Annotated[
        str | None,
        typer.Option(
            "--dataset-path",
            help="Path to custom dataset directory (defaults to code_model2vec/dataset)",
        ),
    ] = None,
) -> None:
    """Run unified Model2Vec distillation with optional training."""
    # Imported at call time rather than at module import, so the distillation
    # module is only pulled in when this subcommand is actually invoked.
    from .distill import main as run_distillation

    # The CLI options are handed over positionally, in declaration order;
    # the callee's parameter order must therefore match this tuple.
    forwarded = (
        use_beam,
        train,
        teacher_models,
        pca_dims,
        clear_cache,
        clear_checkpoints,
        use_optimized_dataset,
        dataset_path,
    )
    run_distillation(*forwarded)
| |
|
| |
|
@app.command()
def evaluate(
    use_beam: Annotated[
        bool,
        typer.Option(help="Use Beam for evaluation"),
    ] = False,
    skip_third_party: Annotated[
        bool,
        typer.Option(help="Skip third-party models"),
    ] = False,
    skip_benchmark: Annotated[
        bool,
        typer.Option(help="Skip performance benchmarking"),
    ] = False,
    max_queries: Annotated[
        int,
        typer.Option(help="Maximum queries per language"),
    ] = 100,
) -> None:
    """Run CodeSearchNet evaluation on models."""
    # Imported at call time so the evaluation module is only loaded when
    # this subcommand is invoked.
    from .evaluate import main as run_evaluation

    # Options are forwarded positionally in the order they are declared above.
    forwarded = (use_beam, skip_third_party, skip_benchmark, max_queries)
    run_evaluation(*forwarded)
| |
|
| |
|
@app.command()
def analyze(
    results_dir: Annotated[
        str | None,
        typer.Option(help="Results directory"),
    ] = None,
    model_name: Annotated[
        str,
        typer.Option(help="Model name for analysis"),
    ] = "gte_qwen2_m2v_code (Ours)",
    output: Annotated[
        str,
        typer.Option(help="Output report file"),
    ] = "REPORT.md",
    export_csv: Annotated[
        str | None,
        typer.Option(help="Export results to CSV"),
    ] = None,
) -> None:
    """Generate comprehensive analysis reports."""
    # Imported at call time so the analysis module is only loaded when
    # this subcommand is invoked.
    from .analyze import main as run_analysis

    # A missing or empty results directory falls back to the default location.
    effective_results_dir = results_dir if results_dir else "code_model2vec/evaluation_results"

    # Options are forwarded positionally: directory, model, report file, CSV path.
    run_analysis(effective_results_dir, model_name, output, export_csv)
| |
|
| |
|
@app.command()
def dataset(
    max_samples_per_lang: Annotated[
        int,
        typer.Option(help="Maximum samples per language"),
    ] = 50000,
    min_doc_words: Annotated[
        int,
        typer.Option(help="Minimum words in documentation"),
    ] = 3,
    max_doc_words: Annotated[
        int,
        typer.Option(help="Maximum words in documentation"),
    ] = 100,
    min_code_chars: Annotated[
        int,
        typer.Option(help="Minimum characters in code"),
    ] = 50,
    max_code_chars: Annotated[
        int,
        typer.Option(help="Maximum characters in code"),
    ] = 2000,
    output_dir: Annotated[
        str | None,
        typer.Option(help="Output directory for dataset"),
    ] = None,
    simple_format: Annotated[
        bool,
        typer.Option(help="Create only simple format (not multiple training formats)"),
    ] = False,
) -> None:
    """Create optimized training dataset from CodeSearchNet for code search tasks."""
    # Imported at call time so the dataset module is only loaded when
    # this subcommand is invoked.
    from .dataset import main as build_dataset

    # Options are forwarded positionally in the order they are declared above;
    # output_dir is passed through unchanged, including None.
    forwarded = (
        max_samples_per_lang,
        min_doc_words,
        max_doc_words,
        min_code_chars,
        max_code_chars,
        output_dir,
        simple_format,
    )
    build_dataset(*forwarded)
| |
|
| |
|
# Script entry point: run the Typer application only when this module is
# executed directly, not when it is imported.
if __name__ == "__main__":
    app()
| |
|