File size: 14,396 Bytes
454e47c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef6935e
 
 
 
 
 
 
d820ac9
ef6935e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d820ac9
 
 
 
 
 
 
 
 
ef6935e
 
 
 
 
 
 
d820ac9
ef6935e
 
 
 
 
 
 
 
 
 
454e47c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef6935e
 
454e47c
 
 
 
 
 
 
 
ef6935e
454e47c
 
 
 
 
 
 
ef6935e
 
 
454e47c
 
ef6935e
454e47c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef6935e
454e47c
 
 
 
 
 
 
 
 
 
 
 
 
 
ef6935e
454e47c
 
 
 
 
 
 
ef6935e
454e47c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef6935e
0dbb356
 
 
ef6935e
 
454e47c
0dbb356
 
72121b3
7837959
 
 
 
454e47c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef6935e
 
454e47c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ef6935e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d820ac9
 
ef6935e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d820ac9
 
 
 
 
ef6935e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
"""
Shared configuration for the distiller package.

This module centralizes all configuration constants, default values, and common
settings used across distillation, evaluation, and benchmarking modules.
"""

import logging
from pathlib import Path
from typing import Any

from beam import GpuType, Image
from pydantic import BaseModel

# =============================================================================
# LOGGING CONFIGURATION
# =============================================================================


def setup_logging(level: int = logging.INFO) -> None:
	"""Configure package-wide logging to the console and to logs/distiller.log.

	Args:
		level: Root logger level (defaults to logging.INFO).
	"""
	log_directory = Path("logs")
	log_directory.mkdir(parents=True, exist_ok=True)
	log_file = log_directory / "distiller.log"
	stream_handler = logging.StreamHandler()
	file_handler = logging.FileHandler(log_file, mode="a")
	logging.basicConfig(
		level=level,
		format="%(asctime)s - %(levelname)s - %(message)s",
		handlers=[stream_handler, file_handler],
	)


# =============================================================================
# BEAM CLOUD CONFIGURATION
# =============================================================================


# Comprehensive Beam function configuration
class BeamFunctionConfig(BaseModel):
	"""Complete configuration for Beam @function decorator parameters.

	Instances are turned into decorator kwargs by create_beam_function_kwargs,
	so field names mirror the decorator's keyword arguments.
	"""

	# Resource allocation
	cpu: float = 2.0  # Number of CPU cores
	memory: int = 8192  # Memory in MiB (8 GiB)
	gpu: GpuType | list[GpuType] = GpuType.A100_40  # One GPU type, or a list of acceptable types

	# Execution settings
	timeout: int = 3600 * 12  # 12 hours timeout for long distillation jobs
	retries: int = 2  # Retry failed tasks up to 2 times
	headless: bool = False  # Keep connected during execution
	callback_url: str | None = None  # Webhook URL for task completion (forwarded only when set)
	name: str | None = None  # Function name for deployment (forwarded only when set)
	task_policy: Any | None = None  # Task lifecycle policy (forwarded only when set)
	retry_for: list[str] | None = None  # Specific exceptions to retry on (forwarded only when set)

	# Environment and dependencies.
	# NOTE: mutable defaults are safe on pydantic models — field defaults are
	# copied per instance rather than shared.
	secrets: list[str] = ["HF_ACCESS_TOKEN"]  # Secret names injected into the job environment
	env_vars: dict[str, str] = {
		"TOKENIZERS_PARALLELISM": "false",
		"CUDA_LAUNCH_BLOCKING": "0",
		"PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
		"TORCH_CUDNN_V8_API_ENABLED": "1",
		# Flash attention environment variables
		"FLASH_ATTENTION_FORCE_USE": "1",
		"TORCH_COMPILE_DISABLE": "1",
	}


# Configuration for different types of Beam jobs, keyed by job type.
# Looked up via get_beam_config(), which falls back to DEFAULT_BEAM_CONFIG.
BEAM_CONFIGS: dict[str, BeamFunctionConfig] = {
	"distillation": BeamFunctionConfig(
		cpu=4.0,
		memory=16384,  # 16 GiB for distillation
		gpu=GpuType.A100_40,
		timeout=3600 * 12,  # 12 hours
		retries=2,
		secrets=["HF_ACCESS_TOKEN"],
	),
	"training": BeamFunctionConfig(
		cpu=4.0,
		memory=16384,  # 16 GiB for training
		gpu=[GpuType.H100, GpuType.A100_40],  # Acceptable GPU types for training jobs
		timeout=3600 * 12,  # 12 hours
		retries=2,
		secrets=["HF_ACCESS_TOKEN"],
	),
	"evaluation": BeamFunctionConfig(
		cpu=2.0,
		memory=8192,  # 8 GiB for evaluation (lighter than distillation/training)
		gpu=GpuType.A100_40,
		timeout=3600 * 4,  # 4 hours
		retries=3,  # One extra retry — evaluation runs are comparatively cheap
		secrets=["HF_ACCESS_TOKEN"],
	),
}

# Default beam configuration used when a job type has no dedicated entry
DEFAULT_BEAM_CONFIG = BEAM_CONFIGS["distillation"]


# Volume configurations for different workflows
class VolumeConfig(BaseModel):
	"""Volume configuration container.

	Describes a Beam persistent volume: its registered name and the path it is
	mounted at inside the job container.
	"""

	name: str  # Beam volume name
	mount_path: str  # Mount point inside the container
	description: str = ""  # Human-readable purpose of the volume


# Define volume configurations - code_model2vec is the primary volume for all workflows
VOLUMES: dict[str, VolumeConfig] = {
	"primary": VolumeConfig(
		name="code_model2vec",
		mount_path="./code_model2vec",
		description="Primary volume for all distillation models, evaluations, benchmarks, and checkpoints",
	),
	# Legacy volume name mapping for backwards compatibility — same underlying
	# volume as "primary", kept so older callers keyed on "simplified" still work.
	"simplified": VolumeConfig(
		name="code_model2vec",
		mount_path="./code_model2vec",
		description="Primary volume for all distillation models, evaluations, benchmarks, and checkpoints",
	),
}

# Default volume name for all workflows (key into VOLUMES)
DEFAULT_VOLUME = "primary"

# Legacy environment settings alias (now part of BeamFunctionConfig.env_vars)
BEAM_ENV_SETTINGS: dict[str, str] = DEFAULT_BEAM_CONFIG.env_vars

# Common Python packages installed into every Beam image.
# Version floors are minimums; Beam resolves the concrete versions at build time.
COMMON_PACKAGES: list[str] = [
	"torch>=2.7.0",
	"transformers>=4.40.0",
	"datasets>=3.2.0",
	"sentence-transformers>=4.1.0",
	"model2vec[train]>=0.5.0",
	"tokenlearn>=0.2.0",
	"numpy>=1.26.4",
	"scikit-learn>=1.6.1",
	"pandas>=2.0.0",
	"tqdm>=4.65.0",
	"plotly>=5.0.0",
	"matplotlib>=3.7.0",
	"seaborn>=0.12.0",
	"typer>=0.16.0",
	"pydantic>=2.11.5",
	"hatchling>=1.27.0",
]

# Create common Beam image without flash-attn due to PyTorch version conflicts
IMAGE = Image(python_version="python3.12").add_python_packages(COMMON_PACKAGES)

# =============================================================================
# MODEL CONFIGURATION
# =============================================================================

# Teacher model configurations: Hugging Face model IDs used as distillation
# teachers (consumed via DistillationConfig.code_teacher_models).
TEACHER_MODELS: list[str] = [
	"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
	"BAAI/bge-m3",
	"jinaai/jina-embeddings-v3",
	"lightonai/Reason-ModernColBERT",
	"Linq-AI-Research/Linq-Embed-Mistral",
	"microsoft/codebert-base",
	"microsoft/graphcodebert-base",
	"nomic-ai/nomic-embed-text-v2-moe",
	"Qodo/Qodo-Embed-1-1.5B",
	"sentence-transformers/all-MiniLM-L6-v2",
	"sentence-transformers/all-mpnet-base-v2",
	"sentence-transformers/paraphrase-MiniLM-L6-v2",
	"jinaai/jina-embeddings-v2-base-code",
]

# Default evaluation models for comparison — a superset of TEACHER_MODELS plus
# additional baselines (potion, codet5, CodeBERTa, MiniLM-L12) for benchmarking.
DEFAULT_EVALUATION_MODELS: list[str] = [
	"Alibaba-NLP/gte-Qwen2-1.5B-instruct",
	"BAAI/bge-m3",
	"huggingface/CodeBERTa-small-v1",
	"jinaai/jina-embeddings-v3",
	"lightonai/Reason-ModernColBERT",
	"Linq-AI-Research/Linq-Embed-Mistral",
	"microsoft/codebert-base",
	"microsoft/graphcodebert-base",
	"minishlab/potion-base-8M",
	"minishlab/potion-retrieval-32M",
	"minishlab/potion-multilingual-128M",
	"nomic-ai/nomic-embed-text-v2-moe",
	"Qodo/Qodo-Embed-1-1.5B",
	"Salesforce/codet5-base",
	"sentence-transformers/all-MiniLM-L12-v2",
	"sentence-transformers/all-MiniLM-L6-v2",
	"sentence-transformers/all-mpnet-base-v2",
	"sentence-transformers/paraphrase-MiniLM-L6-v2",
	"jinaai/jina-embeddings-v2-base-code",
]


# Model2Vec distillation parameters
class DistillationConfig(BaseModel):
	"""Configuration for Model2Vec distillation parameters.

	Covers teacher selection, PCA/SIF distillation knobs, and the tokenlearn
	(POTION-style) featurization/training settings.
	"""

	# Teacher models for distillation (defaults to the module-level
	# TEACHER_MODELS list; pydantic copies defaults per instance)
	code_teacher_models: list[str] = TEACHER_MODELS

	# Basic distillation parameters
	optimal_pca_dims: int = 256  # Output embedding dimensionality after PCA
	sif_coefficient: float = 1e-3  # Smooth-inverse-frequency weighting coefficient
	apply_zipf: bool = True  # Apply Zipf-based token weighting during distillation

	# Tokenlearn-specific parameters (POTION approach)
	tokenlearn_dataset: str = "allenai/c4"  # Dataset for tokenlearn featurization (following POTION paper)
	tokenlearn_dataset_name: str = "en"  # Use 'en' configuration for English text
	tokenlearn_text_key: str = "text"  # Text field to use from the dataset
	tokenlearn_timeout_featurize: int = 21600  # 6 hour timeout for featurization (dataset needs ~5 hours)
	tokenlearn_timeout_train: int = 7200  # 2 hour timeout for training

	# Dataset sampling configuration
	tokenlearn_max_samples: int = 50000  # Maximum samples to use for tokenlearn training

	# Dataset configuration
	use_optimized_dataset: bool = True  # Use the pre-created optimized dataset from dataset.py
	custom_dataset_path: str | None = "code_model2vec/dataset"  # Path to custom dataset directory


# Shared module-level instance used by the distillation workflow
distillation_config = DistillationConfig()


# =============================================================================
# DATASET CONFIGURATION
# =============================================================================


# Languages configuration shared between dataset and evaluation settings
class LanguagesConfig(BaseModel):
	"""Configuration for languages used in evaluation."""

	# The six CodeSearchNet languages
	all: list[str] = [
		"python",
		"java",
		"javascript",
		"php",
		"ruby",
		"go",
	]


languages_config = LanguagesConfig()


# CodeSearchNet evaluation settings; evaluation_languages defaults to
# languages_config.all (captured at class-definition time)
class CodeSearchNetConfig(BaseModel):
	"""Configuration for CodeSearchNet evaluation settings."""

	dataset_name: str = "code_search_net"  # Hugging Face dataset identifier
	evaluation_languages: list[str] = languages_config.all  # Languages to evaluate on
	max_queries_per_language: int = 1000  # Cap on evaluation queries per language
	similarity_threshold: float = 0.7  # Minimum similarity to count as relevant
	# Retrieval metrics reported by the evaluator
	evaluation_metrics: list[str] = ["ndcg@1", "ndcg@5", "ndcg@10", "mrr", "recall@1", "recall@5", "recall@10"]


codesearchnet_config = CodeSearchNetConfig()

# Training dataset configuration (Hugging Face dataset identifier)
TRAINING_DATASET: str = "sentence-transformers/codesearchnet"

# =============================================================================
# OUTPUT DIRECTORY CONFIGURATION
# =============================================================================


# Standardized directory structure within code_model2vec
class StandardDirectories(BaseModel):
	"""Standardized directory structure for the code_model2vec workspace.

	All paths are relative and rooted at `root`; resolved via
	get_standard_directory().
	"""

	# Root directory
	root: str = "code_model2vec"

	# Model directories
	base: str = "code_model2vec/base"  # Basic distilled models
	final: str = "code_model2vec/final"  # Final trained models
	models: str = "code_model2vec/models"  # Legacy/alternative models location

	# Results directories
	evaluation_results: str = "code_model2vec/evaluation_results"
	benchmark_results: str = "code_model2vec/benchmark_results"
	analysis_results: str = "code_model2vec/analysis_results"

	# Working directories
	checkpoints: str = "code_model2vec/checkpoints"
	cache: str = "code_model2vec/cache"
	temp: str = "code_model2vec/temp"


# Create global instance
directories = StandardDirectories()


# Legacy OutputDirs for backwards compatibility. Unlike StandardDirectories,
# these paths are relative subdirectory names combined with a caller-supplied
# base via get_output_path().
class OutputDirs(BaseModel):
	"""Base output directory structure for storing models, checkpoints, and results."""

	base: str = "base"
	models: str = "final"  # NOTE: legacy "models" key maps to the "final" directory — presumably intentional; verify against callers
	checkpoints: str = "checkpoints"
	evaluation_results: str = "evaluation_results"
	benchmark_results: str = "benchmark_results"
	analysis_results: str = "analysis_results"
	cache: str = "cache"


output_dirs = OutputDirs()


# File naming patterns
class FilenamePatterns(BaseModel):
	"""File naming patterns for evaluation, benchmark, checkpoint, and model files.

	Each field is a str.format template; patterns are rendered by
	format_filename(pattern_key, **kwargs).
	"""

	evaluation: str = "codesearchnet_eval_{model_name}.json"
	# Correctly-spelled benchmark pattern.
	benchmark: str = "benchmark_{model_name}.json"
	# Deprecated misspelling of `benchmark`, kept so existing
	# format_filename("bencmark", ...) callers keep working.
	bencmark: str = "benchmark_{model_name}.json"
	checkpoint: str = "checkpoints_{stage}_step_{step}.json"
	model: str = "{teacher_model}_{dims}d"


filename_patterns = FilenamePatterns()

# =============================================================================
# ANALYSIS AND VISUALIZATION
# =============================================================================


# Chart configuration
class ChartConfig(BaseModel):
	"""Chart configuration for analysis and visualization."""

	figsize: tuple[int, int] = (12, 8)  # Figure size in inches (width, height)
	dpi: int = 300  # Print-quality resolution for saved figures
	style: str = "whitegrid"  # Seaborn style name
	color_palette: str = "Set2"  # Seaborn/matplotlib palette name
	save_formats: list[str] = ["png", "pdf"]  # Formats each chart is exported in


chart_config = ChartConfig()


# Performance thresholds for analysis
class PerformanceThresholds(BaseModel):
	"""Score thresholds used to bucket analysis results into quality tiers."""

	excellent: float = 0.7
	good: float = 0.5
	fair: float = 0.3
	# Correctly-spelled lowest tier.
	poor: float = 0.1
	# Deprecated misspelling of `poor`, kept for callers that still read it.
	pour: float = 0.1


performance_thresholds = PerformanceThresholds()

# =============================================================================
# HELPER FUNCTIONS
# =============================================================================


def get_volume_config() -> VolumeConfig:
	"""Return the primary code_model2vec volume used by every workflow."""
	primary = VOLUMES["primary"]
	return primary


def get_output_path(base_path: str | Path, output_type: str) -> Path:
	"""Resolve the standardized output directory for `output_type` under `base_path`.

	Known output types are mapped through `output_dirs`; unknown types fall
	back to a subdirectory named after `output_type` itself.
	"""
	root = Path(base_path)
	subdirectory = getattr(output_dirs, output_type, output_type)
	return root / subdirectory


def get_standard_directory(dir_type: str) -> str:
	"""Return the standardized path for `dir_type`.

	Known types come from the `directories` instance; anything else defaults
	to a relative path inside the code_model2vec workspace.
	"""
	fallback = f"code_model2vec/{dir_type}"
	return getattr(directories, dir_type, fallback)


def ensure_checkpoint_directory(stage: str) -> str:
	"""Create the checkpoint directory for `stage` if needed and return its path."""
	stage_path = f"{directories.checkpoints}/{stage}"
	Path(stage_path).mkdir(parents=True, exist_ok=True)
	return stage_path


def format_filename(pattern_key: str, **kwargs: Any) -> str:
	"""Render the filename pattern named `pattern_key` with `kwargs`.

	Raises:
		ValueError: If `pattern_key` does not name a known pattern.
	"""
	pattern = getattr(filename_patterns, pattern_key, None)
	if pattern is None:
		msg = f"Unknown filename pattern: {pattern_key}"
		raise ValueError(msg)
	return pattern.format(**kwargs)


def get_safe_model_name(model_name: str) -> str:
	"""Convert a model name (e.g. "org/model") to a filesystem-safe name.

	Slashes become underscores; any other character that is not alphanumeric,
	"-", "_", or "." is dropped.
	"""
	# Replace "/" BEFORE filtering. The previous implementation filtered "/"
	# out first, which made its trailing .replace("/", "_") dead code and
	# collapsed "org/model" into "orgmodel" instead of "org_model".
	flattened = model_name.replace("/", "_")
	return "".join(c for c in flattened if c.isalnum() or c in ("-", "_", "."))


def get_beam_config(job_type: str = "distillation") -> BeamFunctionConfig:
	"""Return the Beam configuration registered for `job_type`.

	Unknown job types fall back to DEFAULT_BEAM_CONFIG.
	"""
	return BEAM_CONFIGS.get(job_type, DEFAULT_BEAM_CONFIG)


def create_beam_function_kwargs(
	job_type: str = "distillation", volume_config: VolumeConfig | None = None
) -> dict[str, Any]:
	"""Assemble the kwargs dictionary for Beam's @function decorator.

	Args:
		job_type: Key into BEAM_CONFIGS; unknown types use the default config.
		volume_config: Override volume; defaults to the primary volume.

	Returns:
		Keyword arguments ready to splat into the @function decorator.
	"""
	from beam import Volume

	config = get_beam_config(job_type)
	volume_cfg = volume_config or get_volume_config()

	function_kwargs: dict[str, Any] = {
		"cpu": config.cpu,
		"memory": config.memory,
		"gpu": config.gpu,
		"image": IMAGE,
		"timeout": config.timeout,
		"retries": config.retries,
		"headless": config.headless,
		"volumes": [Volume(name=volume_cfg.name, mount_path=volume_cfg.mount_path)],
		"secrets": config.secrets,
		"env": config.env_vars,
	}

	# Forward optional decorator parameters only when they carry a truthy value.
	optional_params = {
		"callback_url": config.callback_url,
		"name": config.name,
		"task_policy": config.task_policy,
		"retry_for": config.retry_for,
	}
	for key, value in optional_params.items():
		if value:
			function_kwargs[key] = value

	return function_kwargs


def get_distillation_function_kwargs() -> dict[str, Any]:
	"""Return @function decorator kwargs preconfigured for distillation jobs."""
	return create_beam_function_kwargs(job_type="distillation")


def get_training_function_kwargs() -> dict[str, Any]:
	"""Return @function decorator kwargs preconfigured for training jobs."""
	return create_beam_function_kwargs(job_type="training")


def get_evaluation_function_kwargs() -> dict[str, Any]:
	"""Return @function decorator kwargs preconfigured for evaluation jobs."""
	return create_beam_function_kwargs(job_type="evaluation")