Spaces:
Running
Running
File size: 2,964 Bytes
a39d8ef | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 | """
data_factory/config.py
======================
Central configuration for the NL2SQL Synthetic Data Factory.
Design philosophy:
- SQL ALWAYS comes from human-verified templates — zero SQL errors
- LLM ONLY generates natural language paraphrases — no SQL hallucination
- Every SQL is execution-validated before saving — guaranteed correctness
"""
from __future__ import annotations
from pathlib import Path
# ── Paths ─────────────────────────────────────────────────────────────────
# ROOT_DIR is the parent of the directory that contains this file; every
# other location below is derived from it. These are plain path objects:
# nothing in this section creates the directories on disk.
ROOT_DIR: Path = Path(__file__).parent.parent
DATA_DIR: Path = ROOT_DIR.joinpath("generated_data")
CHECKPOINT_DIR: Path = DATA_DIR.joinpath("checkpoints")
OUTPUT_DIR: Path = DATA_DIR.joinpath("output")
# ── vLLM / Model ──────────────────────────────────────────────────────────
# Sized for H100s with 80GB VRAM: run Llama-3-70B or Qwen-72B at full bf16.
# NOTE(review): presumably these values are handed to the vLLM engine
# (model name, tensor-parallel size, max model length, GPU memory
# utilisation) — confirm at the call site.
GENERATOR_MODEL: str = "meta-llama/Meta-Llama-3-70B-Instruct"  # change to your preferred model
TENSOR_PARALLEL: int = 4  # number of GPUs for tensor parallelism (H100 cluster)
MAX_MODEL_LEN: int = 4096  # max context length
GPU_MEMORY_UTIL: float = 0.9  # use 90%, i.e. leave 10% headroom
# ── Generation settings ───────────────────────────────────────────────────
# Persona identifiers; one NL variant is produced per persona.
PERSONAS = ["ceo", "chatty", "lazy_typist", "non_techie", "analyst"]
# Derived from PERSONAS instead of being hard-coded, so the "one per persona"
# count cannot drift when a persona is added or removed (currently 5).
NL_VARIANTS_PER_TEMPLATE = len(PERSONAS)
AUGMENTATIONS_PER_NL = 3  # rule-based variations per NL string
TEMPERATURE = 0.85  # slightly high for diversity
MAX_NEW_TOKENS = 150  # NL questions are short
# ── Scale targets ─────────────────────────────────────────────────────────
# Baseline: 56 base SQL templates × 5 personas × 3 augmentations
#           = 840 "original" records.
# With vLLM generating more NL variants, the stated target is roughly
# 500K-1M clean records.
VLLM_EXTRA_VARIANTS: int = 10  # additional vLLM NL variants per template, beyond the persona ones
# ── Validation ────────────────────────────────────────────────────────────
# Fixed seed value.
# NOTE(review): presumably used to seed the RNG(s) so runs are reproducible —
# confirm at the use sites.
RANDOM_SEED: int = 42
# ── Domains ───────────────────────────────────────────────────────────────
# Domain identifiers.
DOMAINS = [
    "ecommerce",
    "healthcare",
    "finance",
    "hr",
]
# Difficulty tier -> one-line description of the SQL features in that tier.
# Insertion order is easy, medium, hard.
DIFFICULTY_LABELS = dict(
    easy="Single-table SELECT with basic WHERE/ORDER/LIMIT.",
    medium="Multi-table JOIN with GROUP BY/HAVING/aggregates.",
    hard="CTEs, window functions, subqueries.",
)
|