data-gen / conv_data_gen /config.py
ashish-sarvam's picture
Upload folder using huggingface_hub
fc1a684 verified
import os
from dataclasses import dataclass, field
from typing import List, Optional
from pathlib import Path
@dataclass
class Paths:
    """Filesystem layout for the data-generation package.

    Only ``BASE_DIR``, ``DATA_DIR`` and ``GENERATORS_DIR`` can be passed to
    the constructor; each defaults to a location relative to the directory
    that contains this module. Every other attribute is derived from those
    three in ``__post_init__``.

    Instantiating the class has a filesystem side effect: ``DATA_DIR``,
    ``ARTIFACTS_DIR`` and ``RUNS_DIR`` are created when they are missing.
    """

    # --- Constructor-overridable roots ---------------------------------
    BASE_DIR: Path = field(default_factory=lambda: Path(__file__).parent)
    DATA_DIR: Path = field(
        default_factory=lambda: Path(__file__).parent.joinpath("data")
    )
    GENERATORS_DIR: Path = field(
        default_factory=lambda: Path(__file__).parent.joinpath("generators")
    )

    # --- Directories derived from the roots ----------------------------
    USE_CASES_DIR: Path = field(init=False)
    TOOLS_DIR: Path = field(init=False)
    BOTS_DIR: Path = field(init=False)
    USERS_DIR: Path = field(init=False)
    CONVERSATIONS_DIR: Path = field(init=False)
    COMPANIES_DIR: Path = field(init=False)
    ARTIFACTS_DIR: Path = field(init=False)
    RUNS_DIR: Path = field(init=False)

    # --- Prompt templates and generator config files -------------------
    USE_CASES_PROMPT: Path = field(init=False)
    PLAN_PROMPT: Path = field(init=False)
    NARRATIVE_PROMPT: Path = field(init=False)
    TOOL_GENERATION_PROMPT: Path = field(init=False)
    TOOL_EVOLUTION_PROMPT: Path = field(init=False)
    BOT_GENERATION_PROMPT: Path = field(init=False)
    BOT_GENERATION_TEMPLATE: Path = field(init=False)
    BOT_GENERATION_STYLE: Path = field(init=False)

    # Bot instruction writing-style prompts (declarative vs. imperative)
    BOT_WRITING_PROMPT_DECLARATIVE: Path = field(init=False)
    BOT_WRITING_PROMPT_IMPERATIVE: Path = field(init=False)
    TOOL_EVOLUTION_KNOBS_YAML: Path = field(init=False)
    USER_GENERATION_PROMPT: Path = field(init=False)
    USER_PROXIES_GENERATION_PROMPT: Path = field(init=False)
    PERSONA_GENERATION_PROMPT: Path = field(init=False)
    PERSONA_KNOBS_YAML: Path = field(init=False)
    USER_INSTRUCTION_PROMPT: Path = field(init=False)
    TOOL_INSTRUCTION_PROMPT: Path = field(init=False)
    AGENT_INSTRUCTION_PROMPT: Path = field(init=False)
    TURN_RANDOMIZER_YAML: Path = field(init=False)

    # Additional prompt files
    OPENING_INSTRUCTIONS_PROMPT: Path = field(init=False)
    USER_CARD_PROMPT: Path = field(init=False)
    USE_CASE_PROMPT_V2: Path = field(init=False)
    CHECKER_PROMPT: Path = field(init=False)
    ENRICHMENT_PROMPT: Path = field(init=False)
    ARCHETYPE_PROMPT: Path = field(init=False)

    # --- Input data files ----------------------------------------------
    COMPANIES_CSV: Path = field(init=False)
    USER_BASE_DATA_CSV: Path = field(init=False)
    COMPANY_FACTSHEETS_JSON: Path = field(init=False)

    # --- Generator-specific output locations ---------------------------
    USE_CASE_OUTPUT_DIR: Path = field(init=False)
    USE_CASE_AGG_OUTPUT: Path = field(init=False)

    def __post_init__(self):
        """Fill in every derived path, then create the output directories."""
        data_root = self.DATA_DIR
        generators = self.GENERATORS_DIR

        # Generator folders that several attributes below share.
        structured_use_case = generators / "structured_use_case"
        tool = generators / "tool"
        bot = generators / "bot"
        user = generators / "user"
        conversation = generators / "conversation"
        user_structured = generators / "user_structured"

        # Data subdirectories. COMPANIES_DIR is the only one rooted at
        # BASE_DIR rather than DATA_DIR.
        self.USE_CASES_DIR = data_root / "use_cases"
        self.TOOLS_DIR = data_root / "tools"
        self.BOTS_DIR = data_root / "bots"
        self.USERS_DIR = data_root / "users"
        self.CONVERSATIONS_DIR = data_root / "conversations"
        self.COMPANIES_DIR = self.BASE_DIR / "companies"
        self.ARTIFACTS_DIR = data_root / "artifacts"
        self.RUNS_DIR = data_root / "runs"

        # Prompt templates
        self.USE_CASES_PROMPT = generators.joinpath(
            "use_case", "prompts", "use_cases_generation.txt"
        )
        self.PLAN_PROMPT = structured_use_case / "prompts" / "plan.j2"
        self.NARRATIVE_PROMPT = (
            structured_use_case / "prompts" / "narrative.j2"
        )
        self.TOOL_GENERATION_PROMPT = (
            tool / "prompts" / "tool_generation.txt"
        )
        self.TOOL_EVOLUTION_PROMPT = tool / "prompts" / "tool_evolution.txt"
        # Sits directly in the bot folder, not under "prompts".
        self.BOT_GENERATION_PROMPT = bot / "bot_generation.txt"
        self.BOT_GENERATION_TEMPLATE = (
            bot / "prompts" / "bot_generation_v2.j2"
        )
        self.BOT_GENERATION_STYLE = bot / "config" / "generation_style.yaml"

        # Instruction writing-style prompts
        self.BOT_WRITING_PROMPT_DECLARATIVE = (
            bot / "prompts" / "declarative_prompt.j2"
        )
        self.BOT_WRITING_PROMPT_IMPERATIVE = (
            bot / "prompts" / "imperative_prompt.j2"
        )
        self.TOOL_EVOLUTION_KNOBS_YAML = (
            tool / "config" / "evolution_knobs.yaml"
        )
        # Sits directly in the user folder, not under "prompts".
        self.USER_GENERATION_PROMPT = user / "user_generation.txt"
        self.USER_PROXIES_GENERATION_PROMPT = (
            user / "prompts" / "user_proxies_generation.txt"
        )
        self.PERSONA_GENERATION_PROMPT = (
            user / "prompts" / "user_persona_generation.txt"
        )
        self.PERSONA_KNOBS_YAML = user / "config" / "persona_knobs.yaml"
        self.USER_INSTRUCTION_PROMPT = (
            conversation / "prompts" / "user_instructions.txt"
        )
        self.TOOL_INSTRUCTION_PROMPT = (
            conversation / "prompts" / "tool_instructions.txt"
        )
        self.AGENT_INSTRUCTION_PROMPT = (
            conversation / "prompts" / "agent_instructions.txt"
        )
        self.TURN_RANDOMIZER_YAML = (
            conversation / "config" / "turn_randomizer.yaml"
        )

        # Additional prompt files
        self.OPENING_INSTRUCTIONS_PROMPT = (
            conversation / "prompts" / "opening_instructions.txt"
        )
        self.USER_CARD_PROMPT = (
            user_structured / "prompts" / "user_card_prompt.txt"
        )
        self.USE_CASE_PROMPT_V2 = (
            structured_use_case / "prompts" / "use_case_prompt_v2.j2"
        )
        self.CHECKER_PROMPT = generators.joinpath(
            "checks", "prompts", "checker_prompt.txt"
        )
        self.ENRICHMENT_PROMPT = generators.joinpath(
            "enrichment", "prompts", "prompt.j2"
        )
        self.ARCHETYPE_PROMPT = generators.joinpath(
            "archetype", "prompts", "prompt.j2"
        )

        # Data files
        self.COMPANIES_CSV = self.COMPANIES_DIR / "companies_1000.csv"
        self.USER_BASE_DATA_CSV = user_structured / "user_base_data.csv"
        self.COMPANY_FACTSHEETS_JSON = (
            structured_use_case / "company_factsheets.json"
        )

        # Output locations for specific generators
        self.USE_CASE_OUTPUT_DIR = (
            structured_use_case / "company_usecases_delta"
        )
        self.USE_CASE_AGG_OUTPUT = (
            structured_use_case / "company_usecases_delta_agg.json"
        )

        self._create_directories()

    def _create_directories(self):
        """Create the data, artifacts and runs directories if missing.

        The remaining data subdirectories (use cases, tools, bots, users,
        conversations, companies) are not created here.
        """
        for target in (self.DATA_DIR, self.ARTIFACTS_DIR, self.RUNS_DIR):
            target.mkdir(parents=True, exist_ok=True)
@dataclass
class ModelConfig:
    """Model identifiers and per-task sampling settings for the LLM calls.

    The three model-id attributes can be overridden through the constructor.
    Every per-task model, temperature and token limit is assigned in
    ``__post_init__`` and is therefore not a constructor argument.
    """

    # Anthropic model ids
    OPUS_4_1: str = "claude-opus-4-1@20250805"
    # NOTE(review): the attribute is named SONNET_4 but the id it holds is a
    # "claude-sonnet-4-5" model — confirm the name is just historical.
    SONNET_4: str = "claude-sonnet-4-5@20250929"

    # Gemini model ids
    GEMINI_FLASH_2_5: str = "gemini-2.5-flash"

    # Model used for each pipeline task
    USE_CASE_MODEL: str = field(init=False)
    TOOL_MODEL: str = field(init=False)
    BOT_MODEL: str = field(init=False)
    USER_MODEL: str = field(init=False)
    USER_PROXY_MODEL: str = field(init=False)
    CONVERSATION_USER_MODEL: str = field(init=False)
    CONVERSATION_AGENT_MODEL: str = field(init=False)

    # Sampling temperature for each task
    USE_CASE_MODEL_TEMPERATURE: float = field(init=False)
    TOOL_MODEL_TEMPERATURE: float = field(init=False)
    BOT_MODEL_TEMPERATURE: float = field(init=False)
    USER_MODEL_TEMPERATURE: float = field(init=False)
    USER_PROXY_MODEL_TEMPERATURE: float = field(init=False)
    CONVERSATION_USER_MODEL_TEMPERATURE: float = field(init=False)
    CONVERSATION_AGENT_MODEL_TEMPERATURE: float = field(init=False)

    # Output token limits
    USER_PROXY_MAX_TOKENS: int = field(init=False)
    USE_CASE_MODEL_MAX_TOKENS: int = field(init=False)
    BOT_MODEL_MAX_TOKENS: int = field(init=False)
    TOOL_MODEL_MAX_TOKENS: int = field(init=False)

    def __post_init__(self):
        """Assign the per-task models, temperatures and token limits."""
        # Every task currently uses whatever id SONNET_4 holds, so passing
        # SONNET_4=... to the constructor redirects all of them at once.
        task_model = self.SONNET_4
        self.USE_CASE_MODEL = task_model
        self.TOOL_MODEL = task_model
        self.BOT_MODEL = task_model
        self.USER_MODEL = task_model
        self.USER_PROXY_MODEL = task_model
        self.CONVERSATION_USER_MODEL = task_model
        self.CONVERSATION_AGENT_MODEL = task_model

        # A single temperature shared by all tasks.
        task_temperature = 1.0
        self.USE_CASE_MODEL_TEMPERATURE = task_temperature
        self.TOOL_MODEL_TEMPERATURE = task_temperature
        self.BOT_MODEL_TEMPERATURE = task_temperature
        self.USER_MODEL_TEMPERATURE = task_temperature
        self.USER_PROXY_MODEL_TEMPERATURE = task_temperature
        self.CONVERSATION_USER_MODEL_TEMPERATURE = task_temperature
        self.CONVERSATION_AGENT_MODEL_TEMPERATURE = task_temperature

        # Token limits differ per task.
        self.USER_PROXY_MAX_TOKENS = 20_000
        self.USE_CASE_MODEL_MAX_TOKENS = 15_000
        self.BOT_MODEL_MAX_TOKENS = 800
        self.TOOL_MODEL_MAX_TOKENS = 20_000
@dataclass
class GCPConfig:
    """Google Cloud Platform settings, resolved from the environment.

    Each default is read from ``os.environ`` when an instance is created
    (not when the module is imported), so environment changes made before
    construction are picked up. Explicit constructor arguments take
    precedence over the environment.
    """

    # GOOGLE_CLOUD_PROJECT, falling back to a built-in project id.
    PROJECT_ID: str = field(
        default_factory=lambda: os.environ.get(
            "GOOGLE_CLOUD_PROJECT", "gpu-reservation-sarvam"
        )
    )
    # GOOGLE_CLOUD_LOCATION, falling back to "us-east5".
    LOCATION: str = field(
        default_factory=lambda: os.environ.get(
            "GOOGLE_CLOUD_LOCATION", "us-east5"
        )
    )
    # GOOGLE_APPLICATION_CREDENTIALS; None when the variable is unset.
    SERVICE_ACCOUNT_PATH: Optional[str] = field(
        default_factory=lambda: os.environ.get(
            "GOOGLE_APPLICATION_CREDENTIALS"
        )
    )
@dataclass
class ConcurrencyConfig:
    """Worker-count settings for the parallel generation steps.

    ``DEFAULT_MAX_WORKERS`` is computed from the machine's CPU count in
    ``__post_init__``. The per-step attributes default to ``None``.
    """

    # Fallback worker count; computed, so not a constructor argument.
    DEFAULT_MAX_WORKERS: int = field(init=False)

    # Optional per-step overrides
    USERS_MAX_WORKERS: Optional[int] = None
    USE_CASES_MAX_WORKERS: Optional[int] = None
    TOOLS_MAX_WORKERS: Optional[int] = None
    BOTS_MAX_WORKERS: Optional[int] = None
    CONVERSATIONS_MAX_WORKERS: Optional[int] = None
    CHECKS_MAX_WORKERS: Optional[int] = None

    def __post_init__(self):
        """Set the default worker count to 4x the CPU count, within 1..16."""
        detected_cpus = os.cpu_count()
        if not detected_cpus:
            # os.cpu_count() can return None; assume two cores in that case.
            detected_cpus = 2
        # The upper bound of 16 avoids over-saturating the machine.
        capped = min(16, detected_cpus * 4)
        self.DEFAULT_MAX_WORKERS = max(1, capped)
@dataclass
class GenerationConfig:
    """Tunable constants for the generation pipeline.

    All attributes are plain defaults that can be overridden through the
    constructor; nothing is computed after initialisation.
    """

    # --- Use case generation -------------------------------------------
    USE_CASE_MAX_WORKERS: int = 6
    # Ask the model for ~15–20 use cases per company.
    USE_CASE_N_CASES: int = 18

    # --- Pipeline identity ---------------------------------------------
    RUN_ID: str = "20250921-agentic-v1"
    THEME: str = "agentic"

    # --- Global parameters ---------------------------------------------
    MAX_COMPANIES: int = 5
    PER_COMPANY_MAX: int = 5
    SEED: int = 142857
    SIMILARITY_THRESHOLD: float = 0.9
    SIMILARITY_THRESHOLD_USE_CASE: float = 0.8
    EMBEDDING_MODEL: str = "gemini-embedding-001"
    BATCH_SIZE: int = 64
    NUM_PERSONAS: int = 5

    # --- Step ordering -------------------------------------------------
    # Built by a factory so that each instance owns its own list.
    STEPS_ORDER: List[str] = field(
        default_factory=lambda: [
            "01-enrichment",
            "02-usecase-planning",
            "03-usecases",
            "04-dedup-usecases",
            "05-tools",
            "06-bots",
            "07-proxies",
            "08-dedup-proxies",
            "09-personas",
            "10-conv",
            "11-manipulations",
            "12-checks",
            "13-fine-tuning",
        ]
    )

    # --- Default run mode ----------------------------------------------
    # Either "single" or "all".
    MODE: str = "single"
    # Step used when --step is not passed.
    DEFAULT_SINGLE_STEP: str = "02"
@dataclass
class ConversationConfig:
    """Settings for conversation generation.

    The three ``*_INSTRUCTION`` attributes are not constructor arguments:
    ``__post_init__`` reads them from the prompt files named by ``Paths``
    and falls back to a short built-in sentence when a file does not exist.

    Note:
        ``__post_init__`` builds its own ``Paths()`` instance, so creating a
        ``ConversationConfig`` also triggers the directory creation that
        ``Paths`` performs on construction.
    """

    MAX_TURNS: int = 20
    DEFAULT_SEED_UTTERANCE: Optional[str] = None
    END_TOKEN: str = "<END>"
    MAX_COMPANIES: int = 1
    MAX_USE_CASES: int = 2

    # Loaded from prompt files in __post_init__.
    USER_INSTRUCTION: str = field(init=False)
    TOOL_INSTRUCTION: str = field(init=False)
    AGENT_INSTRUCTION: str = field(init=False)

    def __post_init__(self):
        """Load the user, tool and agent instructions from prompt files."""
        paths = Paths()
        self.USER_INSTRUCTION = self._read_prompt(
            paths.USER_INSTRUCTION_PROMPT,
            "Write only the next USER turn.",
        )
        self.TOOL_INSTRUCTION = self._read_prompt(
            paths.TOOL_INSTRUCTION_PROMPT,
            "Tool calling instructions.",
        )
        # The agent instruction is used as-is; it is NOT merged with the
        # tool instruction (consumers receive the two separately).
        self.AGENT_INSTRUCTION = self._read_prompt(
            paths.AGENT_INSTRUCTION_PROMPT,
            "Write only the next AGENT turn.",
        )

    @staticmethod
    def _read_prompt(prompt_path: Path, fallback: str) -> str:
        """Return the stripped UTF-8 text of *prompt_path*, or *fallback*.

        Args:
            prompt_path: Location of the prompt file.
            fallback: Text to return when the file does not exist.

        Returns:
            The file content with surrounding whitespace removed (which may
            be an empty string for an empty file), or ``fallback`` when
            ``prompt_path`` does not exist.
        """
        if not prompt_path.exists():
            return fallback
        return prompt_path.read_text(encoding="utf-8").strip()
@dataclass
class Config:
    """Top-level container that aggregates every configuration section.

    Each section is built by its own default factory. When ``companies`` is
    left empty, ``__post_init__`` tries to fill it from the companies CSV
    and falls back to an empty list (with a printed warning) on failure.
    """

    paths: Paths = field(default_factory=Paths)
    models: ModelConfig = field(default_factory=ModelConfig)
    gcp: GCPConfig = field(default_factory=GCPConfig)
    conversation: ConversationConfig = field(
        default_factory=ConversationConfig
    )
    concurrency: ConcurrencyConfig = field(default_factory=ConcurrencyConfig)
    generation: GenerationConfig = field(default_factory=GenerationConfig)

    # Company names; loaded from the companies CSV when left empty.
    companies: List[str] = field(default_factory=list)

    # Logging configuration
    log_level: str = "INFO"
    log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    def __post_init__(self):
        """Populate ``companies`` from the CSV unless the caller supplied it.

        Loading is best-effort: a missing or unreadable CSV is reported on
        stdout and leaves ``companies`` as an empty list.
        """
        if self.companies:
            return
        # NOTE(review): the limit comes from conversation.MAX_COMPANIES
        # (default 1), not generation.MAX_COMPANIES (default 5) — confirm
        # this is the intended knob.
        try:
            self.companies = self._load_companies_from_csv(
                self.conversation.MAX_COMPANIES
            )
        except (FileNotFoundError, RuntimeError) as e:
            print(f"Warning: {e}")
            print("Using empty companies list for demo")
            self.companies = []

    @classmethod
    def from_env(cls) -> "Config":
        """Build a configuration using defaults and environment variables.

        This is equivalent to calling ``cls()``; the environment is consulted
        only by the default factories of the sections that read it (the GCP
        section reads ``GOOGLE_*`` variables).
        """
        return cls()

    def _load_companies_from_csv(self, size: int) -> List[str]:
        """Read company names from the ``Company_Name`` CSV column.

        Rows whose ``Company_Name`` cell is missing or blank are skipped and
        do not count toward ``size``.

        Args:
            size: Stop reading once this many names have been collected. The
                limit is checked after each row, so a non-positive value
                still yields the first available name.

        Returns:
            The collected names with surrounding whitespace removed, in file
            order.

        Raises:
            FileNotFoundError: The CSV file does not exist.
            RuntimeError: The file could not be read or parsed, or it
                contained no usable company name.
        """
        import csv

        csv_path = self.paths.COMPANIES_CSV
        companies: List[str] = []
        try:
            # newline="" is what the csv module expects so that newlines
            # embedded in quoted fields are parsed correctly.
            with open(csv_path, "r", encoding="utf-8", newline="") as f:
                for row in csv.DictReader(f):
                    # A short row yields None for the missing column; treat
                    # it like a blank cell instead of failing the whole load.
                    name = (row.get("Company_Name") or "").strip()
                    if name:
                        companies.append(name)
                    if len(companies) >= size:
                        break
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"Companies CSV file not found at: {csv_path}"
            ) from e
        except (OSError, UnicodeError, csv.Error) as e:
            raise RuntimeError(
                f"Error reading companies CSV file: {e}"
            ) from e

        if not companies:
            raise RuntimeError("No companies found in CSV file")
        return companies
# Global configuration instance, built once when this module is imported.
# Constructing it runs every section's __post_init__, so the import itself
# creates the data/artifacts/runs directories, reads the conversation prompt
# files when they exist, and attempts to load company names from the
# companies CSV (printing a warning and using an empty list on failure).
config = Config()