Spaces:
Runtime error
Runtime error
import os
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Optional
@dataclass
class Paths:
    """File and directory paths configuration.

    ``BASE_DIR``, ``DATA_DIR`` and ``GENERATORS_DIR`` are the only
    constructor arguments; each defaults to a location next to this module.
    Every other attribute is excluded from ``__init__`` and is derived from
    those three roots in ``__post_init__``, which also creates the data,
    artifacts and runs directories on disk.
    """

    BASE_DIR: Path = field(default_factory=lambda: Path(__file__).parent)
    DATA_DIR: Path = field(
        default_factory=lambda: Path(__file__).parent / "data"
    )
    GENERATORS_DIR: Path = field(
        default_factory=lambda: Path(__file__).parent / "generators"
    )

    # Data subdirectories
    USE_CASES_DIR: Path = field(init=False)
    TOOLS_DIR: Path = field(init=False)
    BOTS_DIR: Path = field(init=False)
    USERS_DIR: Path = field(init=False)
    CONVERSATIONS_DIR: Path = field(init=False)
    COMPANIES_DIR: Path = field(init=False)
    ARTIFACTS_DIR: Path = field(init=False)
    RUNS_DIR: Path = field(init=False)

    # Prompt template files
    USE_CASES_PROMPT: Path = field(init=False)
    PLAN_PROMPT: Path = field(init=False)
    NARRATIVE_PROMPT: Path = field(init=False)
    TOOL_GENERATION_PROMPT: Path = field(init=False)
    TOOL_EVOLUTION_PROMPT: Path = field(init=False)
    BOT_GENERATION_PROMPT: Path = field(init=False)
    BOT_GENERATION_TEMPLATE: Path = field(init=False)
    BOT_GENERATION_STYLE: Path = field(init=False)

    # Bot instruction writing style prompt files
    BOT_WRITING_PROMPT_DECLARATIVE: Path = field(init=False)
    BOT_WRITING_PROMPT_IMPERATIVE: Path = field(init=False)

    TOOL_EVOLUTION_KNOBS_YAML: Path = field(init=False)
    USER_GENERATION_PROMPT: Path = field(init=False)
    USER_PROXIES_GENERATION_PROMPT: Path = field(init=False)
    PERSONA_GENERATION_PROMPT: Path = field(init=False)
    PERSONA_KNOBS_YAML: Path = field(init=False)
    USER_INSTRUCTION_PROMPT: Path = field(init=False)
    TOOL_INSTRUCTION_PROMPT: Path = field(init=False)
    AGENT_INSTRUCTION_PROMPT: Path = field(init=False)
    TURN_RANDOMIZER_YAML: Path = field(init=False)

    # Additional prompt files
    OPENING_INSTRUCTIONS_PROMPT: Path = field(init=False)
    USER_CARD_PROMPT: Path = field(init=False)
    USE_CASE_PROMPT_V2: Path = field(init=False)
    CHECKER_PROMPT: Path = field(init=False)
    ENRICHMENT_PROMPT: Path = field(init=False)
    ARCHETYPE_PROMPT: Path = field(init=False)

    # Data files
    COMPANIES_CSV: Path = field(init=False)
    USER_BASE_DATA_CSV: Path = field(init=False)
    COMPANY_FACTSHEETS_JSON: Path = field(init=False)

    # Output directories for specific generators
    USE_CASE_OUTPUT_DIR: Path = field(init=False)
    USE_CASE_AGG_OUTPUT: Path = field(init=False)

    def __post_init__(self):
        """Derive all non-init paths, then create the required directories."""
        data = self.DATA_DIR
        gen = self.GENERATORS_DIR

        # Data subdirectories
        self.USE_CASES_DIR = data / "use_cases"
        self.TOOLS_DIR = data / "tools"
        self.BOTS_DIR = data / "bots"
        self.USERS_DIR = data / "users"
        self.CONVERSATIONS_DIR = data / "conversations"
        # Unlike its siblings, the companies directory hangs off BASE_DIR,
        # not DATA_DIR.
        self.COMPANIES_DIR = self.BASE_DIR / "companies"
        self.ARTIFACTS_DIR = data / "artifacts"
        self.RUNS_DIR = data / "runs"

        # Per-generator folders shared by several of the paths below
        structured = gen / "structured_use_case"
        tool = gen / "tool"
        bot = gen / "bot"
        user = gen / "user"
        conversation = gen / "conversation"
        user_structured = gen / "user_structured"

        # Use case prompts
        self.USE_CASES_PROMPT = (
            gen / "use_case" / "prompts" / "use_cases_generation.txt"
        )
        self.PLAN_PROMPT = structured / "prompts" / "plan.j2"
        self.NARRATIVE_PROMPT = structured / "prompts" / "narrative.j2"

        # Tool prompts and config
        self.TOOL_GENERATION_PROMPT = (
            tool / "prompts" / "tool_generation.txt"
        )
        self.TOOL_EVOLUTION_PROMPT = tool / "prompts" / "tool_evolution.txt"
        self.TOOL_EVOLUTION_KNOBS_YAML = (
            tool / "config" / "evolution_knobs.yaml"
        )

        # Bot prompts and config
        # NOTE(review): this file sits directly under "bot", not under
        # "bot/prompts" like the other bot prompts -- confirm intended.
        self.BOT_GENERATION_PROMPT = bot / "bot_generation.txt"
        self.BOT_GENERATION_TEMPLATE = (
            bot / "prompts" / "bot_generation_v2.j2"
        )
        self.BOT_GENERATION_STYLE = bot / "config" / "generation_style.yaml"
        self.BOT_WRITING_PROMPT_DECLARATIVE = (
            bot / "prompts" / "declarative_prompt.j2"
        )
        self.BOT_WRITING_PROMPT_IMPERATIVE = (
            bot / "prompts" / "imperative_prompt.j2"
        )

        # User prompts and config
        # NOTE(review): this file sits directly under "user", not under
        # "user/prompts" like the other user prompts -- confirm intended.
        self.USER_GENERATION_PROMPT = user / "user_generation.txt"
        self.USER_PROXIES_GENERATION_PROMPT = (
            user / "prompts" / "user_proxies_generation.txt"
        )
        self.PERSONA_GENERATION_PROMPT = (
            user / "prompts" / "user_persona_generation.txt"
        )
        self.PERSONA_KNOBS_YAML = user / "config" / "persona_knobs.yaml"

        # Conversation prompts and config
        self.USER_INSTRUCTION_PROMPT = (
            conversation / "prompts" / "user_instructions.txt"
        )
        self.TOOL_INSTRUCTION_PROMPT = (
            conversation / "prompts" / "tool_instructions.txt"
        )
        self.AGENT_INSTRUCTION_PROMPT = (
            conversation / "prompts" / "agent_instructions.txt"
        )
        self.TURN_RANDOMIZER_YAML = (
            conversation / "config" / "turn_randomizer.yaml"
        )

        # Additional prompt files
        self.OPENING_INSTRUCTIONS_PROMPT = (
            conversation / "prompts" / "opening_instructions.txt"
        )
        self.USER_CARD_PROMPT = (
            user_structured / "prompts" / "user_card_prompt.txt"
        )
        self.USE_CASE_PROMPT_V2 = (
            structured / "prompts" / "use_case_prompt_v2.j2"
        )
        self.CHECKER_PROMPT = (
            gen / "checks" / "prompts" / "checker_prompt.txt"
        )
        self.ENRICHMENT_PROMPT = gen / "enrichment" / "prompts" / "prompt.j2"
        self.ARCHETYPE_PROMPT = gen / "archetype" / "prompts" / "prompt.j2"

        # Data files
        self.COMPANIES_CSV = self.COMPANIES_DIR / "companies_1000.csv"
        self.USER_BASE_DATA_CSV = user_structured / "user_base_data.csv"
        self.COMPANY_FACTSHEETS_JSON = structured / "company_factsheets.json"

        # Output locations for specific generators
        self.USE_CASE_OUTPUT_DIR = structured / "company_usecases_delta"
        self.USE_CASE_AGG_OUTPUT = (
            structured / "company_usecases_delta_agg.json"
        )

        # Create directories if they don't exist
        self._create_directories()

    def _create_directories(self):
        """Create the data, artifacts and runs directories if missing.

        The remaining data subdirectories (use cases, tools, bots, users,
        conversations, companies) are not created here.
        """
        for directory in (self.DATA_DIR, self.ARTIFACTS_DIR, self.RUNS_DIR):
            directory.mkdir(parents=True, exist_ok=True)
@dataclass
class ModelConfig:
    """LLM model configuration.

    The model identifiers are constructor arguments.  The per-task model,
    temperature and token-limit attributes are excluded from ``__init__``
    and assigned in ``__post_init__``.
    """

    # Anthropic models
    OPUS_4_1: str = "claude-opus-4-1@20250805"
    # NOTE(review): the attribute is named SONNET_4 but the identifier reads
    # "sonnet-4-5"; the name is kept so existing references keep working.
    SONNET_4: str = "claude-sonnet-4-5@20250929"

    # Gemini models
    GEMINI_FLASH_2_5: str = "gemini-2.5-flash"

    # Default models for different tasks
    USE_CASE_MODEL: str = field(init=False)
    TOOL_MODEL: str = field(init=False)
    BOT_MODEL: str = field(init=False)
    USER_MODEL: str = field(init=False)
    USER_PROXY_MODEL: str = field(init=False)
    CONVERSATION_USER_MODEL: str = field(init=False)
    CONVERSATION_AGENT_MODEL: str = field(init=False)

    # Sampling temperature per task
    USE_CASE_MODEL_TEMPERATURE: float = field(init=False)
    TOOL_MODEL_TEMPERATURE: float = field(init=False)
    BOT_MODEL_TEMPERATURE: float = field(init=False)
    USER_MODEL_TEMPERATURE: float = field(init=False)
    USER_PROXY_MODEL_TEMPERATURE: float = field(init=False)
    CONVERSATION_USER_MODEL_TEMPERATURE: float = field(init=False)
    CONVERSATION_AGENT_MODEL_TEMPERATURE: float = field(init=False)

    # Token limits per task
    USER_PROXY_MAX_TOKENS: int = field(init=False)
    USE_CASE_MODEL_MAX_TOKENS: int = field(init=False)
    BOT_MODEL_MAX_TOKENS: int = field(init=False)
    TOOL_MODEL_MAX_TOKENS: int = field(init=False)

    def __post_init__(self):
        """Assign the per-task models, temperatures and token limits."""
        # Every task currently uses whatever SONNET_4 was set to.
        default_model = self.SONNET_4
        self.USE_CASE_MODEL = default_model
        self.TOOL_MODEL = default_model
        self.BOT_MODEL = default_model
        self.USER_MODEL = default_model
        self.USER_PROXY_MODEL = default_model
        self.CONVERSATION_USER_MODEL = default_model
        self.CONVERSATION_AGENT_MODEL = default_model

        default_temperature = 1.0
        self.USE_CASE_MODEL_TEMPERATURE = default_temperature
        self.TOOL_MODEL_TEMPERATURE = default_temperature
        self.BOT_MODEL_TEMPERATURE = default_temperature
        self.USER_MODEL_TEMPERATURE = default_temperature
        self.USER_PROXY_MODEL_TEMPERATURE = default_temperature
        self.CONVERSATION_USER_MODEL_TEMPERATURE = default_temperature
        self.CONVERSATION_AGENT_MODEL_TEMPERATURE = default_temperature

        self.USER_PROXY_MAX_TOKENS = 20000
        self.USE_CASE_MODEL_MAX_TOKENS = 15000
        self.BOT_MODEL_MAX_TOKENS = 800
        self.TOOL_MODEL_MAX_TOKENS = 20000
@dataclass
class GCPConfig:
    """Google Cloud Platform configuration.

    Each field defaults to the value of an environment variable, read when
    the instance is created (not when the module is imported).
    """

    # GOOGLE_CLOUD_PROJECT, falling back to a hard-coded project id
    PROJECT_ID: str = field(
        default_factory=lambda: os.getenv(
            "GOOGLE_CLOUD_PROJECT", "gpu-reservation-sarvam"
        )
    )
    # GOOGLE_CLOUD_LOCATION, falling back to "us-east5"
    LOCATION: str = field(
        default_factory=lambda: os.getenv("GOOGLE_CLOUD_LOCATION", "us-east5")
    )
    # GOOGLE_APPLICATION_CREDENTIALS, or None when the variable is unset
    SERVICE_ACCOUNT_PATH: Optional[str] = field(
        default_factory=lambda: os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
    )
@dataclass
class ConcurrencyConfig:
    """Concurrency settings for parallel generation steps."""

    # Global default when a step-specific value isn't set; computed in
    # __post_init__ and therefore not a constructor argument.
    DEFAULT_MAX_WORKERS: int = field(init=False)

    # Optional step-specific overrides (None means "not set")
    USERS_MAX_WORKERS: Optional[int] = None
    USE_CASES_MAX_WORKERS: Optional[int] = None
    TOOLS_MAX_WORKERS: Optional[int] = None
    BOTS_MAX_WORKERS: Optional[int] = None
    CONVERSATIONS_MAX_WORKERS: Optional[int] = None
    CHECKS_MAX_WORKERS: Optional[int] = None

    def __post_init__(self):
        """Derive the default worker count from the CPU count."""
        # os.cpu_count() may return None; assume 2 CPUs in that case.
        cpu = os.cpu_count() or 2
        # Four workers per CPU, clamped to the range 1..16 to avoid
        # over-saturation.
        self.DEFAULT_MAX_WORKERS = max(1, min(16, cpu * 4))
@dataclass
class GenerationConfig:
    """Generation parameters and constants."""

    # Use case generation
    USE_CASE_MAX_WORKERS: int = 6
    USE_CASE_N_CASES: int = 18  # ask the model for ~15–20 per company

    # Pipeline configuration
    RUN_ID: str = "20250921-agentic-v1"
    THEME: str = "agentic"

    # Global parameters
    MAX_COMPANIES: int = 5
    PER_COMPANY_MAX: int = 5
    SEED: int = 142857
    SIMILARITY_THRESHOLD: float = 0.90
    SIMILARITY_THRESHOLD_USE_CASE: float = 0.80
    EMBEDDING_MODEL: str = "gemini-embedding-001"
    BATCH_SIZE: int = 64
    NUM_PERSONAS: int = 5

    # Steps configuration.  Built by a factory so each instance gets its own
    # list and mutating one instance's steps cannot affect another.
    STEPS_ORDER: List[str] = field(
        default_factory=lambda: [
            "01-enrichment",
            "02-usecase-planning",
            "03-usecases",
            "04-dedup-usecases",
            "05-tools",
            "06-bots",
            "07-proxies",
            "08-dedup-proxies",
            "09-personas",
            "10-conv",
            "11-manipulations",
            "12-checks",
            "13-fine-tuning",
        ]
    )

    # Default mode and step
    MODE: str = "single"  # or "all"
    DEFAULT_SINGLE_STEP: str = "02"  # default if --step not passed
@dataclass
class ConversationConfig:
    """Conversation generation configuration.

    The three ``*_INSTRUCTION`` attributes are not constructor arguments;
    ``__post_init__`` fills them from the prompt files located by ``Paths``,
    using a short built-in text when a file does not exist.
    """

    MAX_TURNS: int = 20
    DEFAULT_SEED_UTTERANCE: Optional[str] = None
    END_TOKEN: str = "<END>"
    MAX_COMPANIES: int = 1
    MAX_USE_CASES: int = 2

    # Instructions will be loaded from prompt files
    USER_INSTRUCTION: str = field(init=False)
    TOOL_INSTRUCTION: str = field(init=False)
    AGENT_INSTRUCTION: str = field(init=False)

    def __post_init__(self):
        """Load instructions from prompt files."""
        # Note: building Paths also creates its required directories.
        paths = Paths()

        self.USER_INSTRUCTION = self._read_prompt(
            paths.USER_INSTRUCTION_PROMPT,
            "Write only the next USER turn.",
        )
        self.TOOL_INSTRUCTION = self._read_prompt(
            paths.TOOL_INSTRUCTION_PROMPT,
            "Tool calling instructions.",
        )
        # The agent instruction is used as-is; it is not combined with the
        # tool instruction.
        self.AGENT_INSTRUCTION = self._read_prompt(
            paths.AGENT_INSTRUCTION_PROMPT,
            "Write only the next AGENT turn.",
        )

    @staticmethod
    def _read_prompt(prompt_path: Path, fallback: str) -> str:
        """Return the stripped UTF-8 text of *prompt_path*, else *fallback*.

        *fallback* is returned only when the file does not exist; an existing
        but empty file yields an empty string.
        """
        if prompt_path.exists():
            return prompt_path.read_text(encoding="utf-8").strip()
        return fallback
@dataclass
class Config:
    """Main configuration container.

    Groups the individual configuration sections.  When ``companies`` is
    left empty, ``__post_init__`` tries to fill it from the companies CSV
    and falls back to an empty list (with a printed warning) on failure.
    """

    paths: Paths = field(default_factory=Paths)
    models: ModelConfig = field(default_factory=ModelConfig)
    gcp: GCPConfig = field(default_factory=GCPConfig)
    conversation: ConversationConfig = field(
        default_factory=ConversationConfig
    )
    concurrency: ConcurrencyConfig = field(default_factory=ConcurrencyConfig)
    generation: GenerationConfig = field(default_factory=GenerationConfig)

    # Company list
    companies: List[str] = field(default_factory=list)

    # Logging configuration
    log_level: str = "INFO"
    log_format: str = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"

    def __post_init__(self):
        """Initialize the companies list after object creation."""
        if self.companies:
            # An explicitly supplied list is kept untouched.
            return
        try:
            # The number of companies loaded is bounded by the conversation
            # section's MAX_COMPANIES.
            self.companies = self._load_companies_from_csv(
                self.conversation.MAX_COMPANIES
            )
        except (FileNotFoundError, RuntimeError) as e:
            print(f"Warning: {e}")
            print("Using empty companies list for demo")
            self.companies = []

    @classmethod
    def from_env(cls) -> "Config":
        """Create a configuration with all defaults.

        This simply calls ``cls()``; any environment lookups happen in the
        default factories of the individual sections.
        """
        return cls()

    def _load_companies_from_csv(self, size: int) -> List[str]:
        """Load company names from the companies CSV file.

        Reads the ``Company_Name`` column of ``self.paths.COMPANIES_CSV`` and
        stops once ``size`` names have been collected.  Rows whose name is
        missing or blank are skipped.

        Raises:
            FileNotFoundError: if the CSV file does not exist.
            RuntimeError: if reading fails or no company name is found.
        """
        import csv

        # Use existing paths from this instance
        csv_path = self.paths.COMPANIES_CSV
        if not csv_path.exists():
            raise FileNotFoundError(
                f"Companies CSV file not found at: {csv_path}"
            )

        companies: List[str] = []
        try:
            # newline="" lets the csv module handle line endings itself.
            with open(csv_path, "r", encoding="utf-8", newline="") as f:
                for row in csv.DictReader(f):
                    # A short row yields None for the column; treat that and
                    # blank cells as "no company" instead of failing.
                    name = (row.get("Company_Name") or "").strip()
                    if name:
                        companies.append(name)
                    if len(companies) >= size:
                        break
        except Exception as e:
            # Chain the cause so the original traceback is preserved.
            raise RuntimeError(
                f"Error reading companies CSV file: {e}"
            ) from e

        if not companies:
            raise RuntimeError("No companies found in CSV file")
        return companies


# Global configuration instance, built once when this module is imported
config = Config()