|
|
import argparse |
|
|
import json |
|
|
import random |
|
|
import time |
|
|
from pathlib import Path |
|
|
from tqdm import tqdm |
|
|
from google import genai |
|
|
from google.genai import types |
|
|
|
|
|
from helpers.scenarios import SCENARIOS |
|
|
from helpers.styles import PERSONAS, QUIRKS, SPEAKING_STYLES, GRAMMAR_QUIRKS, TONES |
|
|
|
|
|
|
|
|
# System prompt sent (concatenated with the user prompt) on every generation
# request. Fixes vs. previous revision: removed the dangling, unfinished
# sentence "The idea is", and restored the scale arrows that had been
# mojibake'd into stray Greek letters.
SYSTEM_PROMPT = """\
You are a synthetic data generator for Z-number decision matrices.

Your task: Given a decision scenario, generate TWO things:
1. A realistic, subjective conversational user query (as if a real person is thinking out loud about this decision)
2. The corresponding Z-number decision matrix extracted from that query

## User Query Guidelines:
- Write in first person, casual/conversational tone
- Include hedging language ("I think", "probably", "not sure", "maybe")
- Be subjective, messy, unstructured, uncertain, overloaded with information, overthinking
- Describe preferences using natural expressions like: "amazing", "terrible", "pretty good", "kind of a nightmare", "super expensive", "really matters to me", "not a huge deal", "I've heard it's cheaper", "supposedly independent", etc.
- The Z-number matrix is YOUR extraction/interpretation of the user's natural rambling thoughts
- It MUST NOT contain any Z-Number Scales

## Decision Matrix Format:
Return a Markdown table:

| | criterion_1 | criterion_2 | ... |
|---|---|---|---|
| type | benefit | cost | ... |
| alt_1 | 4:3 | -3:4 | ... |
| alt_2 | 3:4 | -2:5 | ... |
| weight | 5:4 | 3:3 | ... |

## Z-Number Scales for Decision Matrix:
- Value (A-part):
  - benefit: 5 (excellent) → 4 (good) → 3 (moderate) → 2 (poor) → 1 (very poor)
  - cost: -1 (very low cost) → -2 (low) → -3 (moderate) → -4 (high) → -5 (very high cost)
- Confidence (B-part): 5 (very confident) → 4 (confident) → 3 (somewhat confident) → 2 (uncertain) → 1 (very uncertain)

Rules:
- First row: criterion names (snake_case)
- Second row: "type" then "benefit" or "cost"
- Middle rows: alternative names, then VALUE:CONFIDENCE pairs
- Last row: "weight" with importance weights (positive 1-5 only)
- VALUE: positive (1-5) for benefits, negative (-1 to -5) for costs
- CONFIDENCE: always positive (1-5)
"""
|
|
|
|
|
# Per-request prompt body, filled in via str.format() by generate_sample()
# with the scenario, matrix dimensions, and sampled style attributes.
# The doubled braces ({{ / }}) emit literal JSON braces after formatting.
USER_PROMPT_TEMPLATE = """\
Generate synthetic training data for the following:

**Scenario:** {scenario}
**Number of alternatives:** {n_alternatives}
**Number of criteria:** {n_criteria}

**Match this style user_query:**
- **Persona:** {persona}
- **Tone:** {tone}
- **Speaking style:** {speaking_style}
- **Must include this quirk:** {quirk}
- **Grammar quirk:** {grammar_quirk}

Respond with valid JSON only:
{{
"user_query": "<the messy subjective conversational user query>",
"decision_matrix": "<the markdown table>"
}}
"""
|
|
|
|
|
|
|
|
def sample_scenario() -> tuple[str, str]:
    """Pick a uniformly random category, then a scenario within it.

    Returns:
        A ``(category, scenario)`` pair drawn from ``SCENARIOS``.
    """
    # Choosing the category first means every category is equally likely,
    # regardless of how many scenarios it contains.
    chosen_category = random.choice(list(SCENARIOS))
    chosen_scenario = random.choice(SCENARIOS[chosen_category])
    return chosen_category, chosen_scenario
|
|
|
|
|
|
|
|
def generate_sample(
    client: genai.Client,
    model: str,
    scenario: str,
    n_alternatives: int,
    n_criteria: int,
    style: dict,
    max_retries: int = 3,
) -> dict | None:
    """Generate a single training sample via the Gemini API, with retries.

    Args:
        client: Initialized genai client.
        model: Model name to query.
        scenario: Decision scenario inserted into the prompt.
        n_alternatives: Requested number of alternatives in the matrix.
        n_criteria: Requested number of criteria in the matrix.
        style: Mapping with keys "persona", "tone", "speaking_style",
            "quirk" and "grammar_quirk" (as produced in generate_dataset).
        max_retries: Maximum number of API attempts before giving up.

    Returns:
        The parsed JSON dict containing "user_query" and "decision_matrix",
        or None if every attempt failed.
    """
    user_prompt = USER_PROMPT_TEMPLATE.format(
        scenario=scenario,
        n_alternatives=n_alternatives,
        n_criteria=n_criteria,
        persona=style["persona"],
        tone=style["tone"],
        speaking_style=style["speaking_style"],
        quirk=style["quirk"],
        grammar_quirk=style["grammar_quirk"],
    )

    for attempt in range(max_retries):
        try:
            response = client.models.generate_content(
                model=model,
                # System and user prompts are concatenated into a single user
                # turn rather than using a dedicated system instruction.
                contents=[
                    types.Content(
                        role="user",
                        parts=[types.Part(text=SYSTEM_PROMPT + "\n\n" + user_prompt)],
                    )
                ],
                config=types.GenerateContentConfig(
                    thinking_config=types.ThinkingConfig(thinking_level="minimal"),
                    response_mime_type="application/json",
                    temperature=1.0,
                ),
            )

            # response.text can be None (e.g. no candidate returned); treat
            # that as empty so it surfaces as a JSON parse error, not an
            # opaque AttributeError.
            text = (response.text or "").strip()
            data = json.loads(text)

            if "user_query" in data and "decision_matrix" in data:
                return data

            # Valid JSON but wrong schema: previously this retried silently.
            print(f" [Attempt {attempt + 1}] Missing required keys, retrying")

        except json.JSONDecodeError as e:
            print(f" [Attempt {attempt + 1}] JSON parse error: {e}")
        except Exception as e:
            print(f" [Attempt {attempt + 1}] Error: {e}")

        # Exponential backoff between attempts; skip the pointless sleep
        # after the final attempt.
        if attempt < max_retries - 1:
            time.sleep(2 ** attempt)

    return None
|
|
|
|
|
|
|
|
def generate_dataset(
    api_key: str,
    n_samples: int,
    output_path: Path,
    model: str = "gemini-3-flash-preview",
    min_alternatives: int = 2,
    max_alternatives: int = 5,
    min_criteria: int = 3,
    max_criteria: int = 7,
) -> None:
    """Generate the full dataset, appending one JSON object per line.

    Args:
        api_key: Gemini API key used to construct the client.
        n_samples: Number of samples to attempt.
        output_path: JSONL output file (opened in append mode per sample).
        model: Model name passed through to the API.
        min_alternatives: Inclusive lower bound on alternatives per sample.
        max_alternatives: Inclusive upper bound on alternatives per sample.
        min_criteria: Inclusive lower bound on criteria per sample.
        max_criteria: Inclusive upper bound on criteria per sample.
    """
    client = genai.Client(api_key=api_key)

    samples = []
    failed = 0

    print(f"Generating {n_samples} samples...")
    print(f"Model: {model}")
    print(f"Alternatives: {min_alternatives}-{max_alternatives}")
    print(f"Criteria: {min_criteria}-{max_criteria}")
    print("-" * 50)

    for i in tqdm(range(n_samples)):
        category, scenario = sample_scenario()
        n_alt = random.randint(min_alternatives, max_alternatives)
        n_crit = random.randint(min_criteria, max_criteria)

        # Randomize the query style so the dataset covers many voices.
        style = dict(
            persona=random.choice(PERSONAS),
            tone=random.choice(TONES),
            quirk=random.choice(QUIRKS),
            grammar_quirk=random.choice(GRAMMAR_QUIRKS),
            speaking_style=random.choice(SPEAKING_STYLES),
        )

        print(f"[{i + 1}/{n_samples}] {scenario} ({n_alt} alts, {n_crit} criteria) | Style: {style}")

        result = generate_sample(
            client=client,
            model=model,
            scenario=scenario,
            n_alternatives=n_alt,
            n_criteria=n_crit,
            style=style,
        )

        if result:
            sample = {
                "id": i,
                "category": category,
                "scenario": scenario,
                "n_alternatives": n_alt,
                "n_criteria": n_crit,
                "user_query": result["user_query"],
                "decision_matrix": result["decision_matrix"],
                "style": style,
            }
            samples.append(sample)

            # Append immediately so completed samples survive a crash or
            # interrupt. NOTE(review): append mode means reruns accumulate
            # into the same file — confirm that is the intended resume
            # behavior.
            with open(output_path, "a", encoding="utf-8") as f:
                f.write(json.dumps(sample, ensure_ascii=False) + "\n")
        else:
            failed += 1
            # Was an f-string with no placeholders; plain string suffices.
            print(" ✗ Failed to generate sample")

        # Crude rate limiting between requests. The previous guard
        # `(i + 1) % 1 == 0` was always true, so this preserves behavior.
        time.sleep(1)

    print("-" * 50)
    print(f"✅ Generated: {len(samples)} samples")
    print(f"✗ Failed: {failed} samples")
    print(f"Output: {output_path}")
|
|
|
|
|
|
|
|
def main():
    """CLI entry point: parse arguments and run dataset generation."""
    parser = argparse.ArgumentParser(
        description="Generate Z-number decision matrix training data"
    )
    parser.add_argument("--n", type=int, default=100,
                        help="Number of samples to generate (default: 100)")
    parser.add_argument("--api-key", type=str, required=True,
                        help="Gemini API key")
    parser.add_argument("--output", type=str, default="train.jsonl",
                        help="Output JSONL file path (default: train.jsonl)")
    parser.add_argument("--model", type=str, default="gemini-3-flash-preview",
                        help="Model to use (default: gemini-3-flash-preview)")
    parser.add_argument("--min-alt", type=int, default=2,
                        help="Minimum number of alternatives (default: 2)")
    parser.add_argument("--max-alt", type=int, default=5,
                        help="Maximum number of alternatives (default: 5)")
    parser.add_argument("--min-crit", type=int, default=3,
                        help="Minimum number of criteria (default: 3)")
    parser.add_argument("--max-crit", type=int, default=7,
                        help="Maximum number of criteria (default: 7)")

    args = parser.parse_args()

    generate_dataset(
        api_key=args.api_key,
        n_samples=args.n,
        output_path=Path(args.output),
        model=args.model,
        min_alternatives=args.min_alt,
        max_alternatives=args.max_alt,
        min_criteria=args.min_crit,
        max_criteria=args.max_crit,
    )
|
|
|
|
|
|
|
|
# Run the CLI only when executed as a script, so the module can be imported
# without side effects.
if __name__ == "__main__":
    main()