File size: 34,133 Bytes
95d976b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
{"id": "bank_hard_001", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND balance in Q1?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 2.44, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['balance'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q1')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "balance", "quartile": "Q1", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_005", "dataset": "bank", "goal": "Among customers in top 95% of age AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.59, "answer_type": "scalar", "verification_code": "p_high = df['age'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['age'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "age", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_012", "dataset": "bank", "goal": "Find the job with lowest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 8.27, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_019", "dataset": "bank", "goal": "What's the subscription rate for month='may' AND job='management' AND age in Q4?", "expected_output_type": "scalar", "level": "L6", "template": "multi_condition_filter", "golden": {"answer_value": 7.04, "answer_type": "scalar", "verification_code": "df['_q'] = pd.qcut(df['age'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nfiltered = df[(df['month'] == 'may') & (df['job'] == 'management') & (df['_q'] == 'Q4')]\nresult = round((filtered['y'] == 1).mean() * 100, 2) if len(filtered) > 0 else 0.0\ndf.drop('_q', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "null", "missing", "nan", "empty", "na"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "multi_condition_filter", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"col_a": "month", "val_a": "may", "col_b": "job", "val_b": "management", "metric": "age", "quartile": "Q4", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_020", "dataset": "bank", "goal": "For month with above-average day, which have subscription rate > 25%? Return sorted list.", "expected_output_type": "list", "level": "L5", "template": "top_n_in_segment", "golden": {"answer_value": ["oct"], "answer_type": "list", "verification_code": "avg_metric = df.groupby('month')['day'].mean()\nhigh_metric = avg_metric[avg_metric > avg_metric.mean()].index\nrates = df[df['month'].isin(high_metric)].groupby('month')['y'].apply(\n    lambda x: (x == 1).mean() * 100)\nresult = sorted(rates[rates > 25].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["tie", "ties", "equal", "same", "duplicate"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "top_n_in_segment", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day", "threshold": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_021", "dataset": "bank", "goal": "Find the job with highest average balance. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 24.62, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['balance'].mean()\nextrema_val = group_stats.max() if 'highest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "balance", "extrema": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_023", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 10% of day, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 33.53, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['day'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['day'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "day", "pct_high": 90, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_026", "dataset": "bank", "goal": "For each job, compute volatility score: std(age) / mean(age). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"job": "unemployed", "mean": 40.97, "std": 9.74, "volatility": 0.2377}, {"job": "self-employed", "mean": 40.42, "std": 9.46, "volatility": 0.234}, {"job": "admin.", "mean": 39.68, "std": 9.23, "volatility": 0.2326}, {"job": "services", "mean": 38.94, "std": 8.86, "volatility": 0.2275}, {"job": "management", "mean": 40.2, "std": 9.13, "volatility": 0.2271}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('job')['age'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age"}}}
{"id": "bank_hard_028", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 10% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.84, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(10 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 10, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_029", "dataset": "bank", "goal": "Among customers in top 90% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.54, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(90 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 90, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_030", "dataset": "bank", "goal": "Rank job by subscription rate. Which bottom-3 have above-median age?", "expected_output_type": "list", "level": "L6", "template": "ranked_anomaly", "golden": {"answer_value": ["blue-collar", "entrepreneur"], "answer_type": "list", "verification_code": "stats = df.groupby('job').agg(\n    rate=('y', lambda x: (x == 1).mean()),\n    avg_metric=('age', 'mean'))\nstats['rank'] = stats['rate'].rank()\nbottom_3 = stats[stats['rank'] <= 3]\nresult = sorted(bottom_3[bottom_3['avg_metric'] > stats['avg_metric'].median()].index.tolist())", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rank", "order", "sort", "ascending", "descending"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "ranked_anomaly", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "age", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_031", "dataset": "bank", "goal": "For each month, compute volatility score: std(day) / mean(day). Return top 5 with [group, mean, std, volatility].", "expected_output_type": "dataframe", "level": "L6", "template": "segment_volatility", "golden": {"answer_value": [{"month": "feb", "mean": 6.05, "std": 5.46, "volatility": 0.9025}, {"month": "sep", "mean": 11.67, "std": 8.04, "volatility": 0.6889}, {"month": "mar", "mean": 13.45, "std": 9.18, "volatility": 0.6825}, {"month": "jun", "mean": 11.32, "std": 7.33, "volatility": 0.6475}, {"month": "dec", "mean": 14.19, "std": 8.83, "volatility": 0.6223}], "answer_type": "dataframe", "verification_code": "stats = df.groupby('month')['day'].agg(['mean', 'std']).round(2)\nstats['volatility'] = round(stats['std'] / stats['mean'], 4)\nresult = stats.nlargest(5, 'volatility').reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits", "volatility", "cv", "coefficient", "variation"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "segment_volatility", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "month", "metric": "day"}}}
{"id": "bank_hard_033", "dataset": "bank", "goal": "Show the average balance breakdown by job. Include count and mean balance for each category, sorted by mean descending.", "expected_output_type": "dataframe", "level": "L4", "template": "metric_breakdown", "golden": {"answer_value": [{"job": "retired", "count": 35185, "mean_balance": 1812.07}, {"job": "unknown", "count": 2917, "mean_balance": 1678.96}, {"job": "self-employed", "count": 19020, "mean_balance": 1598.27}, {"job": "student", "count": 11767, "mean_balance": 1577.32}, {"job": "management", "count": 175541, "mean_balance": 1510.39}, {"job": "unemployed", "count": 17634, "mean_balance": 1440.57}, {"job": "entrepreneur", "count": 17718, "mean_balance": 1306.75}, {"job": "housemaid", "count": 15912, "mean_balance": 1281.22}, {"job": "technician", "count": 138107, "mean_balance": 1071.57}, {"job": "admin.", "count": 81492, "mean_balance": 1019.92}, {"job": "blue-collar", "count": 170498, "mean_balance": 977.49}, {"job": "services", "count": 64209, "mean_balance": 834.63}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('job').agg(\n    count=('balance', 'size'),\n    mean_balance=('balance', lambda x: round(x.mean(), 2))\n).sort_values('mean_balance', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["round", "decimal", "precision", "digits"], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "metric_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "metric": "balance"}}}
{"id": "bank_hard_035", "dataset": "bank", "goal": "Find the job with lowest average age. What is the subscription rate for that segment?", "expected_output_type": "scalar", "level": "L5", "template": "chain_conversion", "golden": {"answer_value": 34.08, "answer_type": "scalar", "verification_code": "group_stats = df.groupby('job')['age'].mean()\nextrema_val = group_stats.max() if 'lowest' == 'highest' else group_stats.min()\ntied = group_stats[group_stats == extrema_val].sort_index()\nextrema_group = tied.index[0]\nsubset = df[df['job'] == extrema_group]\nresult = round((subset['y'] == 1).mean() * 100, 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "chain_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "job", "metric": "age", "extrema": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_038", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average day?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 16.09, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['day'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "day", "extrema_outer": "lowest", "extrema_inner": "highest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_039", "dataset": "bank", "goal": "Divide customers into 4 day quartiles. What is the subscription percentage (0-100) in the highest (top 25%) (Q4) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "quartile_conversion", "golden": {"answer_value": 11.55, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['day'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q4']\nresult = round((bin_data['y'] == 1).mean() * 100, 2)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["quartile", "how", "define", "q1", "q2", "q3", "q4", "bins", "bucket", "positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "quartile_conversion", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "day", "quartile": "Q4", "quartile_desc": "highest (top 25%)", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_040", "dataset": "bank", "goal": "Among customers in top 95% of balance AND bottom 25% of duration, what's subscription rate?", "expected_output_type": "scalar", "level": "L6", "template": "percentile_cohort", "golden": {"answer_value": 0.65, "answer_type": "scalar", "verification_code": "p_high = df['balance'].quantile(95 / 100)\np_low = df['duration'].quantile(25 / 100)\ncohort = df[(df['balance'] >= p_high) & (df['duration'] <= p_low)]\nresult = round((cohort['y'] == 1).mean() * 100, 2) if len(cohort) > 0 else 0.0", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["percentile", "inclusive", "exclusive", "boundary", "include", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Complex analysis with 3+ operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 6, "template_name": "percentile_cohort", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"metric_a": "balance", "metric_b": "duration", "pct_high": 95, "pct_low": 25, "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_041", "dataset": "bank", "goal": "Which job categories would have the biggest impact if brought to average subscription rate? Return top 3 by potential gain (count * rate gap), sorted by impact.", "expected_output_type": "list", "level": "L5", "template": "segment_improvement_potential", "golden": {"answer_value": ["blue-collar", "services", "entrepreneur"], "answer_type": "list", "verification_code": "overall_rate = (df['y'] == 1).mean()\ngroup_stats = df.groupby('job').agg(\n    rate=('y', lambda x: (x == 1).mean()),\n    count=('y', 'size')\n)\ngroup_stats['gap'] = overall_rate - group_stats['rate']\ngroup_stats['potential'] = group_stats['count'] * group_stats['gap']\ntop_potential = group_stats[group_stats['gap'] > 0].nlargest(3, 'potential')\nresult = top_potential.index.tolist()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": ["positive", "success", "target", "y=1", "class", "outcome", "rate", "percentage", "decimal", "format", "0-100", "0-1"], "success_criteria": ["Answer must match expected value", "List elements must match (order matters)"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "segment_improvement_potential", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "job", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "bank_hard_044", "dataset": "bank", "goal": "Find the month with the lowest subscription rate. Within that group, what is the average age?", "expected_output_type": "scalar", "level": "L5", "template": "nested_extrema", "golden": {"answer_value": 38.98, "answer_type": "scalar", "verification_code": "group_rates = df.groupby('month')['y'].apply(lambda x: (x == 1).mean())\nouter_val = group_rates.max() if 'lowest' == 'highest' else group_rates.min()\nouter_tied = group_rates[group_rates == outer_val].sort_index()\nouter_group = outer_tied.index[0]\nsubset = df[df['month'] == outer_group]\nresult = round(subset['age'].mean(), 2)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": ["tie", "ties", "equal", "same", "duplicate", "positive", "success", "target", "y=1", "class", "outcome"], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "nested_extrema", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "month", "metric": "age", "extrema_outer": "lowest", "extrema_inner": "lowest", "target_col": "y", "target_pos": "1", "target_desc": "subscription"}}}
{"id": "road_hard_014", "dataset": "road", "goal": "Which lighting categories have the highest total reported accidents? Show breakdown with [lighting, count, total_num_reported_accidents, avg_num_reported_accidents] sorted by total descending.", "expected_output_type": "dataframe", "level": "L4", "template": "count_segment_total", "golden": {"answer_value": [{"lighting": "dim", "count": 183826, "total_num_reported_accidents": 211283, "avg_num_reported_accidents": 1.15}, {"lighting": "daylight", "count": 178015, "total_num_reported_accidents": 207579, "avg_num_reported_accidents": 1.17}, {"lighting": "night", "count": 155913, "total_num_reported_accidents": 196214, "avg_num_reported_accidents": 1.26}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('lighting').agg(\n    count=('num_reported_accidents', 'size'),\n    total_num_reported_accidents=('num_reported_accidents', 'sum'),\n    avg_num_reported_accidents=('num_reported_accidents', lambda x: round(x.mean(), 2))\n).sort_values('total_num_reported_accidents', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "count_segment_total", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "lighting", "target_col": "num_reported_accidents", "target_desc": "reported accidents"}}}
{"id": "road_hard_021", "dataset": "road", "goal": "Which weather categories have the highest average accident risk? Show breakdown with [weather, count, avg_accident_risk] sorted by average descending.", "expected_output_type": "dataframe", "level": "L4", "template": "continuous_segment_breakdown", "golden": {"answer_value": [{"weather": "foggy", "count": 181463, "avg_accident_risk": 0.3863}, {"weather": "rainy", "count": 156985, "avg_accident_risk": 0.3615}, {"weather": "clear", "count": 179306, "avg_accident_risk": 0.3101}], "answer_type": "dataframe", "verification_code": "breakdown = df.groupby('weather').agg(\n    count=('accident_risk', 'size'),\n    avg_accident_risk=('accident_risk', lambda x: round(x.mean(), 4))\n).sort_values('avg_accident_risk', ascending=False)\nresult = breakdown.reset_index()", "tolerance": 0.0}, "tolerance": 0.0, "ambiguities": [], "success_criteria": ["Answer must match expected value", "DataFrame shape must match", "Column names must match", "Values must match with numeric tolerance 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_segment_breakdown", "generator": "qa-gen-v2", "tolerance": 0.0, "slots": {"group_col": "weather", "target_col": "accident_risk", "target_desc": "accident risk"}}}
{"id": "road_hard_015", "dataset": "road", "goal": "Divide records into 4 speed_limit quartiles. What is the average accident risk in the lower-middle (25-50%) (Q2) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.29, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['speed_limit'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q2']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "speed_limit", "quartile": "Q2", "quartile_desc": "lower-middle (25-50%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
{"id": "road_hard_002", "dataset": "road", "goal": "Divide records into 4 curvature quartiles. What is the average accident risk in the upper-middle (50-75%) (Q3) quartile?", "expected_output_type": "scalar", "level": "L4", "template": "continuous_quartile_analysis", "golden": {"answer_value": 0.41, "answer_type": "scalar", "verification_code": "df['_bin'] = pd.qcut(df['curvature'], 4, labels=['Q1','Q2','Q3','Q4'], duplicates='drop')\nbin_data = df[df['_bin'] == 'Q3']\nresult = round(bin_data['accident_risk'].mean(), 4)\ndf.drop('_bin', axis=1, inplace=True)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Single aggregation or binning operation expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 4, "template_name": "continuous_quartile_analysis", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"bin_col": "curvature", "quartile": "Q3", "quartile_desc": "upper-middle (50-75%)", "target_col": "accident_risk", "target_desc": "accident risk"}}}
{"id": "road_hard_007", "dataset": "road", "goal": "How much higher is the average accident risk for lighting='daylight' compared to 'night'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.17, "answer_type": "scalar", "verification_code": "avg_a = df[df['lighting'] == 'daylight']['accident_risk'].mean()\navg_b = df[df['lighting'] == 'night']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "lighting", "val_a": "daylight", "val_b": "night", "target_col": "accident_risk", "target_desc": "accident risk"}}}
{"id": "road_hard_004", "dataset": "road", "goal": "How much higher is the average accident risk for road_type='rural' compared to 'urban'? Return difference.", "expected_output_type": "scalar", "level": "L5", "template": "continuous_comparison", "golden": {"answer_value": -0.01, "answer_type": "scalar", "verification_code": "avg_a = df[df['road_type'] == 'rural']['accident_risk'].mean()\navg_b = df[df['road_type'] == 'urban']['accident_risk'].mean()\nresult = round(avg_a - avg_b, 4)", "tolerance": 0.01}, "tolerance": 0.01, "ambiguities": [], "success_criteria": ["Answer must match expected value", "Numeric tolerance: 0.01"], "constraints": ["Use pandas for data manipulation", "Store final answer in 'result' variable", "Multi-step analysis with 2 operations expected"], "relationships": [], "clarification_budget": 2, "metadata": {"difficulty_level": 5, "template_name": "continuous_comparison", "generator": "qa-gen-v2", "tolerance": 0.01, "slots": {"group_col": "road_type", "val_a": "rural", "val_b": "urban", "target_col": "accident_risk", "target_desc": "accident risk"}}}