planning-agent-pro / scripts /update_maze_data.py
catninja123's picture
Deploy Planning Agent Pro v1.0
2b4baca verified
"""
MAZE 数据批量更新脚本(Phase 4 接口预留)
用于将新的 MAZE 录取结果数据批量导入到系统中。
使用方式:
python scripts/update_maze_data.py --year 2026 --input data/maze_2026.csv
输入格式(CSV):
student_id, school, round, outcome, sat, gpa, ib_score, test_optional, hs_cat, major_cat, year
输出:
更新 track2/maze_data/ 目录下的数据文件
输出更新摘要报告
TODO(等待 2026 数据):
1. 确认 CSV 格式与现有 MAZE 数据结构一致
2. 运行数据质量检查(缺失值、异常值)
3. 更新 name_to_id_mappings.json(如有新学校)
4. 重新计算 V38.2 特征统计
"""
import argparse
import csv
import json
import sys
from pathlib import Path
from typing import Dict, List
# Data directories, all derived from the directory two levels above this script.
_ROOT = Path(__file__).parent.parent
# Directory holding admissions.json, the file this script reads and rewrites.
_MAZE_DATA_DIR = _ROOT / "track2" / "maze_data"
# NOTE(review): not referenced by any function visible in this file — confirm it is still needed.
_MODELS_DIR = _ROOT / "track1" / "models"
def validate_row(row: Dict, row_num: int) -> List[str]:
    """Check a single input record and collect every problem found.

    Args:
        row: Mapping of column name to raw cell value for one record.
        row_num: Row number used as the prefix of each error message.

    Returns:
        The list of error messages for this record; empty when all checks pass.
    """
    valid_outcomes = ["录取", "拒绝", "等待列表", "admitted", "rejected", "waitlisted"]
    valid_rounds = ["ED", "EA", "RD", "ED2"]

    # Presence checks run first so missing-field messages lead the list.
    problems = [
        f"第{row_num}行:缺少必填字段 {name}"
        for name in ("school", "round", "outcome")
        if not row.get(name)
    ]

    # The outcome label is matched case-insensitively.
    outcome = row.get("outcome")
    if outcome:
        accepted = {label.lower() for label in valid_outcomes}
        if outcome.lower() not in accepted:
            problems.append(
                f"第{row_num}行:outcome 值无效 '{outcome}',应为 {valid_outcomes}"
            )

    # The application round is matched after upper-casing.
    round_value = row.get("round")
    if round_value:
        if round_value.upper() not in valid_rounds:
            problems.append(
                f"第{row_num}行:round 值无效 '{round_value}',应为 {valid_rounds}"
            )

    return problems
def load_existing_data() -> List[Dict]:
    """Read the MAZE records already stored on disk.

    Returns:
        The parsed content of ``admissions.json`` in the MAZE data directory,
        or an empty list when that file does not exist.
    """
    admissions_path = _MAZE_DATA_DIR / "admissions.json"
    if not admissions_path.exists():
        return []
    with admissions_path.open("r", encoding="utf-8") as handle:
        return json.load(handle)
def _dedup_key(record: Dict) -> tuple:
    """Return the (student_id, school, year) identity used for de-duplication.

    A tuple is used instead of an underscore-joined string so that values which
    themselves contain underscores cannot collide. Each part is stringified so an
    integer year and the same year stored as text still compare equal.
    """
    return tuple(str(record.get(name, "")) for name in ("student_id", "school", "year"))


def update_maze_data(input_file: str, year: int, dry_run: bool = True) -> Dict:
    """Validate a CSV of admission results and merge it into admissions.json.

    Rows that fail ``validate_row`` are reported and dropped; the remaining rows
    are stamped with ``year`` (overriding any ``year`` column in the file). In a
    real run the valid rows are still written even when other rows had errors.

    Args:
        input_file: Path of the input CSV file.
        year: Application year assigned to every imported record (e.g. 2026).
        dry_run: When True, only validate and report; nothing is written.

    Returns:
        For a dry run: ``status`` ("dry_run"), ``new_records``, ``errors`` and
        ``error_details`` (at most the first 10 messages).
        Otherwise: ``status`` ("success"), ``added``, ``skipped``, ``total`` and
        ``errors``.

    Raises:
        OSError: If the input file cannot be opened or the output cannot be written.
    """
    print(f"[update_maze_data] 开始处理 {input_file},年份 {year},dry_run={dry_run}")

    # Read and validate the input file.
    new_records = []
    errors = []
    # utf-8-sig drops the BOM that spreadsheet exports prepend (it would otherwise
    # corrupt the first header name) and reads BOM-less UTF-8 unchanged.
    # newline="" is what the csv module requires for correct quoted-newline handling.
    with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
        # skipinitialspace accepts the documented "a, b, c" layout with a space
        # after each comma; files without those spaces parse exactly as before.
        reader = csv.DictReader(f, skipinitialspace=True)
        # Row numbers count data rows from 1; the header line is not counted.
        for i, row in enumerate(reader, 1):
            row_errors = validate_row(row, i)
            if row_errors:
                errors.extend(row_errors)
            else:
                row["year"] = year
                new_records.append(row)

    print(f" 读取 {len(new_records)} 条有效记录,{len(errors)} 个错误")
    if errors:
        print(" 错误列表:")
        for e in errors[:10]:
            print(f" {e}")

    if dry_run:
        print(" [DRY RUN] 不写入数据")
        return {
            "status": "dry_run",
            "new_records": len(new_records),
            "errors": len(errors),
            "error_details": errors[:10],
        }

    # Load what is already stored.
    existing = load_existing_data()
    print(f" 现有数据:{len(existing)} 条")

    # Merge, skipping records whose student/school/year identity is already present.
    existing_keys = {_dedup_key(r) for r in existing}
    added = 0
    skipped = 0
    for r in new_records:
        key = _dedup_key(r)
        if key in existing_keys:
            skipped += 1
        else:
            existing.append(r)
            existing_keys.add(key)
            added += 1

    # Write to a sibling temp file and swap it in, so an interrupted write
    # cannot leave a truncated admissions.json behind.
    _MAZE_DATA_DIR.mkdir(parents=True, exist_ok=True)
    output_file = _MAZE_DATA_DIR / "admissions.json"
    tmp_file = output_file.with_name(output_file.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(existing, f, ensure_ascii=False, indent=2)
    tmp_file.replace(output_file)

    print(f" 新增 {added} 条,跳过重复 {skipped} 条,总计 {len(existing)} 条")
    print(f" 已写入 {output_file}")
    return {
        "status": "success",
        "added": added,
        "skipped": skipped,
        "total": len(existing),
        "errors": len(errors),
    }
def query_feeder_school(feeder_school: str, target_school: str, year_range: tuple = (2022, 2026)) -> Dict:
    """Phase 4 interface: look up stored records from one high school to one university.

    Both names are matched as case-insensitive substrings. Records that carry no
    year are not filtered by ``year_range``.

    NOTE(review): the high school is read from the ``hs_name`` key, while the CSV
    columns documented for the importer list ``hs_cat`` — confirm that stored
    records actually carry ``hs_name``.

    Args:
        feeder_school: High-school name (substring match).
        target_school: Target university name (substring match).
        year_range: Inclusive ``(start, end)`` year range.

    Returns:
        {
            "feeder_school": str,
            "target_school": str,
            "year_range": tuple,
            "total_admitted": int,   # matching cases whose outcome is an admit
            "total_cases": int,      # all matching cases
            "cases": List[Dict],     # matching cases without student identifiers
        }
    """
    existing = load_existing_data()

    feeder_needle = feeder_school.lower()
    target_needle = target_school.lower()
    start_year, end_year = year_range[0], year_range[1]

    results = []
    for r in existing:
        # Year filter (skipped for records that have no year).
        year = r.get("year")
        if year and not (start_year <= int(year) <= end_year):
            continue

        # High-school match; `or ""` also covers a key stored with a null value,
        # which would otherwise raise on .lower().
        hs = str(r.get("hs_name") or "")
        if feeder_needle not in hs.lower():
            continue

        # University match, with the same null-safe handling.
        school = str(r.get("school") or "")
        if target_needle not in school.lower():
            continue

        results.append({
            "year": r.get("year"),
            "outcome": r.get("outcome"),
            "round": r.get("round"),
            "sat": r.get("sat"),
            "gpa": r.get("gpa"),
            "ib_score": r.get("ib_score"),
            "test_optional": r.get("test_optional"),
            "major_cat": r.get("major_cat"),
        })

    # Outcomes are accepted case-insensitively at import time, so count them the
    # same way here; otherwise a stored "Admitted" would not be counted.
    admitted_labels = {"录取", "admitted"}
    admitted = [
        case for case in results
        if str(case.get("outcome") or "").lower() in admitted_labels
    ]

    return {
        "feeder_school": feeder_school,
        "target_school": target_school,
        "year_range": year_range,
        "total_admitted": len(admitted),
        "total_cases": len(results),
        "cases": results,
    }
if __name__ == "__main__":
    # Command-line entry point: parse the options, run the import, and print
    # the returned summary as JSON. Data is written unless --dry-run is given.
    cli = argparse.ArgumentParser(description="MAZE 数据批量更新工具")
    cli.add_argument("--input", help="输入 CSV 文件路径", required=True)
    cli.add_argument("--year", help="申请年份(如 2026)", required=True, type=int)
    cli.add_argument("--dry-run", help="只验证不写入", action="store_true")
    options = cli.parse_args()

    summary = update_maze_data(options.input, options.year, dry_run=options.dry_run)
    print(json.dumps(summary, indent=2, ensure_ascii=False))