"""
MAZE data bulk-update script (Phase 4 interface placeholder).

Imports new MAZE admission-outcome data into the system in bulk.

Usage:
    python scripts/update_maze_data.py --year 2026 --input data/maze_2026.csv

Input format (CSV):
    student_id, school, round, outcome, sat, gpa, ib_score, test_optional,
    hs_cat, major_cat, year

Output:
    Updates the data files under track2/maze_data/
    Prints an update summary report

TODO (waiting for 2026 data):
    1. Confirm the CSV format matches the existing MAZE data structure
    2. Run data-quality checks (missing values, outliers)
    3. Update name_to_id_mappings.json (if there are new schools)
    4. Recompute the V38.2 feature statistics
"""

import argparse
import csv
import json
import sys
from pathlib import Path
from typing import Dict, List, Optional, Tuple

# Data directories
_ROOT = Path(__file__).parent.parent
_MAZE_DATA_DIR = _ROOT / "track2" / "maze_data"
_MODELS_DIR = _ROOT / "track1" / "models"

# Validation tables. The lists are kept as lists because their repr is
# embedded verbatim in the validation error messages.
_REQUIRED_FIELDS = ("school", "round", "outcome")
_VALID_OUTCOMES = ["录取", "拒绝", "等待列表", "admitted", "rejected", "waitlisted"]
_VALID_OUTCOMES_LOWER = frozenset(o.lower() for o in _VALID_OUTCOMES)
_VALID_ROUNDS = ["ED", "EA", "RD", "ED2"]

# Outcome spellings (lower-cased) that count as an admission.
_ADMITTED_OUTCOMES = frozenset({"录取", "admitted"})


def validate_row(row: Dict, row_num: int) -> List[str]:
    """Validate a single CSV row and return its error messages.

    Args:
        row: One record as produced by ``csv.DictReader``.
        row_num: Row number used as a prefix in the error messages.

    Returns:
        A list of error strings; empty when the row is valid. Missing
        required fields are reported first, then an invalid ``outcome``,
        then an invalid ``round``. Both value checks are case-insensitive.
    """
    errors = [
        f"第{row_num}行:缺少必填字段 {field}"
        for field in _REQUIRED_FIELDS
        if not row.get(field)
    ]

    outcome = row.get("outcome")
    if outcome and outcome.lower() not in _VALID_OUTCOMES_LOWER:
        errors.append(
            f"第{row_num}行:outcome 值无效 '{outcome}',应为 {_VALID_OUTCOMES}"
        )

    round_value = row.get("round")
    if round_value and round_value.upper() not in _VALID_ROUNDS:
        errors.append(
            f"第{row_num}行:round 值无效 '{round_value}',应为 {_VALID_ROUNDS}"
        )

    return errors


def load_existing_data() -> List[Dict]:
    """Load the existing MAZE records from ``admissions.json``.

    Returns:
        The parsed JSON content, or an empty list when the file does not
        exist yet.
    """
    data_file = _MAZE_DATA_DIR / "admissions.json"
    try:
        with open(data_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return []


def _dedup_key(record: Dict) -> Tuple[str, str, str]:
    """Return the (student_id, school, year) identity of a record.

    Values are stringified so that a year stored as ``2026`` and one stored
    as ``"2026"`` compare equal. A tuple is used instead of a joined string
    so that separators inside the values cannot cause false collisions.
    """
    return (
        str(record.get("student_id", "")),
        str(record.get("school", "")),
        str(record.get("year", "")),
    )


def _read_csv_records(input_file: str, year: int) -> Tuple[List[Dict], List[str]]:
    """Read and validate the input CSV; return (valid records, errors)."""
    records: List[Dict] = []
    errors: List[str] = []
    # utf-8-sig transparently drops a leading BOM (common in Excel exports),
    # which would otherwise corrupt the first header name. newline="" is what
    # the csv module requires to handle quoted embedded newlines correctly.
    with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
        for row_num, row in enumerate(csv.DictReader(f), 1):
            row_errors = validate_row(row, row_num)
            if row_errors:
                errors.extend(row_errors)
            else:
                # The command-line year overrides any year column in the CSV.
                row["year"] = year
                records.append(row)
    return records, errors


def update_maze_data(input_file: str, year: int, dry_run: bool = True) -> Dict:
    """Validate a CSV of admission results and merge it into the data file.

    Args:
        input_file: Path to the input CSV file.
        year: Application year (e.g. 2026); stamped onto every new record.
        dry_run: When True, only validate and report; nothing is written.

    Returns:
        An update summary. For a dry run: ``status``, ``new_records``,
        ``errors`` and ``error_details`` (first 10 errors). Otherwise:
        ``status``, ``added``, ``skipped``, ``total`` and ``errors``.

    Raises:
        OSError: If the input file cannot be read or the output cannot be
            written.
    """
    print(f"[update_maze_data] 开始处理 {input_file},年份 {year},dry_run={dry_run}")

    new_records, errors = _read_csv_records(input_file, year)

    print(f" 读取 {len(new_records)} 条有效记录,{len(errors)} 个错误")
    if errors:
        print(" 错误列表:")
        for e in errors[:10]:
            print(f" {e}")

    if dry_run:
        print(" [DRY RUN] 不写入数据")
        return {
            "status": "dry_run",
            "new_records": len(new_records),
            "errors": len(errors),
            "error_details": errors[:10],
        }

    existing = load_existing_data()
    print(f" 现有数据:{len(existing)} 条")

    # Merge, de-duplicating on (student, school, year).
    existing_keys = {_dedup_key(r) for r in existing}
    added = 0
    skipped = 0
    for record in new_records:
        key = _dedup_key(record)
        if key in existing_keys:
            skipped += 1
            continue
        existing.append(record)
        existing_keys.add(key)
        added += 1

    # Serialise before touching the disk, then write to a temp file and
    # rename it over the target, so a failure cannot leave a truncated
    # admissions.json behind.
    _MAZE_DATA_DIR.mkdir(parents=True, exist_ok=True)
    output_file = _MAZE_DATA_DIR / "admissions.json"
    payload = json.dumps(existing, ensure_ascii=False, indent=2)
    tmp_file = output_file.with_name(output_file.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        f.write(payload)
    tmp_file.replace(output_file)

    print(f" 新增 {added} 条,跳过重复 {skipped} 条,总计 {len(existing)} 条")
    print(f" 已写入 {output_file}")

    return {
        "status": "success",
        "added": added,
        "skipped": skipped,
        "total": len(existing),
        "errors": len(errors),
    }


def query_feeder_school(
    feeder_school: str,
    target_school: str,
    year_range: tuple = (2022, 2026),
    records: Optional[List[Dict]] = None,
) -> Dict:
    """Phase 4 interface: look up admission records from a high school to a university.

    Both names are matched as case-insensitive substrings.

    Args:
        feeder_school: High-school name, matched against each record's
            ``hs_name`` field.
        target_school: Target university name, matched against ``school``.
        year_range: Inclusive ``(start, end)`` year range.
        records: Records to search. When omitted, they are loaded with
            ``load_existing_data()``.

    Returns:
        {
            "feeder_school": str,
            "target_school": str,
            "year_range": tuple,
            "total_admitted": int,
            "total_cases": int,
            "cases": List[Dict],  # matched records, reduced to non-identifying fields
        }
    """
    if records is None:
        records = load_existing_data()

    feeder_needle = feeder_school.lower()
    target_needle = target_school.lower()
    first_year, last_year = year_range[0], year_range[1]

    cases = []
    for r in records:
        # Year filter: records without a year are kept; records whose year
        # cannot be parsed are skipped because they cannot be placed in range.
        year = r.get("year")
        if year:
            try:
                year_num = int(year)
            except (TypeError, ValueError):
                continue
            if not (first_year <= year_num <= last_year):
                continue

        # "or ''" guards against explicit nulls in the stored JSON.
        if feeder_needle not in str(r.get("hs_name") or "").lower():
            continue
        if target_needle not in str(r.get("school") or "").lower():
            continue

        cases.append({
            "year": r.get("year"),
            "outcome": r.get("outcome"),
            "round": r.get("round"),
            "sat": r.get("sat"),
            "gpa": r.get("gpa"),
            "ib_score": r.get("ib_score"),
            "test_optional": r.get("test_optional"),
            "major_cat": r.get("major_cat"),
        })

    # validate_row accepts outcomes in any letter case, so the comparison
    # here must be case-insensitive too or "Admitted" would go uncounted.
    total_admitted = sum(
        1
        for c in cases
        if str(c.get("outcome") or "").lower() in _ADMITTED_OUTCOMES
    )

    return {
        "feeder_school": feeder_school,
        "target_school": target_school,
        "year_range": year_range,
        "total_admitted": total_admitted,
        "total_cases": len(cases),
        "cases": cases,
    }


def main() -> None:
    """Parse the command-line arguments, run the update, and print the summary."""
    parser = argparse.ArgumentParser(description="MAZE 数据批量更新工具")
    parser.add_argument("--input", required=True, help="输入 CSV 文件路径")
    parser.add_argument("--year", type=int, required=True, help="申请年份(如 2026)")
    parser.add_argument("--dry-run", action="store_true", help="只验证不写入")
    args = parser.parse_args()

    result = update_maze_data(args.input, args.year, dry_run=args.dry_run)
    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()