# Spaces:
# Sleeping
# Sleeping
"""
MAZE data batch-update script (Phase 4 interface placeholder).

Bulk-imports new MAZE admission-result data into the system.

Usage:
    python scripts/update_maze_data.py --year 2026 --input data/maze_2026.csv

Input format (CSV):
    student_id, school, round, outcome, sat, gpa, ib_score, test_optional, hs_cat, major_cat, year

Output:
    Updates the data files under track2/maze_data/
    Prints an update summary report

TODO (waiting on the 2026 data):
    1. Confirm the CSV format matches the existing MAZE data structure
    2. Run data-quality checks (missing values, outliers)
    3. Update name_to_id_mappings.json (if there are new schools)
    4. Recompute the V38.2 feature statistics
"""
import argparse
import csv
import json
import sys
from pathlib import Path
from typing import Dict, List
# Data directories, all resolved relative to the directory two levels above
# this file (the parent of the directory that contains this script).
_ROOT = Path(__file__).parent.parent
# Directory holding the MAZE admissions dataset (admissions.json lives here).
_MAZE_DATA_DIR = _ROOT / "track2" / "maze_data"
# Directory path for track1 models; not referenced by the functions in this file.
_MODELS_DIR = _ROOT / "track1" / "models"
def validate_row(row: Dict, row_num: int) -> List[str]:
    """Validate a single input record and return its error messages.

    Checks performed, in this order:
      1. ``school``, ``round`` and ``outcome`` must be present and non-blank.
      2. ``outcome`` must be one of the accepted values (case-insensitive).
      3. ``round`` must be one of the accepted rounds (case-insensitive).

    Args:
        row: Mapping of column name to raw value for one record.
        row_num: Record number used as the prefix of every error message.

    Returns:
        A list of human-readable error strings; empty when the row is valid.
    """
    errors = []

    def text_of(field: str) -> str:
        # Falsy values (None, "") count as absent; anything else is coerced
        # to text so a non-string value cannot crash .lower() / .upper().
        return str(row.get(field) or "")

    required_fields = ["school", "round", "outcome"]
    for field in required_fields:
        # A whitespace-only value carries no information, so treat it as missing.
        if not text_of(field).strip():
            errors.append(f"第{row_num}行:缺少必填字段 {field}")

    # Validate outcome (only when a value was actually supplied).
    valid_outcomes = ["录取", "拒绝", "等待列表", "admitted", "rejected", "waitlisted"]
    outcome = text_of("outcome")
    if outcome.strip() and outcome.lower() not in {o.lower() for o in valid_outcomes}:
        errors.append(f"第{row_num}行:outcome 值无效 '{outcome}',应为 {valid_outcomes}")

    # Validate round (only when a value was actually supplied).
    valid_rounds = ["ED", "EA", "RD", "ED2"]
    round_value = text_of("round")
    if round_value.strip() and round_value.upper() not in valid_rounds:
        errors.append(f"第{row_num}行:round 值无效 '{round_value}',应为 {valid_rounds}")

    return errors
def load_existing_data() -> List[Dict]:
    """Load the existing MAZE admissions records.

    Reads ``admissions.json`` from ``_MAZE_DATA_DIR`` and returns its parsed
    JSON content.

    Returns:
        The parsed content of the file, or an empty list when the file does
        not exist yet.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    data_file = _MAZE_DATA_DIR / "admissions.json"
    # Open directly and handle absence, rather than exists()-then-open, so a
    # file removed between the check and the open cannot raise unexpectedly.
    try:
        with open(data_file, "r", encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return []
def _record_key(record: Dict) -> tuple:
    """Return the (student_id, school, year) identity used for de-duplication."""
    # Each part is stringified so a year stored as 2026 and one stored as
    # "2026" compare equal; a tuple (instead of an underscore-joined string)
    # keeps values that themselves contain "_" from colliding.
    return (
        str(record.get("student_id", "")),
        str(record.get("school", "")),
        str(record.get("year", "")),
    )


def update_maze_data(input_file: str, year: int, dry_run: bool = True) -> Dict:
    """Validate a CSV of admission results and merge it into admissions.json.

    Rows that fail ``validate_row`` are reported and excluded; all remaining
    rows get their ``year`` field set to ``year`` (overriding any value in the
    CSV). Unless ``dry_run`` is set, valid rows are merged into the existing
    dataset, skipping any row whose (student_id, school, year) already exists.

    Args:
        input_file: Path of the input CSV file.
        year: Application year (e.g. 2026) stamped onto every imported row.
        dry_run: When True, only validate and report; nothing is written.

    Returns:
        A summary dict. For a dry run: ``status``, ``new_records``, ``errors``
        and ``error_details`` (first 10 messages). Otherwise: ``status``,
        ``added``, ``skipped``, ``total`` and ``errors``.

    Raises:
        OSError: If the input file cannot be read or the output cannot be written.
    """
    print(f"[update_maze_data] 开始处理 {input_file},年份 {year},dry_run={dry_run}")

    # Read and validate the input file.
    new_records = []
    errors = []
    # "utf-8-sig" drops a leading BOM (common in spreadsheet exports) that
    # would otherwise be glued onto the first header name; newline="" is the
    # mode the csv module expects so embedded newlines are parsed correctly.
    with open(input_file, "r", encoding="utf-8-sig", newline="") as f:
        # The documented header separates columns with ", "; skipinitialspace
        # keeps that leading space out of the field names and values.
        reader = csv.DictReader(f, skipinitialspace=True)
        for i, row in enumerate(reader, 1):
            row_errors = validate_row(row, i)
            if row_errors:
                errors.extend(row_errors)
            else:
                row["year"] = year
                new_records.append(row)

    print(f" 读取 {len(new_records)} 条有效记录,{len(errors)} 个错误")
    if errors:
        print(" 错误列表:")
        # Only the first 10 messages are echoed to keep the output readable.
        for e in errors[:10]:
            print(f" {e}")

    if dry_run:
        print(" [DRY RUN] 不写入数据")
        return {
            "status": "dry_run",
            "new_records": len(new_records),
            "errors": len(errors),
            "error_details": errors[:10],
        }

    # Load the existing dataset.
    existing = load_existing_data()
    print(f" 现有数据:{len(existing)} 条")

    # Merge, de-duplicating on (student, school, year).
    existing_keys = {_record_key(r) for r in existing}
    added = 0
    skipped = 0
    for r in new_records:
        key = _record_key(r)
        if key in existing_keys:
            skipped += 1
        else:
            existing.append(r)
            # Register the key so duplicates inside the same CSV are skipped too.
            existing_keys.add(key)
            added += 1

    # Write to a sibling temp file first and then swap it in, so a failure
    # part-way through serialization cannot truncate the existing dataset.
    _MAZE_DATA_DIR.mkdir(parents=True, exist_ok=True)
    output_file = _MAZE_DATA_DIR / "admissions.json"
    tmp_file = output_file.with_name(output_file.name + ".tmp")
    with open(tmp_file, "w", encoding="utf-8") as f:
        json.dump(existing, f, ensure_ascii=False, indent=2)
    tmp_file.replace(output_file)

    print(f" 新增 {added} 条,跳过重复 {skipped} 条,总计 {len(existing)} 条")
    print(f" 已写入 {output_file}")
    return {
        "status": "success",
        "added": added,
        "skipped": skipped,
        "total": len(existing),
        "errors": len(errors),
    }
def query_feeder_school(feeder_school: str, target_school: str, year_range: tuple = (2022, 2026)) -> Dict:
    """Phase 4 interface: look up admission records from a high school to a university.

    Both names are matched as case-insensitive substrings, so an empty string
    matches every record. Records without a ``year`` value are not filtered
    out by ``year_range``; records whose year cannot be parsed as an integer
    are skipped.

    Args:
        feeder_school: High-school name (substring match against ``hs_name``).
        target_school: Target university name (substring match against ``school``).
        year_range: Inclusive ``(start, end)`` year range.

    Returns:
        {
            "feeder_school": str,
            "target_school": str,
            "year_range": tuple,
            "total_admitted": int,   # cases whose outcome is "录取" / "admitted"
            "total_cases": int,      # all matching cases
            "cases": List[Dict],     # matching cases without identifying fields
        }
    """
    feeder_needle = feeder_school.lower()
    target_needle = target_school.lower()

    results = []
    for r in load_existing_data():
        # Year filter (only applied when the record carries a year).
        year = r.get("year")
        if year:
            try:
                in_range = year_range[0] <= int(year) <= year_range[1]
            except (TypeError, ValueError):
                # A year that is not an integer cannot be placed in the range.
                in_range = False
            if not in_range:
                continue

        # High-school match. `or ""` also covers a key stored as JSON null.
        # NOTE(review): the documented CSV columns list hs_cat but not
        # hs_name; confirm that stored records actually carry hs_name.
        hs = str(r.get("hs_name") or "")
        if feeder_needle not in hs.lower():
            continue

        # University match.
        school = str(r.get("school") or "")
        if target_needle not in school.lower():
            continue

        # Copy only the non-identifying fields (no student_id / school names).
        results.append({
            "year": r.get("year"),
            "outcome": r.get("outcome"),
            "round": r.get("round"),
            "sat": r.get("sat"),
            "gpa": r.get("gpa"),
            "ib_score": r.get("ib_score"),
            "test_optional": r.get("test_optional"),
            "major_cat": r.get("major_cat"),
        })

    # Import validation accepts outcomes case-insensitively (e.g. "Admitted"),
    # so the count must compare case-insensitively as well.
    admitted = [
        case for case in results
        if str(case.get("outcome") or "").lower() in ("录取", "admitted")
    ]
    return {
        "feeder_school": feeder_school,
        "target_school": target_school,
        "year_range": year_range,
        "total_admitted": len(admitted),
        "total_cases": len(results),
        "cases": results,
    }
def main(argv=None) -> None:
    """Command-line entry point: parse arguments, run the update, print the summary.

    Args:
        argv: Argument list to parse; ``None`` means use ``sys.argv[1:]``.
    """
    parser = argparse.ArgumentParser(description="MAZE 数据批量更新工具")
    parser.add_argument("--input", required=True, help="输入 CSV 文件路径")
    parser.add_argument("--year", type=int, required=True, help="申请年份(如 2026)")
    parser.add_argument("--dry-run", action="store_true", help="只验证不写入")
    args = parser.parse_args(argv)

    # Unlike the function default (dry_run=True), the CLI writes data unless
    # --dry-run is passed explicitly.
    result = update_maze_data(args.input, args.year, dry_run=args.dry_run)
    print(json.dumps(result, ensure_ascii=False, indent=2))


if __name__ == "__main__":
    main()