Spaces:
Sleeping
Sleeping
| """ | |
| Agentic 规划引擎 v1.0 | |
| 真正的 LLM 驱动工具调用循环——LLM 自己决定调用哪些工具、调用几次。 | |
| 架构: | |
| - Grok 4.20 作为主推理模型(原生支持 tool_use) | |
| - 工具集:12 个数据检索 + 分析工具,覆盖 MAZE / 活动向量 / 知识库 | |
| - 循环上限:8 步(防止无限循环) | |
| - 输出:SSE 流式,每次工具调用和结果实时推送 | |
| 集成方式: | |
| - 被 magi_system.py 在 planning 模式下调用 | |
| - 也可作为独立 API 端点 /api/agentic_plan 使用 | |
| """ | |
| import json | |
| import math | |
| import os | |
| import re | |
| import sys | |
| from pathlib import Path | |
| from typing import Any, Dict, Generator, List, Optional | |
| import requests | |
| import numpy as np | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| # ─── 常量 ───────────────────────────────────────────────────────────────────── | |
# Chat-completions endpoint of the xAI (Grok) API; every LLM call in this
# module posts to this URL.
GROK_URL = "https://api.x.ai/v1/chat/completions"
# Headers shared by all Grok requests. The bearer token is read from the
# XAI_API_KEY environment variable once, when this module is imported; if the
# variable is unset the token is the empty string.
GROK_HEADERS = {
    "Authorization": f"Bearer {os.environ.get('XAI_API_KEY', '')}",
    "Content-Type": "application/json",
}
# Directory containing the JSON knowledge-base files read by the tools.
# Overridable through the DATA_DIR environment variable; the default is the
# "data" folder next to this file's parent directory.
DATA_DIR = Path(os.environ.get("DATA_DIR", str(Path(__file__).parent.parent / "data")))
# Maximum number of LLM round-trips run_agentic_plan performs before it stops
# looping (guards against an endless tool-call loop).
MAX_STEPS = 8
| # ─── 工具定义(Grok tool_use 格式)──────────────────────────────────────────── | |
# Tool schemas advertised to the model: run_agentic_plan passes this list as
# the "tools" field of each chat-completions request, and _execute_tool maps
# each function name below to its implementation. The descriptions are sent to
# the model verbatim, so they are runtime text and are kept exactly as written.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "find_similar_cases",
            "description": (
                "从 MAZE 939 个真实录取案例中,找到与目标学生最相似的历史案例。"
                "返回相似案例列表,包含录取/拒绝结果、相似度分解、活动摘要。"
                "适用场景:了解类似学生的录取规律、活动组合参考、选校参考。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "sat": {"type": "number", "description": "学生 SAT 分数(1000-1600)"},
                    "major": {"type": "string", "description": "申请专业方向,如 Computer Science、Economics、Biology"},
                    "hs_type": {"type": "string", "description": "高中类型:国际高中/国际部/公立/私立/美高"},
                    "target_school": {"type": "string", "description": "目标学校名称(可选),如 Harvard、MIT、Stanford"},
                    "n": {"type": "integer", "description": "返回案例数量,默认 8", "default": 8},
                    "result_filter": {"type": "string", "description": "筛选结果:Accept(只看录取)、Reject(只看拒绝)、null(全部)"},
                },
                "required": ["sat", "major"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "find_similar_activity_profiles",
            "description": (
                "用 42 维混合向量(活动类型分布 + 语义相似度)检索活动组合相似的历史案例。"
                "比 find_similar_cases 更专注于活动维度的相似性。"
                "适用场景:分析某类活动组合的录取效果、发现活动差距。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "activity_types": {
                        "type": "object",
                        "description": "活动类型分布,键为类型名,值为 0-1 的比例。类型包括:Research/Leadership/Art/Sports/Community/Work/Tutoring/Club/Competition/Other",
                    },
                    "major": {"type": "string", "description": "专业方向"},
                    "sat_band": {"type": "string", "description": "SAT 分段,如 1400-1449、1500-1549"},
                    "n": {"type": "integer", "description": "返回案例数量,默认 8", "default": 8},
                },
                "required": ["activity_types"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_school_stats",
            "description": (
                "获取某所学校的录取统计数据:SAT 均值/标准差、录取率、大陆高中申请人数/录取人数。"
                "数据来源:MAZE 939 案例 + meiben 大陆高中录取数据库。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "school_name": {"type": "string", "description": "学校名称,如 Harvard、MIT、Columbia"},
                    "hs_category": {"type": "string", "description": "高中类别筛选(可选):国际高中/国际部/公立/私立"},
                },
                "required": ["school_name"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_school_taste",
            "description": (
                "获取某所学校的招生偏好(口味):偏好的活动类型、文书风格、学生特质、常见拒绝原因。"
                "数据来源:school_taste_updates.json + consultant_knowledge_kb.json。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "school_name": {"type": "string", "description": "学校名称"},
                },
                "required": ["school_name"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "analyze_activity_gap",
            "description": (
                "对比学生的活动组合与相似录取案例,识别活动差距和改进方向。"
                "输入:学生活动描述 + find_similar_cases 的返回结果。"
                "输出:差距分析、具体建议、优先级排序。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "student_activities": {"type": "string", "description": "学生当前活动描述"},
                    "similar_cases_summary": {"type": "string", "description": "相似录取案例的活动摘要(来自 find_similar_cases 结果)"},
                    "major": {"type": "string", "description": "专业方向"},
                    "target_school": {"type": "string", "description": "目标学校(可选)"},
                },
                "required": ["student_activities", "similar_cases_summary"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_ps_distillation",
            "description": (
                "从 ps_distillation_kb.json(MAZE 158 个 Top15/20 录取案例蒸馏)中检索文书写作规律。"
                "返回:高频主题、核心原则、红旗警告、低 SAT 成功模式(如适用)。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "major": {"type": "string", "description": "专业方向"},
                    "sat": {"type": "number", "description": "学生 SAT 分数(用于判断是否需要低 SAT 成功模式)"},
                },
                "required": ["major"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "check_pause_factors",
            "description": (
                "检查学生档案是否触发 Harvard 招生委员会的 pause factors(暂停因素)。"
                "这些因素可能导致招生官在审阅时产生顾虑,即使其他指标很强。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "student_profile": {
                        "type": "object",
                        "description": "学生档案,包含 sat/gpa/activities/essays 等字段",
                    },
                    "target_schools": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "目标学校列表(可选)",
                    },
                },
                "required": ["student_profile"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "search_blog_knowledge",
            "description": (
                "从 blog_cards_all.json(4376 张招生官博客知识卡片)中检索相关内容。"
                "适用场景:查找特定主题的招生官观点、文书建议、活动评价标准。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "检索查询,如「文书开头」「科研活动价值」「中国学生常见问题」"},
                    "top_k": {"type": "integer", "description": "返回卡片数量,默认 5", "default": 5},
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "calibrate_school_list",
            "description": (
                "根据学生档案校准选校清单:评估每所学校的录取概率、建议轮次(ED/EA/RD)、风险等级。"
                "返回结构化的选校建议表格。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "student": {
                        "type": "object",
                        "description": "学生档案(sat/gpa/major/hs_type/activities)",
                    },
                    "school_list": {
                        "type": "array",
                        "items": {"type": "string"},
                        "description": "待评估的学校列表",
                    },
                },
                "required": ["student", "school_list"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_special_case_patterns",
            "description": (
                "检索特殊案例模式:低分高录、逆袭案例、特定背景学生的成功规律。"
                "数据来源:special_case_patterns.json。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "pattern_type": {
                        "type": "string",
                        "description": "模式类型:low_sat_admit(低SAT录取)/ underrepresented(弱势群体)/ late_bloomer(后期爆发)/ niche_talent(特殊才能)",
                    },
                    "major": {"type": "string", "description": "专业方向(可选)"},
                },
                "required": ["pattern_type"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_feeder_school_info",
            "description": (
                "查询某所大学的大陆高中 feeder school 分布(哪些高中送了多少人),"
                "或查询某所高中的录取档案(该高中历史上送往哪些大学、T10/T15 率)。"
                "数据来源:meiben 27000+ 条大陆录取记录。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "school_name": {
                        "type": "string",
                        "description": "大学英文名(如 'Stanford University')",
                    },
                    "hs_name": {
                        "type": "string",
                        "description": "高中中文名(可选),若提供则返回该高中的录取档案",
                    },
                },
                "required": ["school_name"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "get_summer_program_recommendation",
            "description": (
                "根据学生的专业方向和目标档次,推荐适合的夏校和竞赛项目。"
                "数据来源:meiben 录取数据中提取的夏校/竞赛给力指数(1-10分)。"
            ),
            "parameters": {
                "type": "object",
                "properties": {
                    "major": {"type": "string", "description": "专业方向"},
                    "sat": {"type": "number", "description": "SAT 分数(可选)"},
                    "target_tier": {
                        "type": "string",
                        "description": "目标档次:T10 / T15 / T20",
                    },
                },
                "required": ["major"],
            },
        },
    },
]
| # ─── 工具执行层 ──────────────────────────────────────────────────────────────── | |
def _execute_tool(tool_name: str, tool_input: Dict) -> str:
    """Run one tool requested by the model and return its JSON string result.

    Every supported tool ``<name>`` is implemented by a module-level function
    called ``_tool_<name>``, which is invoked with ``tool_input`` expanded as
    keyword arguments.

    Args:
        tool_name: Name of the tool as advertised in ``TOOLS``.
        tool_input: Keyword arguments for the tool.

    Returns:
        The tool's JSON string. An unrecognised name yields
        ``{"error": "Unknown tool: ..."}``; any exception raised while
        dispatching or running the tool yields ``{"error": ..., "tool": ...}``
        instead of propagating.
    """
    # A tuple (not a set) so membership uses ==, exactly like a chain of
    # equality tests would, even for an unhashable tool_name.
    supported = (
        "find_similar_cases",
        "find_similar_activity_profiles",
        "get_school_stats",
        "get_school_taste",
        "analyze_activity_gap",
        "get_ps_distillation",
        "check_pause_factors",
        "search_blog_knowledge",
        "calibrate_school_list",
        "get_special_case_patterns",
        "get_feeder_school_info",
        "get_summer_program_recommendation",
    )
    if tool_name not in supported:
        return json.dumps({"error": f"Unknown tool: {tool_name}"})
    try:
        handler = globals()[f"_tool_{tool_name}"]
        return handler(**tool_input)
    except Exception as exc:
        return json.dumps({"error": str(exc), "tool": tool_name})
| # ─── 工具实现 ────────────────────────────────────────────────────────────────── | |
def _tool_find_similar_cases(sat: float, major: str, hs_type: str = "国际高中",
                             target_school: str = None, n: int = 8,
                             result_filter: str = None) -> str:
    """Retrieve similar historical cases and return a compact JSON summary.

    Delegates the search to ``track2.maze_retriever.find_similar_cases`` and
    trims each hit to a handful of fields so the payload fed back to the model
    stays small.

    Args:
        sat: Student SAT score.
        major: Intended major.
        hs_type: High-school type label.
        target_school: Optional school of interest, forwarded to the retriever.
        n: Number of cases requested from the retriever.
        result_filter: Optional outcome filter, forwarded to the retriever.

    Returns:
        JSON string with ``total_found``, ``cases`` and ``insight``.
    """
    from track2.maze_retriever import find_similar_cases

    def _compact(hit):
        record = hit["case"]
        return {
            "similarity": round(hit["similarity"], 3),
            "result_at_school": hit.get("result_at_school"),
            "school": record.get("school", ""),
            "year": record.get("year", ""),
            "sat": record.get("sat", ""),
            "major_area": record.get("major_area", ""),
            "hs_type": record.get("hs_type", ""),
            # Activity text is capped at 300 characters to limit token use.
            "activity_summary": str(record.get("activity_summary", ""))[:300],
            "all_results_summary": _summarize_results(hit.get("all_results", {})),
            "similarity_breakdown": hit.get("similarity_breakdown", {}),
        }

    profile = {"sat": sat, "major": major, "hs_type": hs_type}
    hits = find_similar_cases(
        profile,
        target_school=target_school,
        n=n,
        result_filter=result_filter,
    )
    compact_cases = [_compact(hit) for hit in hits]
    payload = {
        "total_found": len(compact_cases),
        "cases": compact_cases,
        "insight": _infer_pattern(compact_cases, target_school),
    }
    return json.dumps(payload, ensure_ascii=False)
def _summarize_results(results: Dict) -> str:
    """Condense a ``{school: outcome}`` mapping into a single line.

    At most four admitted and four rejected schools are listed; outcomes
    outside the recognised admit/reject labels are ignored.

    Returns:
        The one-line summary, or "无结果" when nothing qualifies.
    """
    admit_labels = ("Accept", "Enrolled", "WL→Accept")
    reject_labels = ("Reject", "Deny")

    admitted, rejected = [], []
    for school, outcome in results.items():
        if outcome in admit_labels:
            admitted.append(school)
        elif outcome in reject_labels:
            rejected.append(school)

    segments = []
    if admitted:
        segments.append(f"录取:{', '.join(admitted[:4])}")
    if rejected:
        segments.append(f"拒绝:{', '.join(rejected[:4])}")

    if not segments:
        return "无结果"
    return ";".join(segments)
def _infer_pattern(cases: List[Dict], target_school: Optional[str]) -> str:
    """Derive a one-sentence takeaway from a list of simplified cases.

    Args:
        cases: Case dicts; ``result_at_school`` and ``sat`` are read.
        target_school: School the outcomes refer to, if any.

    Returns:
        With a target school and at least one decided case: the admit rate
        among decided cases. Otherwise: the case count plus the SAT range of
        the cases that actually carry a usable SAT score.
    """
    if not cases:
        return "未找到足够相似的案例"

    admits = [c for c in cases if c.get("result_at_school") in ("Accept", "Enrolled")]
    rejects = [c for c in cases if c.get("result_at_school") in ("Reject", "Deny")]
    total_with_result = len(admits) + len(rejects)
    if target_school and total_with_result > 0:
        rate = len(admits) / total_with_result
        return f"在 {len(cases)} 个相似案例中,{total_with_result} 个申请了 {target_school},录取率约 {rate:.0%}({len(admits)} 录取 / {len(rejects)} 拒绝)"

    # Only real scores take part in the range. Missing/blank SATs used to be
    # coerced to 0 (yielding ranges such as "0-1500"), and a mix of numeric
    # strings and numbers raised TypeError inside min()/max().
    sat_scores = []
    for case in cases:
        raw = case.get("sat")
        if isinstance(raw, bool):
            continue
        if isinstance(raw, str):
            try:
                raw = float(raw)
            except ValueError:
                continue
        if isinstance(raw, (int, float)) and raw > 0:
            sat_scores.append(raw)

    if not sat_scores:
        return f"找到 {len(cases)} 个相似案例"
    # ":g" prints whole-number floats without a trailing ".0".
    return f"找到 {len(cases)} 个相似案例,SAT 范围 {min(sat_scores):g}-{max(sat_scores):g}"
def _tool_find_similar_activity_profiles(activity_types: Dict, major: str = "",
                                         sat_band: str = "", n: int = 8) -> str:
    """Rank stored cases by similarity of their activity-type distribution.

    Reads ``activity_hybrid_vectors.json`` from ``DATA_DIR``. The query is a
    vector over the file's ``meta.type_buckets`` (a built-in list of ten
    buckets is used when the file has none); each case is scored by cosine
    similarity between the query and the leading type-bucket slice of its
    ``hybrid_vector``. Scores are multiplied by 1.2 when the majors fall in
    the same group and by 1.1 when the SAT band is identical, so a boosted
    score may exceed 1.

    Args:
        activity_types: Mapping of bucket name to weight; unknown buckets are
            ignored.
        major: Optional major used for the major boost.
        sat_band: Optional SAT band label used for the band boost.
        n: Number of top cases to return.

    Returns:
        JSON string with the top cases, admit/reject counts and short activity
        pattern summaries, or an ``error`` object when the data file is absent.
    """
    av_file = DATA_DIR / "activity_hybrid_vectors.json"
    if not av_file.exists():
        return json.dumps({"error": "activity_hybrid_vectors.json not found"})
    # Explicit UTF-8: the platform default encoding is locale dependent and
    # cannot be relied on for JSON that contains non-ASCII text.
    with open(av_file, encoding="utf-8") as f:
        av_data = json.load(f)

    meta = av_data.get("meta", {})
    type_buckets = meta.get("type_buckets", [
        "Research", "Leadership", "Art", "Sports", "Community",
        "Work", "Tutoring", "Club", "Competition", "Other"
    ])
    # Width of the type-distribution part of each vector. Taken from the
    # bucket list rather than a literal 10 so the query and case vectors
    # always have matching shapes.
    n_types = len(type_buckets)

    # Build the query vector over the type buckets.
    query_type_vec = np.zeros(n_types)
    for bucket, val in activity_types.items():
        if bucket in type_buckets:
            query_type_vec[type_buckets.index(bucket)] = float(val)
    norm = np.linalg.norm(query_type_vec)
    if norm > 0:
        query_type_vec = query_type_vec / norm

    scored = []
    for case in av_data.get("cases", []):
        vec = np.array(case.get("hybrid_vector", []))
        if len(vec) < n_types:
            continue
        case_type_vec = vec[:n_types]
        case_norm = np.linalg.norm(case_type_vec)
        if case_norm == 0:
            continue
        sim = float(np.dot(query_type_vec, case_type_vec) / case_norm)
        # Boost cases in the same major group.
        if major and case.get("major_area", ""):
            if _major_match(major, case["major_area"]):
                sim *= 1.2
        # Boost cases in the same SAT band.
        if sat_band and case.get("sat_band", "") == sat_band:
            sim *= 1.1
        scored.append({
            "similarity": round(sim, 3),
            "school": case.get("school", ""),
            "result": case.get("result", ""),
            "year": case.get("year", ""),
            "major_area": case.get("major_area", ""),
            "sat_band": case.get("sat_band", ""),
            "activity_summary": str(case.get("activity_summary", ""))[:250],
        })

    scored.sort(key=lambda x: -x["similarity"])
    top = scored[:n]
    admits = [c for c in top if c["result"] == "admit"]
    rejects = [c for c in top if c["result"] == "reject"]
    return json.dumps({
        "total_found": len(top),
        "admit_count": len(admits),
        "reject_count": len(rejects),
        "cases": top,
        "admit_activity_patterns": _extract_activity_patterns(admits),
        "reject_activity_patterns": _extract_activity_patterns(rejects),
    }, ensure_ascii=False)
def _major_match(major1: str, major2: str) -> bool:
    """Return True when both majors fall into the same broad group.

    Two groups are recognised, STEM and humanities/social science, each
    defined by a keyword list. Matching is case-insensitive.

    Short keywords (fewer than five letters, e.g. "cs", "stem", "econ") must
    start a word of the major name; longer keywords may appear anywhere. Plain
    substring matching of the short ones is wrong: "cs" is contained in
    "economics", "politics" and "physics", and "stem" in "system", which made
    e.g. Economics count as STEM.
    """
    stem = (
        "cs", "computer", "engineering", "math", "physics", "biology",
        "chemistry", "data", "stem",
        # Previously matched only by accident through the "cs" substring.
        "statistics",
    )
    humanities = (
        "economics", "econ", "history", "philosophy", "politics",
        "sociology", "psychology", "humanities",
    )

    def _mentions(text, keywords):
        words = re.findall(r"[a-z]+", text)
        for keyword in keywords:
            if len(keyword) >= 5:
                if keyword in text:
                    return True
            elif any(word.startswith(keyword) for word in words):
                return True
        return False

    m1 = major1.lower()
    m2 = major2.lower()
    return any(
        _mentions(m1, group) and _mentions(m2, group)
        for group in (stem, humanities)
    )
def _extract_activity_patterns(cases: List[Dict]) -> str:
    """Join the first three non-empty activity summaries of ``cases``.

    Returns:
        "无案例" for an empty case list, otherwise the summaries joined by ";"
        (an empty string when every summary is empty).
    """
    if not cases:
        return "无案例"
    non_empty = []
    for case in cases:
        text = case["activity_summary"]
        if text:
            non_empty.append(text)
    return ";".join(non_empty[:3])
def _tool_get_school_stats(school_name: str, hs_category: str = None) -> str:
    """Return admission statistics for one school as a JSON string.

    Thin wrapper over ``track2.maze_retriever.get_school_admission_stats``;
    ``hs_category`` is forwarded to it as ``hs_cat``.
    """
    from track2.maze_retriever import get_school_admission_stats

    return json.dumps(
        get_school_admission_stats(school_name, hs_cat=hs_category),
        ensure_ascii=False,
    )
def _tool_get_school_taste(school_name: str) -> str:
    """Collect what the knowledge base records about a school's preferences.

    Two optional files under ``DATA_DIR`` are consulted:

    * ``school_taste_updates.json``: the first entry whose key contains, or is
      contained in, the school name (case-insensitive) becomes ``taste``.
    * ``consultant_knowledge_kb.json``: walked recursively; up to five strings
      longer than 30 characters that mention the school are returned as
      ``consultant_notes``, each cut to 200 characters.

    Args:
        school_name: School to look up. A blank name matches nothing (it would
            otherwise be a substring of every key and every string).

    Returns:
        JSON string with ``school``, ``taste``, ``red_flags``, ``preferences``
        and, when the consultant file exists, ``consultant_notes``.
    """
    taste_file = DATA_DIR / "school_taste_updates.json"
    consultant_file = DATA_DIR / "consultant_knowledge_kb.json"
    result = {"school": school_name, "taste": {}, "red_flags": [], "preferences": []}
    school_lower = school_name.strip().lower()

    if school_lower and taste_file.exists():
        # Explicit UTF-8: the platform default encoding is locale dependent.
        with open(taste_file, encoding="utf-8") as f:
            taste_data = json.load(f)
        # Fuzzy match on the school name, in either direction.
        for key, val in taste_data.items():
            key_lower = key.lower()
            if key_lower and (school_lower in key_lower or key_lower in school_lower):
                result["taste"] = val
                break

    if consultant_file.exists():
        max_notes = 5
        relevant = []

        def _search(obj):
            # Stop descending once enough notes were found.
            if len(relevant) >= max_notes:
                return
            if isinstance(obj, str):
                if school_lower in obj.lower() and len(obj) > 30:
                    relevant.append(obj[:200])
            elif isinstance(obj, dict):
                for value in obj.values():
                    _search(value)
            elif isinstance(obj, list):
                for item in obj:
                    _search(item)

        if school_lower:
            with open(consultant_file, encoding="utf-8") as f:
                kb = json.load(f)
            _search(kb)
        result["consultant_notes"] = relevant[:max_notes]

    return json.dumps(result, ensure_ascii=False)
def _tool_analyze_activity_gap(student_activities: str, similar_cases_summary: str,
                               major: str = "", target_school: str = "") -> str:
    """Ask Grok to compare the student's activities with admitted cases.

    The student text is cut to 800 characters and the case summary to 1000
    before being placed in the prompt. One non-tool chat completion is
    requested (60 s timeout, 1000 max tokens).

    Returns:
        JSON string ``{"analysis": <model reply>}``.

    Raises:
        requests.HTTPError: when the API answers with an error status.
    """
    prompt = f"""你是一位资深美本申请顾问。
学生当前活动:
{student_activities[:800]}
相似录取案例的活动模式:
{similar_cases_summary[:1000]}
专业方向:{major or '未知'}
目标学校:{target_school or '未指定'}
请分析:
1. **活动差距**:学生缺少哪类活动?(对比录取案例)
2. **优势**:学生已有的活动中,哪些是加分项?
3. **优先级建议**:最需要补充的 2-3 个活动方向(具体可执行)
4. **时间线**:如果距离申请还有 12 个月,怎么安排?
输出要简洁直接,每点不超过 2 句话。"""
    request_body = {
        "model": "grok-4.20-0309",
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": 1000,
    }
    reply = requests.post(GROK_URL, headers=GROK_HEADERS, json=request_body, timeout=60)
    reply.raise_for_status()
    first_choice = reply.json()["choices"][0]
    return json.dumps({"analysis": first_choice["message"]["content"]}, ensure_ascii=False)
def _tool_get_ps_distillation(major: str, sat: float = None) -> str:
    """Return a trimmed view of the personal-statement knowledge base.

    Reads ``ps_distillation_kb.json`` from ``DATA_DIR`` and returns capped
    slices of its themes, principles, red flags and distilled wisdom. The
    low-SAT patterns are added only when ``sat`` is given and below 1500; the
    Chinese-student section is added when the file has one.

    Args:
        major: Intended major. Accepted for the tool schema; the lookup itself
            does not depend on it.
        sat: Optional SAT score controlling the low-SAT section.

    Returns:
        JSON string with the selected sections, or an ``error`` object when
        the file is missing.
    """
    ps_kb_file = DATA_DIR / "ps_distillation_kb.json"
    if not ps_kb_file.exists():
        return json.dumps({"error": "ps_distillation_kb.json not found"})
    # Explicit UTF-8: the platform default encoding is locale dependent and
    # cannot be relied on for JSON that contains non-ASCII text.
    with open(ps_kb_file, encoding="utf-8") as f:
        ps_kb = json.load(f)

    result = {
        "version": ps_kb.get("meta", {}).get("version", "unknown"),
        "high_success_themes": ps_kb.get("theme_taxonomy", {}).get("high_success_themes", [])[:5],
        "core_principles": ps_kb.get("core_principles", [])[:5],
        "red_flags": ps_kb.get("red_flags", [])[:8],
        "distilled_wisdom": ps_kb.get("distilled_wisdom", "")[:500],
    }
    # Patterns specific to lower SAT scores.
    if sat and sat < 1500:
        result["low_sat_patterns"] = ps_kb.get("low_sat_success_patterns", [])[:5]
    # Advice specific to Chinese students.
    chinese = ps_kb.get("chinese_student_specific", {})
    if chinese:
        result["chinese_specific"] = {
            "common_mistakes": chinese.get("common_mistakes", [])[:4],
            "success_strategies": chinese.get("success_strategies", [])[:3],
        }
    return json.dumps(result, ensure_ascii=False)
def _tool_check_pause_factors(student_profile: Dict, target_schools: List[str] = None) -> str:
    """Check a profile for pause factors and format per-school warnings.

    Delegates to ``agent.pause_factor_checker``. Warnings are produced for
    each target school, or for "Harvard" alone when no schools are given, and
    each warning text is cut to 300 characters.

    Returns:
        JSON string with ``has_pause_factors``, up to five ``pause_factors``
        and the list of ``warnings``.
    """
    from agent.pause_factor_checker import check_student_profile, format_pause_factor_warning

    check = check_student_profile(student_profile, target_schools)
    schools = target_schools or ["Harvard"]
    formatted = (
        (school, format_pause_factor_warning(check, school)) for school in schools
    )
    warnings = [
        {"school": school, "warning": text[:300]}
        for school, text in formatted
        if text
    ]
    summary = {
        "has_pause_factors": check.get("has_pause_factors", False),
        "pause_factors": check.get("pause_factors", [])[:5],
        "warnings": warnings,
    }
    return json.dumps(summary, ensure_ascii=False)
def _tool_search_blog_knowledge(query: str, top_k: int = 5) -> str:
    """Keyword-search the blog knowledge cards.

    Reads ``blog_cards_all.json`` from ``DATA_DIR``. The query is lower-cased
    and split on whitespace; a card scores one point per query word found in
    its ``claim`` or ``primary_tag``. Cards with a positive score are returned
    best first.

    Args:
        query: Free-text query.
        top_k: Number of cards to return.

    Returns:
        JSON string with ``results`` (claim, evidence quote cut to 150
        characters, school, tag) and ``total_matched``, or an ``error`` object
        when the file is missing.
    """
    blog_file = DATA_DIR / "blog_cards_all.json"
    if not blog_file.exists():
        return json.dumps({"error": "blog_cards_all.json not found"})
    # Explicit UTF-8: the platform default encoding is locale dependent.
    with open(blog_file, encoding="utf-8") as f:
        cards = json.load(f)

    # Plain keyword matching keeps this cheap; no ranking model is involved.
    query_words = set(query.lower().split())
    scored = []
    for card in cards:
        # "or ''" guards against fields that are present but null: a null
        # claim would crash on .lower(), a null tag would become "none".
        claim = str(card.get("claim") or "").lower()
        tag = str(card.get("primary_tag") or "").lower()
        score = sum(1 for w in query_words if w in claim or w in tag)
        if score > 0:
            scored.append((score, card))
    scored.sort(key=lambda x: -x[0])

    results = []
    for _score, card in scored[:top_k]:
        evidence = card.get("evidence")
        quote = evidence.get("quote") if isinstance(evidence, dict) else ""
        results.append({
            "claim": card.get("claim", ""),
            "evidence": str(quote or "")[:150],
            "school": card.get("source_school", ""),
            "tag": card.get("primary_tag", ""),
        })
    return json.dumps({"results": results, "total_matched": len(scored)}, ensure_ascii=False)
def _tool_calibrate_school_list(student: Dict, school_list: List[str]) -> str:
    """Calibrate a school list and return a compact JSON view of the result.

    Delegates to ``agent.school_calibrator.calibrate_school_list``. When the
    result is a dict with a ``results`` entry, only the first ten rows are
    kept, reduced to five fields each, together with the first 500 characters
    of ``summary_table``. Any other result is serialised unchanged.
    """
    from agent.school_calibrator import calibrate_school_list

    result = calibrate_school_list(student, school_list)
    if not (isinstance(result, dict) and "results" in result):
        return json.dumps(result, ensure_ascii=False)

    rows = []
    for entry in result["results"][:10]:
        rows.append({
            "school": entry.get("school", ""),
            "probability": entry.get("probability_pct", ""),
            "risk_level": entry.get("risk_level", ""),
            "recommended_round": entry.get("recommended_round", ""),
            "notes": entry.get("notes", "")[:100],
        })
    compact = {
        "school_list": rows,
        "summary": result.get("summary_table", "")[:500],
    }
    return json.dumps(compact, ensure_ascii=False)
def _tool_get_special_case_patterns(pattern_type: str, major: str = "") -> str:
    """Look up special-case admission patterns of one type.

    Reads ``special_case_patterns.json`` from ``DATA_DIR``. If that file is
    missing and ``pattern_type`` is "low_sat_admit", the low-SAT section of
    ``ps_distillation_kb.json`` is used instead.

    Args:
        pattern_type: Key to read from the patterns file.
        major: Optional major. Patterns tagged with a ``major`` outside the
            student's group are dropped, unless that would leave nothing, in
            which case the unfiltered list is kept.

    Returns:
        JSON string with ``pattern_type`` and up to six ``patterns`` (five
        from the fallback), or an ``error`` object when no source is found.
    """
    scp_file = DATA_DIR / "special_case_patterns.json"
    if not scp_file.exists():
        # Fallback: low-SAT patterns stored in the personal-statement KB.
        ps_kb_file = DATA_DIR / "ps_distillation_kb.json"
        if ps_kb_file.exists() and pattern_type == "low_sat_admit":
            # Explicit UTF-8: the platform default encoding is locale dependent.
            with open(ps_kb_file, encoding="utf-8") as f:
                ps_kb = json.load(f)
            return json.dumps({
                "pattern_type": pattern_type,
                "patterns": ps_kb.get("low_sat_success_patterns", [])[:5],
            }, ensure_ascii=False)
        return json.dumps({"error": "special_case_patterns.json not found"})

    with open(scp_file, encoding="utf-8") as f:
        scp = json.load(f)
    patterns = scp.get(pattern_type, [])

    if major:
        def _relevant(pattern):
            # Entries that are not dicts carry no major tag; keep them rather
            # than failing on .get().
            if not isinstance(pattern, dict):
                return True
            tagged_major = pattern.get("major")
            return not tagged_major or _major_match(major, tagged_major)

        filtered = [p for p in patterns if _relevant(p)]
        patterns = filtered if filtered else patterns

    return json.dumps({
        "pattern_type": pattern_type,
        "patterns": patterns[:6],
    }, ensure_ascii=False)
def _tool_get_feeder_school_info(school_name: str, hs_name: str = None) -> str:
    """Return feeder-school data for a university, or a high school's profile.

    Reads ``meiben_feeder_kb.json`` from ``DATA_DIR``.

    Args:
        school_name: University name in English (e.g. 'Stanford University')
            or a common abbreviation (e.g. 'MIT', 'UPenn').
        hs_name: Optional high-school name. When given, the high school's
            admission profile is returned and ``school_name`` is not used.

    Returns:
        JSON string with the matched profile or distribution, or an ``error``
        object when the file or the requested entity is not found.

    University resolution order:
        1. the whole name is a known abbreviation;
        2. case-insensitive substring match in either direction;
        3. a known abbreviation appears as a whole word of the name.
    Abbreviations are compared against whole words so that, for example,
    "Smith College" is not resolved through "mit".
    """
    feeder_file = DATA_DIR / "meiben_feeder_kb.json"
    if not feeder_file.exists():
        return json.dumps({"error": "meiben_feeder_kb.json not found"})
    # Explicit UTF-8: the platform default encoding is locale dependent and
    # the high-school names in this lookup are non-ASCII.
    with open(feeder_file, encoding="utf-8") as f:
        kb = json.load(f)

    # ── High-school profile lookup ──────────────────────────────────────────
    if hs_name:
        hs_profiles = kb.get("high_school_profiles", {})
        matched = None
        for name, profile in hs_profiles.items():
            # "name and": an empty key is a substring of everything.
            if hs_name in name or (name and name in hs_name):
                matched = (name, profile)
                break
        if not matched:
            # Looser pass: ignore case and spaces.
            hs_lower = hs_name.lower().replace(" ", "")
            if hs_lower:
                for name, profile in hs_profiles.items():
                    if hs_lower in name.lower().replace(" ", ""):
                        matched = (name, profile)
                        break
        if matched:
            name, profile = matched
            return json.dumps({
                "hs_name": name,
                "province": profile.get("province"),
                "total_offers": profile.get("total_offers"),
                "t10_ivy_offers": profile.get("t10_ivy_offers"),
                "t15_offers": profile.get("t15_offers"),
                "t10_rate": profile.get("t10_rate"),
                "t15_rate": profile.get("t15_rate"),
                "top_universities": profile.get("top_universities", [])[:12],
                "years_active": profile.get("years_active"),
            }, ensure_ascii=False)
        return json.dumps({"error": f"High school '{hs_name}' not found in database"})

    # ── University feeder distribution ──────────────────────────────────────
    feeder_kb = kb.get("feeder_school_kb", {})
    abbrev_map = {
        "stanford": "Stanford University",
        "harvard": "Harvard University",
        "mit": "Massachusetts Institute of Technology",
        "yale": "Yale University",
        "princeton": "Princeton University",
        "columbia": "Columbia University",
        "upenn": "University of Pennsylvania",
        "penn": "University of Pennsylvania",
        "duke": "Duke University",
        "dartmouth": "Dartmouth College",
        "brown": "Brown University",
        "cornell": "Cornell University",
        "washu": "Washington University in St.Louis",
        "wustl": "Washington University in St.Louis",
    }

    def _find_by_full_name(full_name):
        target = full_name.lower()
        for uni, data in feeder_kb.items():
            if target in uni.lower():
                return uni, data
        return None

    query = school_name.strip().lower()
    matched_uni = None

    # 1. The whole query is an abbreviation: resolve it before fuzzy matching
    #    so a bare "Penn" or "MIT" cannot land on an unrelated name that
    #    merely contains those letters.
    if query in abbrev_map:
        matched_uni = _find_by_full_name(abbrev_map[query])

    # 2. Fuzzy substring match in either direction.
    if not matched_uni and query:
        for uni, data in feeder_kb.items():
            uni_lower = uni.lower()
            if uni_lower and (query in uni_lower or uni_lower in query):
                matched_uni = (uni, data)
                break

    # 3. An abbreviation used as one word inside a longer query.
    if not matched_uni:
        query_words = set(re.findall(r"[a-z0-9]+", query))
        for abbrev, full_name in abbrev_map.items():
            if abbrev in query_words:
                matched_uni = _find_by_full_name(full_name)
                if matched_uni:
                    break

    if not matched_uni:
        available = list(feeder_kb.keys())[:20]
        return json.dumps({"error": f"University '{school_name}' not found", "available_universities": available})

    uni_name, data = matched_uni
    # Province preference is stored separately, keyed by university name.
    province_pref = kb.get("province_preference", {}).get(uni_name, {})
    return json.dumps({
        "university": uni_name,
        "total_mainland_offers": data.get("total_offers"),
        "top_feeder_schools": data.get("top_feeder_schools", [])[:10],
        "province_distribution": data.get("province_distribution", {}),
        "year_trend": data.get("year_trend", {}),
        "province_preference": province_pref,
        "insight": f"{uni_name} 在大陆的主要来源高中集中在 {', '.join(list(data.get('province_distribution', {}).keys())[:3])} 等省市。"
    }, ensure_ascii=False)
def _tool_get_summer_program_recommendation(major: str, sat: float = None,
                                            hs_type: str = "国际高中",
                                            target_tier: str = "T15") -> str:
    """Recommend summer programs and competitions for a target tier.

    Reads the ``summer_program_db`` list of ``meiben_feeder_kb.json`` in
    ``DATA_DIR``, keeps entries whose ``avg_geili_score`` reaches the tier
    threshold (T10: 7.0, T15: 6.0, anything else: 5.0), sorts them by score
    descending and returns up to eight summer programs and six competitions.

    Args:
        major: Intended major; echoed in the response.
        sat: Optional SAT score.
        hs_type: High-school type.
        target_tier: "T10", "T15" or "T20".

    Returns:
        JSON string with ``major``, ``target_tier``, ``recommendations`` and
        an explanatory ``note``. ``recommendations`` is an empty list when the
        data file is missing.

    NOTE(review): ``major``, ``sat`` and ``hs_type`` do not influence which
    programs are selected. The earlier code derived a major category from
    keywords but never applied it; that dead computation was removed. A real
    subject filter needs to know which field of a program record describes
    its subject area -- confirm against the data before adding one.
    """
    feeder_file = DATA_DIR / "meiben_feeder_kb.json"
    programs = []

    if feeder_file.exists():
        # Explicit UTF-8: the platform default encoding is locale dependent.
        with open(feeder_file, encoding="utf-8") as f:
            kb = json.load(f)
        all_programs = kb.get("summer_program_db", [])

        def _score(program):
            # A score that is present but null counts as 0 instead of
            # breaking the comparison.
            return program.get("avg_geili_score") or 0

        # Minimum score required for the requested tier.
        min_geili = 7.0 if target_tier == "T10" else 6.0 if target_tier == "T15" else 5.0
        filtered = sorted(
            (p for p in all_programs if _score(p) >= min_geili),
            key=_score,
            reverse=True,
        )

        programs = {
            "summer_programs": [p for p in filtered if p.get("type") == "summer_program"][:8],
            "competitions": [p for p in filtered if p.get("type") == "competition"][:6],
            "total_filtered": len(filtered),
            "filter_criteria": f"给力指数 >= {min_geili}({target_tier} 目标)",
        }

    return json.dumps({
        "major": major,
        "target_tier": target_tier,
        "recommendations": programs,
        "note": "给力指数 10=顶级(RSI/ISEF),8-9=强力,6-7=良好,5以下=一般",
    }, ensure_ascii=False)
| # ─── SSE 工具 ────────────────────────────────────────────────────────────────── | |
def _sse(event: str, data: Any) -> str:
    """Frame one Server-Sent Events message.

    Args:
        event: Event name placed on the ``event:`` line.
        data: JSON-serialisable payload placed on the ``data:`` line;
            non-ASCII characters are emitted as-is.

    Returns:
        The message text, terminated by a blank line.
    """
    payload = json.dumps(data, ensure_ascii=False)
    return "event: {}\ndata: {}\n\n".format(event, payload)
| # ─── 主 Agentic Loop ─────────────────────────────────────────────────────────── | |
# Instruction text for the planning agent: how to work (analyse, call tools
# for real data, then advise), which tools to prefer for which kind of
# question, and the expected output format (Markdown, cited data sources,
# concrete advice). The text is runtime content and is kept exactly as written.
SYSTEM_PROMPT = """你是一位资深美本申请顾问,拥有访问真实录取数据库的工具集。
你的工作方式:
1. 仔细分析用户的问题和学生信息
2. 主动调用工具获取真实数据(不要凭空推断)
3. 基于工具返回的真实案例数据给出具体建议
4. 当数据足够时,输出完整的规划建议
重要原则:
- 优先调用 find_similar_cases 获取真实案例基准
- 对于活动问题,同时调用 find_similar_activity_profiles
- 对于选校问题,调用 get_school_stats + get_school_taste
- 数据说话:所有建议必须有真实案例或统计数据支撑
- 不要在没有数据的情况下给出录取概率估算
输出格式:
- 使用 Markdown 格式
- 数据来源要标注(如「来自 MAZE 8 个相似案例」)
- 建议要具体可执行,不要泛泛而谈"""
def run_agentic_plan(
    query: str,
    student: Dict,
    stream: bool = True,
) -> Generator[str, None, None]:
    """Run the tool-calling planning loop and stream its progress as SSE.

    Each step sends the conversation (system prompt, student context, prior
    tool calls and results) to Grok with the tool schemas attached. If the
    model asks for tools they are executed and their results appended; if it
    answers without tool calls, or finishes with "stop", that answer is final.
    After ``MAX_STEPS`` steps without a final answer, one more request without
    tools asks the model to conclude from what it has.

    Args:
        query: The user's question in natural language.
        student: Student profile, e.g. sat, toefl, gpa, major, hs_type,
            activities, target_schools.
        stream: Accepted for interface compatibility; the function always
            yields SSE messages regardless of its value.

    Yields:
        SSE-formatted strings with events ``agentic_start``, ``thinking``,
        ``tool_call``, ``tool_result``, ``final_answer``,
        ``max_steps_reached``, ``error`` and ``agentic_done``. After an
        ``error`` raised by a loop request the generator ends without
        ``agentic_done``.
    """
    yield _sse("agentic_start", {
        "message": "规划智能体启动,正在分析问题...",
        "tools_available": [t["function"]["name"] for t in TOOLS],
    })

    student_context = _format_student_context(student)
    messages = [
        # The system prompt carries the agent's working rules; without it the
        # model only ever saw the bare user message.
        {"role": "system", "content": SYSTEM_PROMPT},
        {
            "role": "user",
            "content": f"{student_context}\n\n用户问题:{query}",
        },
    ]

    step = 0
    while step < MAX_STEPS:
        step += 1
        yield _sse("thinking", {"step": step, "message": f"第 {step} 步:推理中..."})
        try:
            resp = requests.post(
                GROK_URL,
                headers=GROK_HEADERS,
                json={
                    "model": "grok-4.20-0309",
                    "messages": messages,
                    "tools": TOOLS,
                    "tool_choice": "auto",
                    "max_tokens": 4000,
                },
                timeout=120,
            )
            resp.raise_for_status()
            # Parsed inside the try so a malformed reply (no "choices", empty
            # list, missing "message") is reported as an SSE error instead of
            # killing the generator with an unhandled exception.
            choice = resp.json()["choices"][0]
            message = choice["message"]
        except Exception as e:
            yield _sse("error", {"message": f"LLM 调用失败:{str(e)}"})
            return

        finish_reason = choice.get("finish_reason", "")
        content = message.get("content") or ""
        # "tool_calls" may be absent or null when the model answers directly.
        tool_calls = message.get("tool_calls") or []

        # Record the assistant turn; tool_calls is attached only when present.
        assistant_turn = {"role": "assistant", "content": content}
        if tool_calls:
            assistant_turn["tool_calls"] = tool_calls
        messages.append(assistant_turn)

        if not tool_calls or finish_reason == "stop":
            # The model chose to give its final answer.
            yield _sse("final_answer", {
                "content": content,
                "steps_taken": step,
            })
            break

        tool_results = []
        for tc in tool_calls:
            function = tc.get("function", {})
            tool_name = function.get("name", "")
            raw_arguments = function.get("arguments", "{}")
            tool_call_id = tc.get("id", f"call_{step}")

            # Arguments normally arrive as a JSON string; a ready-made dict is
            # accepted as well.
            argument_error = None
            if isinstance(raw_arguments, dict):
                tool_input = raw_arguments
            else:
                try:
                    tool_input = json.loads(raw_arguments or "{}")
                except (json.JSONDecodeError, TypeError) as e:
                    tool_input = {}
                    argument_error = f"Invalid JSON in tool arguments: {e}"
                else:
                    if not isinstance(tool_input, dict):
                        tool_input = {}
                        argument_error = "Tool arguments must be a JSON object"

            yield _sse("tool_call", {
                "step": step,
                "tool": tool_name,
                "input": tool_input,
                "call_id": tool_call_id,
            })

            if argument_error is None:
                result_str = _execute_tool(tool_name, tool_input)
            else:
                # Tell the model what was wrong instead of silently calling
                # the tool with no arguments.
                result_str = json.dumps({"error": argument_error, "tool": tool_name})

            # Short preview for the client: first three keys, 100 characters
            # each, or the first 200 characters of anything else.
            try:
                result_preview = json.loads(result_str)
                if isinstance(result_preview, dict):
                    preview_keys = list(result_preview.keys())[:3]
                    preview = {k: str(result_preview[k])[:100] for k in preview_keys}
                else:
                    preview = str(result_preview)[:200]
            except Exception:
                preview = result_str[:200]

            yield _sse("tool_result", {
                "step": step,
                "tool": tool_name,
                "call_id": tool_call_id,
                "preview": preview,
                "result_length": len(result_str),
            })
            tool_results.append({
                "tool_call_id": tool_call_id,
                "role": "tool",
                "content": result_str,
            })

        # Feed every tool result back to the model for the next step.
        messages.extend(tool_results)
    else:
        # The loop ran out of steps without a final answer (no break).
        yield _sse("max_steps_reached", {
            "message": f"已达到最大步数 {MAX_STEPS},输出当前结论",
            "steps_taken": MAX_STEPS,
        })
        # Force a conclusion: same history, no tools offered.
        try:
            resp = requests.post(
                GROK_URL,
                headers=GROK_HEADERS,
                json={
                    "model": "grok-4.20-0309",
                    "messages": messages + [{"role": "user", "content": "请基于以上所有工具结果,输出最终的规划建议。"}],
                    "max_tokens": 3000,
                },
                timeout=120,
            )
            resp.raise_for_status()
            final_content = resp.json()["choices"][0]["message"]["content"] or ""
            yield _sse("final_answer", {"content": final_content, "steps_taken": MAX_STEPS})
        except Exception as e:
            yield _sse("error", {"message": f"最终输出失败:{str(e)}"})

    yield _sse("agentic_done", {"message": "规划完成", "total_steps": step})
def _format_student_context(student: Dict) -> str:
    """Render the student profile as a bullet list for the model.

    Only fields with a truthy value are listed, in a fixed order: SAT, TOEFL,
    GPA, major, high school (``hs_type``, else ``high_school``), activities
    (stringified, first 600 characters), target schools (a list is joined
    with "、") and grade.

    Returns:
        The header line followed by one "- label:value" line per field.
    """
    out = ["【学生档案】"]

    simple_fields = (
        ("sat", "SAT"),
        ("toefl", "TOEFL"),
        ("gpa", "GPA"),
        ("major", "专业方向"),
    )
    for key, label in simple_fields:
        value = student.get(key)
        if value:
            out.append(f"- {label}:{value}")

    high_school = student.get("hs_type") or student.get("high_school")
    if high_school:
        out.append(f"- 高中:{high_school}")

    activities = student.get("activities")
    if activities:
        out.append(f"- 活动:{str(activities)[:600]}")

    targets = student.get("target_schools")
    if targets:
        shown = "、".join(targets) if isinstance(targets, list) else targets
        out.append(f"- 目标学校:{shown}")

    grade = student.get("grade")
    if grade:
        out.append(f"- 年级:{grade}")

    return "\n".join(out)