#!/usr/bin/env python
"""Extract one sample from algorithm-generated.jsonl and save it as a JSON trace file."""

import json
import sys
from pathlib import Path
from typing import Any, Dict, Optional, Tuple


def extract_and_save_sample(
    jsonl_path: str, sample_id: int = 0
) -> Tuple[Optional[Dict[str, Any]], Optional[Dict[str, Any]]]:
    """Extract the sample at index ``sample_id`` and write it as a trace JSON file.

    The JSONL file is read line by line (blank lines are skipped). The chosen
    record is wrapped in a trace document, written to
    ``traces/algorithm_sample_<sample_id>.json`` relative to the current
    working directory, and a matching configuration entry is built.

    Args:
        jsonl_path: Path of the JSONL file; each non-blank line is one JSON
            object. The record must contain the keys ``question``, ``agents``,
            ``id`` and ``trace``; ``trace`` may be a JSON-encoded string or an
            already decoded value.
        sample_id: Zero-based index of the sample among the non-blank lines.

    Returns:
        ``(trace_data, config_entry)`` on success, or ``(None, None)`` when
        ``sample_id`` is outside ``0 .. len(samples) - 1``.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If a line (or a string ``trace``) is not valid JSON.
        KeyError: If a required key is missing from the selected record.
    """
    # Load every non-blank line so the error message can report the max index.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        samples = [json.loads(line) for line in f if line.strip()]

    # Reject negative indices too: Python would otherwise silently count from
    # the end of the list and produce file names such as "algorithm_sample_-1".
    if sample_id < 0 or sample_id >= len(samples):
        print(f"错误:样本ID {sample_id} 超出范围,最大ID为 {len(samples)-1}")
        return None, None

    sample = samples[sample_id]
    question = sample['question']
    agents = sample['agents']

    # Decode the trace once; it may be stored as a JSON string or as a value.
    raw_trace = sample['trace']
    observations = json.loads(raw_trace) if isinstance(raw_trace, str) else raw_trace

    # Build the trace document.
    trace_data = {
        "filename": f"algorithm_sample_{sample_id}.json",
        "title": f"Algorithm Sample {sample_id}: {question[:50]}...",
        "description": f"Multi-agent collaboration sample from algorithm-generated dataset. Agents: {', '.join(agents)}. Question: {question[:100]}...",
        "trace_type": "multi_agent_collaboration",
        "trace_source": "algorithm_generated",
        "tags": ["multi_agent", "algorithm_generated", "real_failure"] + agents,
        "content": {
            "id": f"algorithm_trace_{sample_id}",
            "timestamp": "2025-01-27T00:00:00",
            "metadata": {
                "source": "algorithm-generated.jsonl",
                "original_id": sample['id'],
                "mistake_step": sample.get('mistake_step', 0),
                "mistake_agent": sample.get('mistake_agent', 'unknown'),
                "mistake_reason": sample.get('mistake_reason', 'unknown'),
                "ground_truth": sample.get('ground_truth', 'unknown'),
                "is_correct": sample.get('is_correct', False),
            },
            "data": {
                "question": question,
                "agents": agents,
                "total_observations": len(observations),
            },
            "observations": observations,
        },
    }

    # Build the configuration entry that points at the trace file.
    config_entry = {
        "id": f"algorithm_sample_{sample_id}",
        "name": f"Algorithm Generated Sample {sample_id}",
        "description": trace_data["description"],
        "trace_file": f"traces/algorithm_sample_{sample_id}.json",
        "knowledge_graph_file": f"knowledge_graphs/kg_algorithm_sample_{sample_id}.json",
        "tags": trace_data["tags"],
        "complexity": "advanced",
        "trace_type": trace_data["trace_type"],
        "trace_source": trace_data["trace_source"],
        "features": [
            "multi_agent_collaboration",
            "real_failure_analysis",
            "complex_reasoning",
            "tool_usage",
            "error_patterns",
        ],
    }

    # Save the trace file; create the output directory first so a fresh
    # checkout (without a "traces" folder) does not fail on open().
    trace_filepath = Path(f"traces/algorithm_sample_{sample_id}.json")
    trace_filepath.parent.mkdir(parents=True, exist_ok=True)
    with open(trace_filepath, 'w', encoding='utf-8') as f:
        json.dump(trace_data, f, indent=2, ensure_ascii=False)

    print(f"✅ 成功保存trace文件: {trace_filepath}")
    print(f" 问题: {question[:100]}...")
    print(f" 智能体: {', '.join(agents)}")
    print(f" 观察数量: {len(observations)}")
    print(f" 错误步骤: {sample.get('mistake_step', 'N/A')}")
    print(f" 错误智能体: {sample.get('mistake_agent', 'N/A')}")

    return trace_data, config_entry


def main() -> None:
    """Command-line entry point: ``<jsonl_path> [sample_id]``."""
    if len(sys.argv) < 2:
        # The JSONL path is mandatory (read from argv[1]); say so in the usage.
        print("用法: python extract_algorithm_sample.py <jsonl_path> [sample_id]")
        sys.exit(1)

    jsonl_path = sys.argv[1]
    sample_id = int(sys.argv[2]) if len(sys.argv) > 2 else 0

    print(f"🔄 提取样本 {sample_id} from {jsonl_path}")
    trace_data, config_entry = extract_and_save_sample(jsonl_path, sample_id)

    if trace_data and config_entry:
        print("\n📋 生成的配置条目:")
        print(json.dumps(config_entry, indent=2, ensure_ascii=False))
        print("\n✨ 下一步:运行knowledge graph提取")


if __name__ == "__main__":
    main()