Spaces:
Running
Running
| #!/usr/bin/env python | |
| """ | |
| 实际提取algorithm-generated.jsonl样本并保存为JSON文件 | |
| """ | |
| import json | |
| import sys | |
| from pathlib import Path | |
def extract_and_save_sample(jsonl_path: str, sample_id: int = 0):
    """Extract one sample from a JSONL file and save it as a trace JSON file.

    Blank lines in the input are skipped, so ``sample_id`` is the zero-based
    index among the non-blank lines. Only the selected line is JSON-decoded.
    The trace is written to ``traces/algorithm_sample_<sample_id>.json``
    relative to the current working directory; the ``traces`` directory is
    created when it does not exist yet.

    Args:
        jsonl_path: Path of the UTF-8 encoded JSONL file to read.
        sample_id: Zero-based index of the sample to extract.

    Returns:
        A ``(trace_data, config_entry)`` pair of dicts on success, or
        ``(None, None)`` when ``sample_id`` is negative or past the last
        sample (an error message is printed in that case).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
        json.JSONDecodeError: If the selected line, or its string-valued
            ``trace`` field, is not valid JSON.
        KeyError: If the sample lacks ``id``, ``question``, ``agents`` or
            ``trace``.
    """
    # Keep only the raw non-blank lines; decoding is deferred to the one
    # record that is actually requested.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        records = [line for line in f if line.strip()]

    # Reject negative ids too: they would otherwise index from the end of the
    # list and produce a file name such as "algorithm_sample_-1.json".
    if not 0 <= sample_id < len(records):
        print(f"错误:样本ID {sample_id} 超出范围,最大ID为 {len(records)-1}")
        return None, None

    sample = json.loads(records[sample_id])
    question = sample['question']
    agents = sample['agents']

    # The trace is stored either as a JSON-encoded string or as an already
    # decoded value; decode it once and reuse the result.
    raw_trace = sample['trace']
    observations = json.loads(raw_trace) if isinstance(raw_trace, str) else raw_trace

    trace_data = {
        "filename": f"algorithm_sample_{sample_id}.json",
        "title": f"Algorithm Sample {sample_id}: {question[:50]}...",
        "description": f"Multi-agent collaboration sample from algorithm-generated dataset. Agents: {', '.join(agents)}. Question: {question[:100]}...",
        "trace_type": "multi_agent_collaboration",
        "trace_source": "algorithm_generated",
        "tags": ["multi_agent", "algorithm_generated", "real_failure"] + agents,
        "content": {
            "id": f"algorithm_trace_{sample_id}",
            "timestamp": "2025-01-27T00:00:00",
            "metadata": {
                "source": "algorithm-generated.jsonl",
                "original_id": sample['id'],
                "mistake_step": sample.get('mistake_step', 0),
                "mistake_agent": sample.get('mistake_agent', 'unknown'),
                "mistake_reason": sample.get('mistake_reason', 'unknown'),
                "ground_truth": sample.get('ground_truth', 'unknown'),
                "is_correct": sample.get('is_correct', False),
            },
            "data": {
                "question": question,
                "agents": agents,
                "total_observations": len(observations),
            },
            "observations": observations,
        },
    }

    # Configuration entry that points at the trace file written below and at
    # the knowledge-graph file path derived from the same sample id.
    config_entry = {
        "id": f"algorithm_sample_{sample_id}",
        "name": f"Algorithm Generated Sample {sample_id}",
        "description": trace_data["description"],
        "trace_file": f"traces/algorithm_sample_{sample_id}.json",
        "knowledge_graph_file": f"knowledge_graphs/kg_algorithm_sample_{sample_id}.json",
        "tags": trace_data["tags"],
        "complexity": "advanced",
        "trace_type": trace_data["trace_type"],
        "trace_source": trace_data["trace_source"],
        "features": [
            "multi_agent_collaboration",
            "real_failure_analysis",
            "complex_reasoning",
            "tool_usage",
            "error_patterns",
        ],
    }

    # Write the trace file, creating the output directory on first use.
    trace_filepath = Path(f"traces/algorithm_sample_{sample_id}.json")
    trace_filepath.parent.mkdir(parents=True, exist_ok=True)
    with open(trace_filepath, 'w', encoding='utf-8') as f:
        json.dump(trace_data, f, indent=2, ensure_ascii=False)

    print(f"✅ 成功保存trace文件: {trace_filepath}")
    print(f" 问题: {question[:100]}...")
    print(f" 智能体: {', '.join(agents)}")
    print(f" 观察数量: {len(observations)}")
    print(f" 错误步骤: {sample.get('mistake_step', 'N/A')}")
    print(f" 错误智能体: {sample.get('mistake_agent', 'N/A')}")
    return trace_data, config_entry
if __name__ == "__main__":
    # Command-line entry point: <jsonl_path> is required, [sample_id] is
    # optional and defaults to 0.
    if len(sys.argv) < 2:
        print("用法: python extract_algorithm_sample.py <jsonl_path> [sample_id]")
        sys.exit(1)

    jsonl_path = sys.argv[1]
    try:
        sample_id = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    except ValueError:
        # Report a malformed id as a usage error instead of a raw traceback.
        print(f"错误:样本ID必须是整数: {sys.argv[2]!r}")
        print("用法: python extract_algorithm_sample.py <jsonl_path> [sample_id]")
        sys.exit(1)

    print(f"🔄 提取样本 {sample_id} from {jsonl_path}")
    trace_data, config_entry = extract_and_save_sample(jsonl_path, sample_id)

    # The extractor has already printed the reason; exit non-zero so that
    # shell callers can detect the failure.
    if not (trace_data and config_entry):
        sys.exit(1)

    print("\n📋 生成的配置条目:")
    print(json.dumps(config_entry, indent=2, ensure_ascii=False))
    print("\n✨ 下一步:运行knowledge graph提取")