File size: 4,264 Bytes
5aa050e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
"""
实际提取algorithm-generated.jsonl样本并保存为JSON文件
"""

import json
import sys
from pathlib import Path

def extract_and_save_sample(jsonl_path: str, sample_id: int = 0):
    """
    Extract one sample from a JSONL file and save it as a trace JSON file.

    Every non-blank line of ``jsonl_path`` is parsed as a JSON object. The
    sample at position ``sample_id`` is wrapped in a trace document and
    written to ``traces/algorithm_sample_<sample_id>.json`` relative to the
    current working directory. The ``traces`` directory is created when it
    does not exist yet.

    Args:
        jsonl_path: Path of the UTF-8 encoded JSONL file to read.
        sample_id: 0-based index of the sample among the non-blank lines.

    Returns:
        A ``(trace_data, config_entry)`` tuple of dicts on success, or
        ``(None, None)`` when ``sample_id`` is outside ``0..len-1`` (an
        error message is printed in that case).

    Raises:
        KeyError: If the selected sample lacks one of the required keys
            ``question``, ``agents``, ``id`` or ``trace``.
        json.JSONDecodeError: If a line of the file, or a string-valued
            ``trace`` field, is not valid JSON.
        OSError: If the input cannot be read or the output cannot be written.
    """

    # Load every non-blank line; the total count is needed for the range check.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        samples = [json.loads(line) for line in f if line.strip()]

    # Reject negative ids as well: they would otherwise index from the end of
    # the list and produce output files with a negative number in their name.
    if not 0 <= sample_id < len(samples):
        print(f"错误:样本ID {sample_id} 超出范围,最大ID为 {len(samples)-1}")
        return None, None

    sample = samples[sample_id]
    question = sample['question']
    agents = sample['agents']

    # The trace may be stored either as a JSON-encoded string or as an
    # already-decoded value; decode it once and reuse the result.
    raw_trace = sample['trace']
    observations = json.loads(raw_trace) if isinstance(raw_trace, str) else raw_trace

    # Build the trace document.
    trace_data = {
        "filename": f"algorithm_sample_{sample_id}.json",
        "title": f"Algorithm Sample {sample_id}: {question[:50]}...",
        "description": f"Multi-agent collaboration sample from algorithm-generated dataset. Agents: {', '.join(agents)}. Question: {question[:100]}...",
        "trace_type": "multi_agent_collaboration",
        "trace_source": "algorithm_generated",
        "tags": ["multi_agent", "algorithm_generated", "real_failure"] + agents,
        "content": {
            "id": f"algorithm_trace_{sample_id}",
            "timestamp": "2025-01-27T00:00:00",
            "metadata": {
                "source": "algorithm-generated.jsonl",
                "original_id": sample['id'],
                "mistake_step": sample.get('mistake_step', 0),
                "mistake_agent": sample.get('mistake_agent', 'unknown'),
                "mistake_reason": sample.get('mistake_reason', 'unknown'),
                "ground_truth": sample.get('ground_truth', 'unknown'),
                "is_correct": sample.get('is_correct', False)
            },
            "data": {
                "question": question,
                "agents": agents,
                "total_observations": len(observations)
            },
            "observations": observations
        }
    }

    # Build the configuration entry that refers to the trace file.
    config_entry = {
        "id": f"algorithm_sample_{sample_id}",
        "name": f"Algorithm Generated Sample {sample_id}",
        "description": trace_data["description"],
        "trace_file": f"traces/algorithm_sample_{sample_id}.json",
        "knowledge_graph_file": f"knowledge_graphs/kg_algorithm_sample_{sample_id}.json",
        "tags": trace_data["tags"],
        "complexity": "advanced",
        "trace_type": trace_data["trace_type"],
        "trace_source": trace_data["trace_source"],
        "features": [
            "multi_agent_collaboration",
            "real_failure_analysis",
            "complex_reasoning",
            "tool_usage",
            "error_patterns"
        ]
    }

    # Save the trace file, creating the output directory on first use so the
    # write does not fail with FileNotFoundError.
    trace_filepath = Path(f"traces/algorithm_sample_{sample_id}.json")
    trace_filepath.parent.mkdir(parents=True, exist_ok=True)
    with open(trace_filepath, 'w', encoding='utf-8') as f:
        json.dump(trace_data, f, indent=2, ensure_ascii=False)

    print(f"✅ 成功保存trace文件: {trace_filepath}")
    print(f"   问题: {question[:100]}...")
    print(f"   智能体: {', '.join(agents)}")
    print(f"   观察数量: {len(observations)}")
    print(f"   错误步骤: {sample.get('mistake_step', 'N/A')}")
    print(f"   错误智能体: {sample.get('mistake_agent', 'N/A')}")

    return trace_data, config_entry

if __name__ == "__main__":
    # Command-line entry point: <jsonl_path> is required, [sample_id] is an
    # optional integer that defaults to 0.
    if len(sys.argv) < 2:
        print("用法: python extract_algorithm_sample.py <jsonl_path> [sample_id]")
        sys.exit(1)

    jsonl_path = sys.argv[1]
    try:
        sample_id = int(sys.argv[2]) if len(sys.argv) > 2 else 0
    except ValueError:
        # Report a non-integer sample_id as a usage error instead of letting
        # the ValueError traceback reach the user.
        print(f"错误:sample_id 必须是整数,收到 {sys.argv[2]!r}")
        print("用法: python extract_algorithm_sample.py <jsonl_path> [sample_id]")
        sys.exit(1)

    print(f"🔄 提取样本 {sample_id} from {jsonl_path}")
    trace_data, config_entry = extract_and_save_sample(jsonl_path, sample_id)

    if trace_data and config_entry:
        print("\n📋 生成的配置条目:")
        print(json.dumps(config_entry, indent=2, ensure_ascii=False))
        print("\n✨ 下一步:运行knowledge graph提取")
    else:
        # Extraction failed (the function already printed the reason);
        # signal the failure to the calling shell through the exit status.
        sys.exit(1)