Spaces:
Running
Running
File size: 4,264 Bytes
5aa050e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 |
#!/usr/bin/env python
"""
实际提取algorithm-generated.jsonl样本并保存为JSON文件
"""
import json
import sys
from pathlib import Path
def extract_and_save_sample(jsonl_path: str, sample_id: int = 0):
    """
    Extract one sample from a JSONL file and save it as a trace JSON file.

    Every non-blank line of ``jsonl_path`` is parsed as one JSON object. The
    sample at zero-based position ``sample_id`` is wrapped in a trace document
    and written to ``traces/algorithm_sample_<sample_id>.json``, relative to
    the current working directory. The ``traces`` directory is created when it
    does not exist yet.

    Args:
        jsonl_path: Path of the UTF-8 encoded JSONL file to read.
        sample_id: Zero-based index of the sample among the non-blank lines.

    Returns:
        A ``(trace_data, config_entry)`` tuple of dicts on success, or
        ``(None, None)`` when ``sample_id`` is negative or past the last
        sample (an error message is printed in that case).

    Raises:
        KeyError: If the selected sample lacks ``question``, ``id`` or
            ``trace``.
        json.JSONDecodeError: If a line of the file, or a string-valued
            ``trace`` field, is not valid JSON.
        OSError: If the input cannot be read or the output cannot be written.
    """
    # Read the JSONL file, skipping blank lines.
    with open(jsonl_path, 'r', encoding='utf-8') as f:
        samples = [json.loads(line) for line in f if line.strip()]

    # Reject negative ids explicitly: Python indexing would otherwise wrap
    # around silently and produce a file named e.g. "algorithm_sample_-1.json".
    if sample_id < 0 or sample_id >= len(samples):
        print(f"错误:样本ID {sample_id} 超出范围,最大ID为 {len(samples)-1}")
        return None, None

    sample = samples[sample_id]
    question = sample['question']
    # A missing agent list is treated as empty everywhere (not only in tags).
    agents = sample.get('agents', [])

    # The trace may be stored either as a JSON-encoded string or as an
    # already-decoded structure; decode it exactly once.
    raw_trace = sample['trace']
    observations = json.loads(raw_trace) if isinstance(raw_trace, str) else raw_trace

    sample_name = f"algorithm_sample_{sample_id}"

    # Build the trace document.
    trace_data = {
        "filename": f"{sample_name}.json",
        "title": f"Algorithm Sample {sample_id}: {question[:50]}...",
        "description": f"Multi-agent collaboration sample from algorithm-generated dataset. Agents: {', '.join(agents)}. Question: {question[:100]}...",
        "trace_type": "multi_agent_collaboration",
        "trace_source": "algorithm_generated",
        "tags": ["multi_agent", "algorithm_generated", "real_failure"] + list(agents),
        "content": {
            "id": f"algorithm_trace_{sample_id}",
            "timestamp": "2025-01-27T00:00:00",
            "metadata": {
                "source": "algorithm-generated.jsonl",
                "original_id": sample['id'],
                "mistake_step": sample.get('mistake_step', 0),
                "mistake_agent": sample.get('mistake_agent', 'unknown'),
                "mistake_reason": sample.get('mistake_reason', 'unknown'),
                "ground_truth": sample.get('ground_truth', 'unknown'),
                "is_correct": sample.get('is_correct', False),
            },
            "data": {
                "question": question,
                "agents": agents,
                "total_observations": len(observations),
            },
            "observations": observations,
        },
    }

    # Build the matching configuration entry.
    config_entry = {
        "id": sample_name,
        "name": f"Algorithm Generated Sample {sample_id}",
        "description": trace_data["description"],
        "trace_file": f"traces/{sample_name}.json",
        "knowledge_graph_file": f"knowledge_graphs/kg_{sample_name}.json",
        "tags": trace_data["tags"],
        "complexity": "advanced",
        "trace_type": trace_data["trace_type"],
        "trace_source": trace_data["trace_source"],
        "features": [
            "multi_agent_collaboration",
            "real_failure_analysis",
            "complex_reasoning",
            "tool_usage",
            "error_patterns",
        ],
    }

    # Save the trace file; create the output directory first so a fresh
    # checkout without "traces/" does not fail with FileNotFoundError.
    trace_filepath = Path(f"traces/{sample_name}.json")
    trace_filepath.parent.mkdir(parents=True, exist_ok=True)
    with open(trace_filepath, 'w', encoding='utf-8') as f:
        json.dump(trace_data, f, indent=2, ensure_ascii=False)

    print(f"✅ 成功保存trace文件: {trace_filepath}")
    print(f" 问题: {question[:100]}...")
    print(f" 智能体: {', '.join(agents)}")
    print(f" 观察数量: {len(observations)}")
    print(f" 错误步骤: {sample.get('mistake_step', 'N/A')}")
    print(f" 错误智能体: {sample.get('mistake_agent', 'N/A')}")
    return trace_data, config_entry
def main(argv=None):
    """
    Command-line entry point: extract one sample and print its config entry.

    Args:
        argv: Argument vector in ``sys.argv`` layout
            (``[prog, jsonl_path, sample_id]``, with ``sample_id`` optional
            and defaulting to 0). ``sys.argv`` is used when omitted.

    Returns:
        Process exit status: 0 on success, 1 when the arguments are missing
        or invalid, or when the requested sample could not be extracted.
    """
    args = sys.argv if argv is None else argv
    usage = "用法: python extract_algorithm_sample.py <jsonl_path> [sample_id]"

    if len(args) < 2:
        print(usage)
        return 1

    jsonl_path = args[1]
    try:
        sample_id = int(args[2]) if len(args) > 2 else 0
    except ValueError:
        # Report a bad sample_id as a usage error instead of a raw traceback.
        print(f"错误:sample_id 必须是整数: {args[2]!r}")
        print(usage)
        return 1

    print(f"🔄 提取样本 {sample_id} from {jsonl_path}")
    trace_data, config_entry = extract_and_save_sample(jsonl_path, sample_id)

    if not (trace_data and config_entry):
        # extract_and_save_sample has already printed the reason; signal the
        # failure to the calling shell through the exit status.
        return 1

    print("\n📋 生成的配置条目:")
    print(json.dumps(config_entry, indent=2, ensure_ascii=False))
    print("\n✨ 下一步:运行knowledge graph提取")
    return 0


if __name__ == "__main__":
    sys.exit(main())
|