File size: 5,266 Bytes
aceb1b2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
Trace Converter CLI

Command-line interface for converting agent traces from various formats
to Potato's canonical JSONL format.

Usage:
    python -m potato.trace_converter --input traces.json --input-format react --output data.jsonl
    python -m potato.trace_converter --input traces.json --auto-detect --output data.jsonl
    python -m potato.trace_converter --list-formats
"""

import argparse
import json
import logging
import sys
from pathlib import Path

from .registry import converter_registry

logger = logging.getLogger(__name__)


def parse_args(args=None):
    """Build the CLI argument parser and parse ``args``.

    Args:
        args: Sequence of argument strings. ``None`` makes argparse fall back
            to ``sys.argv[1:]``.

    Returns:
        The ``argparse.Namespace`` with ``input``, ``input_format``,
        ``output``, ``auto_detect``, ``list_formats``, ``pretty`` and
        ``verbose`` attributes.
    """
    # (flags, help text) for options that take a value; order here is the
    # order they are registered in, and therefore the order shown by --help.
    value_options = (
        (
            ("--input", "-i"),
            "Input file path (JSON, JSONL, or Parquet)",
        ),
        (
            ("--input-format", "-f"),
            "Input format name (e.g., react, langchain, langfuse, atif, webarena, openai, anthropic, swebench, otel, multi_agent, mcp)",
        ),
        (
            ("--output", "-o"),
            "Output file path (JSONL). Defaults to stdout.",
        ),
    )
    # (flags, help text) for boolean switches that default to False.
    switch_options = (
        (("--auto-detect",), "Auto-detect the input format"),
        (("--list-formats",), "List all supported formats and exit"),
        (
            ("--pretty",),
            "Pretty-print JSON output (one object per line, indented)",
        ),
        (("--verbose", "-v"), "Enable verbose logging"),
    )

    cli = argparse.ArgumentParser(
        prog="potato-trace-convert",
        description="Convert agent traces from various formats to Potato's canonical JSONL format.",
    )
    for flags, help_text in value_options:
        cli.add_argument(*flags, help=help_text)
    for flags, help_text in switch_options:
        cli.add_argument(*flags, action="store_true", help=help_text)

    return cli.parse_args(args)


def load_input(file_path: str):
    """Load input data from a JSON, JSONL, or Parquet file.

    Args:
        file_path: Path to the input file. A ``.parquet`` suffix
            (case-insensitive) selects the Parquet reader; any other file is
            read as UTF-8 text, with an optional leading byte-order mark
            ignored.

    Returns:
        For Parquet, a list with one dict per row. For text, the value of
        ``json.loads`` when the whole file is a single JSON document;
        otherwise a list holding one parsed value per non-blank line (JSONL).
        An empty or whitespace-only file yields an empty list.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
        ValueError: If the text is neither one JSON document nor valid JSONL.
            The message carries the 1-based line number within the file.
    """
    path = Path(file_path)
    if not path.exists():
        raise FileNotFoundError(f"Input file not found: {file_path}")

    # Handle Parquet files
    if path.suffix.lower() == ".parquet":
        # Imported here rather than at module top, so JSON/JSONL inputs can be
        # loaded without pyarrow installed.
        import pyarrow.parquet as pq
        table = pq.read_table(str(path))
        return table.to_pandas().to_dict("records")

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # json.loads would otherwise reject.
    text = path.read_text(encoding="utf-8-sig")

    # Try parsing as a single JSON document first
    try:
        return json.loads(text.strip())
    except json.JSONDecodeError:
        pass

    # Fall back to JSONL (one JSON value per line). Only real line endings are
    # treated as separators: str.splitlines() would also split on characters
    # such as U+2028/U+2029/U+0085, which may appear unescaped inside JSON
    # strings. Raw CR/LF cannot occur inside a JSON string, so normalising
    # them is safe. The unstripped text is split so that reported line
    # numbers match the file even when it starts with blank lines.
    normalized = text.replace("\r\n", "\n").replace("\r", "\n")
    records = []
    for line_num, raw_line in enumerate(normalized.split("\n"), 1):
        line = raw_line.strip()
        if not line:
            continue
        try:
            records.append(json.loads(line))
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON on line {line_num}: {e}") from e
    return records


def main(args=None):
    """Run the trace converter CLI and return a process exit code.

    Args:
        args: Argument list passed to ``parse_args``. ``None`` lets argparse
            read the process command line.

    Returns:
        0 on success (including ``--list-formats``); 1 on a usage error or
        when loading, format detection, conversion, or writing fails. All
        diagnostics are written to stderr so stdout carries only converted
        data.
    """
    parsed = parse_args(args)

    logging.basicConfig(
        level=logging.DEBUG if parsed.verbose else logging.WARNING
    )

    # List formats
    if parsed.list_formats:
        print("Supported trace formats:")
        print()
        for info in converter_registry.list_converters():
            print(f"  {info['format_name']:15s} {info['description']}")
            if info.get('file_extensions'):
                print(f"  {'':15s} Extensions: {', '.join(info['file_extensions'])}")
            print()
        return 0

    # Validate arguments
    if not parsed.input:
        print("Error: --input is required (or use --list-formats)", file=sys.stderr)
        return 1

    # Load input
    try:
        data = load_input(parsed.input)
    except ImportError as e:
        # The Parquet reader imports its dependencies lazily; report a missing
        # package as a normal CLI error instead of a traceback.
        print(f"Error loading input: missing optional dependency ({e})",
              file=sys.stderr)
        return 1
    except (OSError, ValueError) as e:
        # OSError covers FileNotFoundError as well as "is a directory" and
        # permission failures raised while reading.
        print(f"Error loading input: {e}", file=sys.stderr)
        return 1

    # Determine format: an explicit --input-format takes precedence over
    # --auto-detect.
    format_name = parsed.input_format
    if not format_name:
        if not parsed.auto_detect:
            print("Error: --input-format or --auto-detect is required", file=sys.stderr)
            return 1
        format_name = converter_registry.detect_format(data)
        if not format_name:
            print("Error: Could not auto-detect input format. "
                  "Please specify with --input-format.", file=sys.stderr)
            return 1
        print(f"Auto-detected format: {format_name}", file=sys.stderr)

    # Convert
    try:
        traces = converter_registry.convert(format_name, data)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        return 1
    except Exception as e:
        # Top-level boundary: keep the one-line message for users, but make
        # the traceback available when --verbose enabled DEBUG logging.
        logging.getLogger(__name__).debug("Conversion failed", exc_info=True)
        print(f"Conversion error: {e}", file=sys.stderr)
        return 1

    # Output. With --pretty each object is indented and therefore spans
    # several lines.
    indent = 2 if parsed.pretty else None
    output_lines = [
        json.dumps(trace.to_dict(), ensure_ascii=False, indent=indent)
        for trace in traces
    ]
    # Terminate every record with a newline; zero traces yields empty output
    # rather than a single blank line.
    output_text = "".join(f"{line}\n" for line in output_lines)

    if parsed.output:
        out_path = Path(parsed.output)
        try:
            out_path.parent.mkdir(parents=True, exist_ok=True)
            out_path.write_text(output_text, encoding="utf-8")
        except OSError as e:
            print(f"Error writing output: {e}", file=sys.stderr)
            return 1
        # Count the serialized records so the summary does not depend on the
        # converter result supporting len().
        print(f"Converted {len(output_lines)} traces to {parsed.output}", file=sys.stderr)
    else:
        sys.stdout.write(output_text)

    return 0


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())