victor HF Staff committed on
Commit
7fd314b
·
verified ·
1 Parent(s): c134dd8

Upload parsers.js with huggingface_hub

Browse files
Files changed (1) hide show
  1. parsers.js +621 -0
parsers.js ADDED
@@ -0,0 +1,621 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // parsers.js — Unified session parser for Claude Code, Pi, Codex, and OpenCode
2
+ //
3
+ // Unified Schema:
4
+ // UnifiedSession { id, source, title?, cwd?, model?, provider?, startedAt?, messages[] }
5
+ // UnifiedMessage { id, role, timestamp?, blocks[], usage?, model? }
6
+ // ContentBlock = { type: "text", text }
7
+ // | { type: "thinking", text }
8
+ // | { type: "tool_call", callId, toolName, input }
9
+ // | { type: "tool_result", callId, content, isError? }
10
+
11
/**
 * Sniff which agent produced a parsed JSONL session log.
 *
 * Entries are scanned in order and the tag of the first one matching a known
 * signature is returned; entries that match nothing are skipped.
 *
 * @param {Iterable<*>} lines - Parsed JSONL entries (one value per line).
 * @returns {"claude-code"|"codex"|"pi"|"opencode"|null} Format tag, or `null`
 *   when no entry is recognised or `lines` cannot be iterated.
 */
function detectFormat(lines) {
  // Nothing to inspect: report "unknown" rather than throwing from for...of.
  if (lines == null || typeof lines[Symbol.iterator] !== "function") return null;

  // Hoisted so the list is not re-allocated for every entry.
  const claudeCodeTypes = [
    "user",
    "assistant",
    "system",
    "file-history-snapshot",
    "progress",
  ];

  for (const line of lines) {
    if (!line || typeof line !== "object") continue;
    const { type, payload } = line;

    // Claude Code: has uuid + one of the known entry types.
    if (line.uuid && claudeCodeTypes.includes(type)) return "claude-code";

    // Codex: envelope {timestamp, type, payload} with type=session_meta.
    if (type === "session_meta" && payload) return "codex";

    // Pi: type=session with version + cwd (no payload wrapper).
    if (type === "session" && line.version !== undefined && line.cwd) return "pi";

    // OpenCode: type=session with a directory field.
    if (type === "session" && line.directory) return "opencode";

    // Codex: response_item wrapped in a payload.
    if (type === "response_item" && payload) return "codex";
  }

  return null;
}
37
+
38
+ // ─── Claude Code ─────────────────────────────────────────────
39
+
40
/**
 * Convert parsed Claude Code JSONL entries into a UnifiedSession.
 *
 * Only `user`, `assistant` and `system` entries are used. They are ordered by
 * walking the `parentUuid` tree (non-sidechain entries only, siblings by
 * timestamp), and consecutive assistant entries that share one `message.id`
 * are merged into a single message.
 *
 * @param {Array<object>} lines - Parsed JSONL entries.
 * @returns {object} UnifiedSession with `source: "claude-code"`.
 */
function parseClaudeCode(lines) {
  const session = {
    id: null,
    source: "claude-code",
    title: null,
    cwd: null,
    model: null,
    provider: "anthropic",
    startedAt: null,
    messages: [],
  };

  // Conversation entries only (skips file-history-snapshot, progress, ...).
  const conversationTypes = ["user", "assistant", "system"];
  const entries = lines.filter((l) => l && conversationTypes.includes(l.type));

  if (entries.length > 0) {
    const [first] = entries;
    session.cwd = first.cwd || null;
    session.startedAt = first.timestamp || null;
    session.id = first.promptId || first.uuid || null;
  }

  const sequence = orderClaudeCodeEntries(entries);

  // `currentRawId` is the merge key of `currentMsg`; kept in a local so no
  // temporary property has to be added to (and deleted from) the output.
  let currentMsg = null;
  let currentRawId = null;

  for (const entry of sequence) {
    const msg = entry.message || {};
    const role = entry.type === "system" ? "system" : msg.role || entry.type;

    if (role === "system" && entry.subtype === "turn_duration") continue;

    const blocks = extractClaudeCodeBlocks(msg.content);
    if (blocks.length === 0 && role !== "system") continue;

    const msgId = msg.id || entry.uuid;

    // Merge consecutive assistant chunks with the same message.id; the latest
    // chunk's usage replaces the earlier one.
    if (
      role === "assistant" &&
      currentMsg &&
      currentMsg.role === "assistant" &&
      currentRawId === msgId
    ) {
      currentMsg.blocks.push(...blocks);
      if (msg.usage) currentMsg.usage = mapClaudeCodeUsage(msg.usage);
      continue;
    }

    if (!session.model && msg.model) session.model = msg.model;

    currentMsg = {
      id: entry.uuid || msgId,
      role: normalizeRole(role),
      timestamp: entry.timestamp || null,
      blocks,
      usage: msg.usage ? mapClaudeCodeUsage(msg.usage) : undefined,
      model: msg.model || undefined,
    };
    currentRawId = msgId;
    session.messages.push(currentMsg);
  }

  return session;
}

/**
 * Order conversation entries by a pre-order walk of the `parentUuid` tree.
 *
 * Roots are entries with no `parentUuid` OR whose parent is not among the
 * given entries (for example because the parent was a filtered-out entry).
 * The walk is iterative so long chains cannot overflow the call stack, and
 * each entry is visited at most once so duplicate uuids cannot loop.
 *
 * @param {Array<object>} entries - Conversation entries in file order.
 * @returns {Array<object>} Ordered entries; falls back to `entries` itself
 *   when the walk yields nothing.
 */
function orderClaudeCodeEntries(entries) {
  const knownUuids = new Set();
  for (const e of entries) {
    if (e.uuid) knownUuids.add(e.uuid);
  }

  const roots = [];
  const childrenOf = new Map();
  for (const e of entries) {
    const parent = e.parentUuid || null;
    if (parent === null || !knownUuids.has(parent)) {
      roots.push(e);
      continue;
    }
    if (!childrenOf.has(parent)) childrenOf.set(parent, []);
    childrenOf.get(parent).push(e);
  }

  // Non-sidechain entries, oldest first (sort is stable, so ties and entries
  // without a usable timestamp keep their file order).
  const mainBranch = (list) =>
    list.filter((c) => !c.isSidechain).sort(compareClaudeCodeTimestamps);

  const ordered = [];
  const visited = new Set();
  const stack = mainBranch(roots).reverse();

  while (stack.length > 0) {
    const entry = stack.pop();
    if (visited.has(entry)) continue;
    visited.add(entry);
    ordered.push(entry);

    const children = entry.uuid ? childrenOf.get(entry.uuid) : undefined;
    if (!children) continue;
    const next = mainBranch(children);
    // Push in reverse so the earliest child is popped (visited) first.
    for (let i = next.length - 1; i >= 0; i -= 1) stack.push(next[i]);
  }

  // If the walk produced nothing, fall back to array order.
  return ordered.length > 0 ? ordered : entries;
}

/**
 * Comparator ordering entries by `timestamp`, oldest first. Entries whose
 * timestamp cannot be parsed sort last instead of producing a NaN result.
 *
 * @param {object} a
 * @param {object} b
 * @returns {number} Negative, zero or positive, as `Array.prototype.sort` expects.
 */
function compareClaudeCodeTimestamps(a, b) {
  const toMs = (entry) => {
    const ms = new Date(entry.timestamp).getTime();
    return Number.isNaN(ms) ? Infinity : ms;
  };
  const ta = toMs(a);
  const tb = toMs(b);
  if (ta === tb) return 0;
  return ta < tb ? -1 : 1;
}
140
+
141
/**
 * Translate a Claude Code `message.content` value into unified ContentBlocks.
 *
 * A non-empty string becomes one text block. An array is mapped item by item:
 * `text`, `thinking`, `tool_use` and `tool_result` items are converted, and
 * anything else (including empty text/thinking) is skipped. Any other input
 * yields an empty list.
 *
 * @param {string|Array<object>|*} content - Raw message content.
 * @returns {Array<object>} Unified content blocks.
 */
function extractClaudeCodeBlocks(content) {
  if (typeof content === "string") {
    return content ? [{ type: "text", text: content }] : [];
  }
  if (!Array.isArray(content)) return [];

  return content.flatMap((item) => {
    if (!item || typeof item !== "object") return [];

    const kind = item.type;
    if (kind === "text") {
      return item.text ? [{ type: "text", text: item.text }] : [];
    }
    if (kind === "thinking") {
      return item.thinking ? [{ type: "thinking", text: item.thinking }] : [];
    }
    if (kind === "tool_use") {
      return [
        {
          type: "tool_call",
          callId: item.id || "",
          toolName: item.name || "unknown",
          input: item.input || {},
        },
      ];
    }
    if (kind === "tool_result") {
      return [
        {
          type: "tool_result",
          callId: item.tool_use_id || "",
          content: formatToolResultContent(item.content),
          isError: item.is_error || false,
        },
      ];
    }
    // Unknown item types contribute nothing.
    return [];
  });
}
177
+
178
/**
 * Flatten a Claude Code tool_result `content` value into a single string.
 *
 * - strings are returned unchanged;
 * - `null` / `undefined` become the empty string;
 * - arrays are joined with newlines: string items as-is, `{type: "text"}`
 *   items by their `text`, everything else as JSON;
 * - any other value is JSON-serialised.
 *
 * @param {*} content - Raw tool result content.
 * @returns {string} Always a string, so consumers can rely on the type.
 */
function formatToolResultContent(content) {
  if (typeof content === "string") return content;
  if (content == null) return "";

  // JSON.stringify returns undefined for values it cannot represent
  // (undefined, functions, symbols); normalise that to "".
  const toJson = (value) => JSON.stringify(value) ?? "";

  if (Array.isArray(content)) {
    return content
      .map((c) => {
        if (typeof c === "string") return c;
        // Guard against null/undefined items before reading `.type`.
        if (c == null) return "";
        if (c.type === "text") return c.text || "";
        return toJson(c);
      })
      .join("\n");
  }

  return toJson(content);
}
191
+
192
/**
 * Rename Claude Code token-usage fields to the unified usage shape.
 *
 * @param {object} [usage] - Object with `input_tokens`, `output_tokens`,
 *   `cache_read_input_tokens` and `cache_creation_input_tokens`.
 * @returns {{inputTokens: *, outputTokens: *, cacheRead: *, cacheWrite: *}|undefined}
 *   Unified usage, or `undefined` when `usage` is falsy.
 */
function mapClaudeCodeUsage(usage) {
  if (!usage) return undefined;

  const {
    input_tokens: inputTokens,
    output_tokens: outputTokens,
    cache_read_input_tokens: cacheRead,
    cache_creation_input_tokens: cacheWrite,
  } = usage;

  return { inputTokens, outputTokens, cacheRead, cacheWrite };
}
201
+
202
+ // ─── Pi ──────────────────────────────────────────────────────
203
+
204
/**
 * Convert parsed Pi JSONL entries into a UnifiedSession.
 *
 * `session` entries supply id / cwd / start time, `model_change` entries
 * overwrite provider and model, and every `message` entry that carries a
 * `message` object becomes one unified message. Other entries are ignored.
 *
 * @param {Array<object>} lines - Parsed JSONL entries.
 * @returns {object} UnifiedSession with `source: "pi"`.
 */
function parsePi(lines) {
  const session = {
    id: null,
    source: "pi",
    title: null,
    cwd: null,
    model: null,
    provider: null,
    startedAt: null,
    messages: [],
  };

  for (const line of lines) {
    if (!line) continue;
    const kind = line.type;

    if (kind === "session") {
      session.id = line.id;
      session.cwd = line.cwd;
      session.startedAt = line.timestamp;
    } else if (kind === "model_change") {
      session.provider = line.provider;
      session.model = line.modelId;
    } else if (kind === "message" && line.message) {
      const raw = line.message;
      const blocks = extractPiBlocks(raw.content);
      session.messages.push({
        id: line.id,
        role: normalizeRole(raw.role),
        timestamp: line.timestamp,
        blocks,
        usage: raw.usage ? mapPiUsage(raw.usage) : undefined,
        model: raw.model || undefined,
      });
    }
  }

  return session;
}
250
+
251
/**
 * Translate a Pi `message.content` value into unified ContentBlocks.
 *
 * A non-empty string becomes one text block. An array is mapped item by item:
 * `text`, `thinking`, `toolCall` and `toolResult` items are converted, and
 * anything else is skipped. Any other input yields an empty list.
 *
 * @param {string|Array<object>|*} content - Raw message content.
 * @returns {Array<object>} Unified content blocks; `tool_result.content` is
 *   always a string.
 */
function extractPiBlocks(content) {
  if (!content) return [];
  if (typeof content === "string") return [{ type: "text", text: content }];
  if (!Array.isArray(content)) return [];

  const blocks = [];
  for (const item of content) {
    if (!item || typeof item !== "object") continue;

    switch (item.type) {
      case "text":
        if (item.text) blocks.push({ type: "text", text: item.text });
        break;

      case "thinking":
        if (item.thinking) blocks.push({ type: "thinking", text: item.thinking });
        break;

      case "toolCall":
        blocks.push({
          type: "tool_call",
          callId: item.id || "",
          toolName: item.name || "unknown",
          input: item.arguments || {},
        });
        break;

      case "toolResult": {
        const { result } = item;
        // JSON.stringify(undefined) is undefined, not a string; fall back to
        // "" so a result-less item still yields string content.
        const text =
          typeof result === "string" ? result : (JSON.stringify(result) ?? "");
        blocks.push({
          type: "tool_result",
          callId: item.toolCallId || "",
          content: text,
          isError: item.isError || false,
        });
        break;
      }

      default:
        // Unknown item types contribute nothing.
        break;
    }
  }
  return blocks;
}
290
+
291
/**
 * Rename Pi token-usage fields to the unified usage shape.
 *
 * @param {object} [usage] - Object with `input`, `output`, `cacheRead` and
 *   `cacheWrite`.
 * @returns {{inputTokens: *, outputTokens: *, cacheRead: *, cacheWrite: *}|undefined}
 *   Unified usage, or `undefined` when `usage` is falsy.
 */
function mapPiUsage(usage) {
  if (!usage) return undefined;

  const { input, output, cacheRead, cacheWrite } = usage;
  return {
    inputTokens: input,
    outputTokens: output,
    cacheRead,
    cacheWrite,
  };
}
300
+
301
+ // ─── Codex ───────────────────────────────────────────────────
302
+
303
/**
 * Convert parsed Codex JSONL entries into a UnifiedSession.
 *
 * - `session_meta` supplies id, cwd, start time, provider, model and (from
 *   `git.branch`) the title.
 * - `turn_context` supplies the model when none is known yet.
 * - `response_item` payloads become messages: `message` items map to
 *   user/assistant messages (system/developer ones are skipped),
 *   `function_call` / `local_shell_call` / `reasoning` items are attached to
 *   the current assistant message, and `function_call_output` items become
 *   user messages holding a tool_result block.
 *
 * The "current assistant message" is cleared whenever a user message or tool
 * result is emitted, so later tool calls are never appended to a message that
 * sits before newer output; a fresh assistant message is created on demand.
 *
 * @param {Array<object>} lines - Parsed JSONL entries.
 * @returns {object} UnifiedSession with `source: "codex"`.
 */
function parseCodex(lines) {
  const session = {
    id: null,
    source: "codex",
    title: null,
    cwd: null,
    model: null,
    provider: null,
    startedAt: null,
    messages: [],
  };

  // Assistant message that tool calls / reasoning may still be attached to.
  let lastAssistant = null;

  // Attach `block` to the current assistant message, creating one first when
  // there is none, so calls and reasoning are never silently dropped.
  const attachToAssistant = (block, timestamp, idPrefix, prepend = false) => {
    if (!lastAssistant) {
      lastAssistant = {
        id: `${idPrefix}-${session.messages.length}`,
        role: "assistant",
        timestamp,
        blocks: [],
      };
      session.messages.push(lastAssistant);
    }
    if (prepend) {
      lastAssistant.blocks.unshift(block);
    } else {
      lastAssistant.blocks.push(block);
    }
  };

  for (const line of lines) {
    if (!line) continue;
    const payload = line.payload || {};

    if (line.type === "session_meta") {
      session.id = payload.id ?? null;
      session.cwd = payload.cwd ?? null;
      session.startedAt = payload.timestamp || line.timestamp || null;
      session.provider = payload.model_provider ?? null;
      session.model = payload.model ?? null;
      if (payload.git) session.title = payload.git.branch ?? null;
      continue;
    }

    if (line.type === "turn_context") {
      if (payload.model) session.model = session.model || payload.model;
      continue;
    }

    if (line.type !== "response_item") continue;

    switch (payload.type) {
      case "message": {
        const role = normalizeRole(payload.role);
        // Developer/system messages are instructions, not conversation.
        if (role === "system") break;

        const blocks = extractCodexBlocks(payload.content);
        if (blocks.length === 0) break;

        const msg = {
          id: payload.id || `codex-${session.messages.length}`,
          role,
          timestamp: line.timestamp,
          blocks,
          usage: payload.usage ? mapCodexUsage(payload.usage) : undefined,
          model: payload.model || undefined,
        };
        session.messages.push(msg);
        // A user message ends the assistant message being built.
        lastAssistant = role === "assistant" ? msg : null;
        break;
      }

      case "function_call":
        attachToAssistant(
          {
            type: "tool_call",
            callId: payload.call_id || "",
            toolName: payload.name || "unknown",
            input: tryParseJSON(payload.arguments),
          },
          line.timestamp,
          "codex-fc"
        );
        break;

      case "local_shell_call":
        attachToAssistant(
          {
            type: "tool_call",
            callId: payload.id || "",
            toolName: "shell",
            input: payload.action || {},
          },
          line.timestamp,
          "codex-lsc"
        );
        break;

      case "function_call_output": {
        const { output } = payload;
        let content = "";
        if (typeof output === "string") {
          content = output;
        } else if (output != null) {
          // Keep tool_result content a string even for structured output.
          content = JSON.stringify(output) ?? "";
        }
        session.messages.push({
          id: `codex-fco-${session.messages.length}`,
          role: "user",
          timestamp: line.timestamp,
          blocks: [
            {
              type: "tool_result",
              callId: payload.call_id || "",
              content,
              isError: false,
            },
          ],
        });
        // Anything after a tool result belongs to a new assistant message.
        lastAssistant = null;
        break;
      }

      case "reasoning": {
        // Prefer the readable summary; only fall back to a placeholder when
        // there is none (an item may carry both summary and encrypted text).
        const summaryText = Array.isArray(payload.summary)
          ? payload.summary.map((s) => s?.text || "").join("\n")
          : "";
        const placeholder = payload.encrypted_content
          ? "(encrypted reasoning)"
          : "(reasoning)";
        attachToAssistant(
          { type: "thinking", text: summaryText || placeholder },
          line.timestamp,
          "codex-reasoning",
          true // reasoning is shown ahead of the message's other blocks
        );
        break;
      }

      default:
        break;
    }
  }

  return session;
}
416
+
417
/**
 * Translate a Codex message `content` value into unified text blocks.
 *
 * A non-empty string becomes one text block. In an array, only
 * `output_text` / `input_text` items with non-empty `text` are kept. Any
 * other input yields an empty list.
 *
 * @param {string|Array<object>|*} content - Raw message content.
 * @returns {Array<{type: "text", text: string}>} Unified text blocks.
 */
function extractCodexBlocks(content) {
  if (typeof content === "string") {
    return content === "" ? [] : [{ type: "text", text: content }];
  }
  if (!Array.isArray(content)) return [];

  const textKinds = ["output_text", "input_text"];
  const isTextItem = (item) =>
    Boolean(
      item &&
        typeof item === "object" &&
        textKinds.includes(item.type) &&
        item.text
    );

  return content
    .filter(isTextItem)
    .map((item) => ({ type: "text", text: item.text }));
}
433
+
434
/**
 * Map a Codex usage object to the unified usage shape.
 *
 * Each field is read from its camelCase/short name first (`input`, `output`,
 * `cacheRead`, `cacheWrite`) and falls back to the snake_case name
 * (`input_tokens`, `output_tokens`, `cache_read_input_tokens`,
 * `cache_creation_input_tokens`) only when the first one is null/undefined.
 *
 * @param {object} [usage] - Raw usage object.
 * @returns {{inputTokens: *, outputTokens: *, cacheRead: *, cacheWrite: *}|undefined}
 *   Unified usage, or `undefined` when `usage` is falsy.
 */
function mapCodexUsage(usage) {
  if (!usage) return undefined;
  // `??` rather than `||`: a count of 0 is a real value, not a missing one.
  return {
    inputTokens: usage.input ?? usage.input_tokens,
    outputTokens: usage.output ?? usage.output_tokens,
    cacheRead: usage.cacheRead ?? usage.cache_read_input_tokens,
    cacheWrite: usage.cacheWrite ?? usage.cache_creation_input_tokens,
  };
}
443
+
444
+ // ─── OpenCode ────────────────────────────────────────────────
445
+
446
/**
 * Convert parsed OpenCode export entries into a UnifiedSession.
 *
 * `session` entries supply id / title / directory / start time. `message`
 * entries are collected and sorted by `time_created`; `part` entries are
 * grouped by `message_id`, sorted by `time_created`, and turned into the
 * message's content blocks. Messages that end up with no blocks are omitted.
 *
 * @param {Array<object>} lines - Parsed JSONL entries.
 * @returns {object} UnifiedSession with `source: "opencode"`.
 */
function parseOpenCode(lines) {
  const session = {
    id: null,
    source: "opencode",
    title: null,
    cwd: null,
    model: null,
    provider: null,
    startedAt: null,
    messages: [],
  };

  // `time_created` -> ISO string; null when missing or not a valid date, so
  // one bad timestamp cannot abort the whole parse with a RangeError.
  const toIso = (value) => {
    if (!value) return null;
    const date = new Date(value);
    return Number.isNaN(date.getTime()) ? null : date.toISOString();
  };

  // Always yields a string (JSON.stringify(undefined) is undefined).
  const toText = (value) =>
    typeof value === "string" ? value : (JSON.stringify(value) ?? "");

  const byTimeCreated = (a, b) => (a.time_created || 0) - (b.time_created || 0);

  const messageParts = new Map(); // messageId -> part[]
  const messageEntries = [];

  for (const line of lines) {
    if (!line) continue;
    switch (line.type) {
      case "session":
        session.id = line.id;
        session.title = line.title;
        session.cwd = line.directory;
        session.startedAt = toIso(line.time_created);
        break;

      case "message":
        messageEntries.push(line);
        break;

      case "part":
        if (!messageParts.has(line.message_id)) {
          messageParts.set(line.message_id, []);
        }
        messageParts.get(line.message_id).push(line);
        break;

      default:
        break;
    }
  }

  messageEntries.sort(byTimeCreated);

  for (const msg of messageEntries) {
    const data = msg.data || {};
    const role = normalizeRole(data.role);
    const parts = messageParts.get(msg.id) || [];
    parts.sort(byTimeCreated);

    if (!session.model && data.model) {
      session.model = data.model.modelID || null;
      session.provider = data.model.providerID || null;
    }

    const blocks = [];
    for (const part of parts) {
      const pd = part.data || {};
      switch (pd.type) {
        case "text":
          if (pd.text) blocks.push({ type: "text", text: pd.text });
          break;

        case "reasoning":
          // Reasoning text maps to the schema's dedicated "thinking" block
          // rather than being mixed into the visible text.
          if (pd.text) blocks.push({ type: "thinking", text: pd.text });
          break;

        case "tool-invocation":
        case "tool": {
          // `tool` may be an object ({name, args}) or a plain string name.
          // NOTE(review): the `callID` / `state.input` fallbacks assume
          // OpenCode tool parts shaped {callID, tool, state: {input}} —
          // confirm against the exporter.
          const toolField = pd.tool;
          const nameFromTool =
            typeof toolField === "string" ? toolField : toolField?.name;
          blocks.push({
            type: "tool_call",
            callId: pd.toolCallId || pd.callID || pd.id || "",
            toolName: pd.toolName || nameFromTool || "unknown",
            input:
              pd.args || toolField?.args || pd.input || pd.state?.input || {},
          });
          break;
        }

        case "tool-result":
          blocks.push({
            type: "tool_result",
            callId: pd.toolCallId || "",
            content: toText(pd.result),
            isError: pd.isError || false,
          });
          break;

        case "step-start":
        case "step-finish":
          // Lifecycle markers carry no content.
          break;

        default:
          if (pd.text) blocks.push({ type: "text", text: pd.text });
      }
    }

    if (blocks.length === 0) continue;

    const usage = data.tokens
      ? {
          inputTokens: data.tokens.input,
          outputTokens: data.tokens.output,
        }
      : undefined;

    session.messages.push({
      id: msg.id,
      role,
      timestamp: toIso(msg.time_created),
      blocks,
      usage,
      model: data.model?.modelID || undefined,
    });
  }

  return session;
}
559
+
560
+ // ─── Shared Helpers ──────────────────────────────────────────
561
+
562
/**
 * Collapse a source-specific role name onto the unified role set.
 *
 * `"developer"` and `"system"` map to `"system"`, `"assistant"` stays
 * `"assistant"`, and everything else — including missing or non-string
 * values — maps to `"user"`. Matching is case-insensitive.
 *
 * @param {*} role - Raw role value from a session entry.
 * @returns {"system"|"assistant"|"user"} Unified role.
 */
function normalizeRole(role) {
  // Non-string roles (numbers, objects, ...) have no toLowerCase(); treat
  // them like a missing role instead of throwing.
  if (typeof role !== "string" || role === "") return "user";

  const r = role.toLowerCase();
  if (r === "developer" || r === "system") return "system";
  if (r === "assistant") return "assistant";
  return "user";
}
569
+
570
/**
 * Best-effort JSON decoding.
 *
 * @param {*} str - Value to decode.
 * @returns {*} The parsed value when `str` is a string holding valid JSON;
 *   otherwise `str` itself, unchanged.
 */
function tryParseJSON(str) {
  if (typeof str === "string") {
    try {
      return JSON.parse(str);
    } catch {
      // Not JSON: fall through and hand back the original text.
    }
  }
  return str;
}
578
+
579
+ // ─── Entry Point ─────────────────────────────────────────────
580
+
581
/**
 * Parse a JSONL session log from any supported agent into a UnifiedSession.
 *
 * The text is split on newlines; blank lines, lines that are not valid JSON
 * and lines whose parsed value is falsy are discarded. The format is then
 * detected from the remaining entries and the matching parser is invoked.
 *
 * @param {string} jsonlText - Raw JSONL text.
 * @returns {object} UnifiedSession produced by the format-specific parser.
 * @throws {Error} "Unknown session format" when no format is detected, or
 *   "Unsupported format: …" when the detected tag has no parser.
 */
function parseSession(jsonlText) {
  const lines = [];
  for (const rawLine of jsonlText.split("\n")) {
    if (!rawLine.trim()) continue;

    let parsed;
    try {
      parsed = JSON.parse(rawLine);
    } catch {
      // Best effort: a malformed line is skipped, not fatal.
      continue;
    }
    if (parsed) lines.push(parsed);
  }

  const format = detectFormat(lines);
  if (!format) throw new Error("Unknown session format");

  if (format === "claude-code") return parseClaudeCode(lines);
  if (format === "pi") return parsePi(lines);
  if (format === "codex") return parseCodex(lines);
  if (format === "opencode") return parseOpenCode(lines);

  throw new Error(`Unsupported format: ${format}`);
}
610
+
611
// Publish the parser API through CommonJS when a `module.exports` object is
// available; in environments without `module` this statement does nothing.
if (typeof module !== "undefined" && module.exports) {
  const publicApi = {
    parseSession,
    detectFormat,
    parseClaudeCode,
    parsePi,
    parseCodex,
    parseOpenCode,
  };
  module.exports = publicApi;
}