MetaCortex-Dynamics commited on
Commit
6ecf2dc
·
verified ·
1 Parent(s): 80768d2

Create pipeline/ingest/claude_archive.py

Browse files
Files changed (1) hide show
  1. pipeline/ingest/claude_archive.py +78 -0
pipeline/ingest/claude_archive.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Ingest Claude/Anthropic conversation archives into the governed pipeline.
3
+
4
+ Reads the bulk conversations.json from Claude data export.
5
+ Format: conversations[].chat_messages[].{sender, text, created_at}
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import json
11
+ import zipfile
12
+ from datetime import datetime, timezone
13
+ from hashlib import sha256
14
+ from pathlib import Path
15
+ from typing import Iterator
16
+
17
+ from pipeline.stages.s1_segment import Segment
18
+ from pipeline.types import SourceProvenance, Tier
19
+
20
+
21
+ def ingest_claude_archive(zip_path: str | Path) -> Iterator[Segment]:
22
+ """Ingest the Claude data export ZIP."""
23
+ zip_path = Path(zip_path)
24
+ file_hash = sha256(zip_path.read_bytes()).hexdigest()
25
+
26
+ with zipfile.ZipFile(zip_path) as z:
27
+ with z.open("conversations.json") as f:
28
+ conversations = json.load(f)
29
+
30
+ for conv in conversations:
31
+ conv_id = conv.get("uuid", "unknown")
32
+ name = conv.get("name", "untitled")
33
+ created = conv.get("created_at", "")
34
+
35
+ provenance = SourceProvenance(
36
+ source_id=f"claude:{conv_id}",
37
+ tier=Tier.T3,
38
+ url=f"claude-export/{name[:60]}",
39
+ commit_or_version=created,
40
+ license="proprietary",
41
+ acquired_at=datetime.now(timezone.utc).isoformat(),
42
+ artifact_sha256=file_hash,
43
+ )
44
+
45
+ messages = conv.get("chat_messages", [])
46
+ offset = 0
47
+
48
+ for msg in messages:
49
+ sender = msg.get("sender", "unknown")
50
+ if sender == "system":
51
+ continue
52
+
53
+ # Claude format: text field OR content field (content may be list)
54
+ text = msg.get("text", "")
55
+ if not text:
56
+ content = msg.get("content", "")
57
+ if isinstance(content, list):
58
+ text = " ".join(
59
+ c.get("text", "") if isinstance(c, dict) else str(c)
60
+ for c in content
61
+ )
62
+ elif isinstance(content, str):
63
+ text = content
64
+
65
+ text = text.strip()
66
+ if not text:
67
+ continue
68
+
69
+ byte_start = offset
70
+ byte_end = offset + len(text.encode("utf-8"))
71
+ offset = byte_end
72
+
73
+ yield Segment(
74
+ source=provenance,
75
+ byte_range=(byte_start, byte_end),
76
+ text=text,
77
+ segment_type=f"chat_{sender}",
78
+ )