File size: 19,903 Bytes
77bcbf1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
"""
HuggingFace Hub Integration

Push and pull dataset provenance to/from HuggingFace Hub.

Exports complete W3C PROV-O accountability bundle:
- cascade_provenance.json (CASCADE native format)
- prov_o.jsonld (W3C PROV-O JSON-LD - interoperable)
- prov_n.txt (W3C PROV-N notation - human readable)
- activities.jsonl (Activity log for audit)
- agents.json (Agent attributions)
- croissant.json (MLCommons Croissant)
"""

import json
import time
from datetime import datetime, timezone
from typing import Any, Dict, List, Optional

from .provenance import ProvenanceGraph
from .croissant import CroissantExporter


class AccountabilityBundle:
    """
    Complete W3C PROV-O accountability package.

    When a dataset is extracted, this bundle provides a full audit trail:
    - Who created/modified it (agents)
    - What transformations occurred (activities)
    - Where it came from (entity lineage)
    - When everything happened (timestamps)
    - How to verify integrity (hashes)
    """

    def __init__(self, graph: "ProvenanceGraph"):
        """
        Args:
            graph: The provenance graph this bundle wraps.
        """
        self.graph = graph
        # Bundle creation time as a timezone-aware UTC ISO-8601 string.
        self.created_at = datetime.now(timezone.utc).isoformat()

    @staticmethod
    def _iso(ts: Optional[float]) -> Optional[str]:
        """Convert an epoch timestamp to a UTC ISO-8601 string (falsy -> None).

        UTC is used explicitly so exported timestamps are consistent with
        ``created_at`` and do not depend on the host machine's local timezone.
        """
        return datetime.fromtimestamp(ts, tz=timezone.utc).isoformat() if ts else None

    def to_prov_o_jsonld(self) -> Dict[str, Any]:
        """Export W3C PROV-O JSON-LD (interoperable standard)."""
        return self.graph.to_prov_jsonld()

    def to_prov_n(self) -> str:
        """Export W3C PROV-N notation (human readable)."""
        return self.graph.to_prov_n()

    def to_activity_log(self) -> List[Dict[str, Any]]:
        """Export activity log for audit (one dict per activity, JSONL-ready).

        Returns:
            List of plain dicts describing each activity: identity, type,
            UTC start/end timestamps, duration, and its I/O and parameters.
        """
        activities = []
        for activity in self.graph.list_activities():
            activities.append({
                "id": activity.id,
                "name": activity.name,
                "type": activity.activity_type.value,
                "started_at": self._iso(activity.started_at),
                "ended_at": self._iso(activity.ended_at),
                "duration_seconds": activity.duration,
                "inputs": activity.inputs,
                "outputs": activity.outputs,
                "parameters": activity.parameters,
                "attributes": activity.attributes,
            })
        return activities

    def to_agent_attributions(self) -> Dict[str, Any]:
        """Export agent attributions for accountability.

        Returns:
            Dict with the agent registry, an attribution matrix linking each
            activity to the agent that performed it (derived from
            ``wasAssociatedWith`` relationships), and summary counts.
        """
        agents = {}
        for agent in self.graph.list_agents():
            agents[agent.id] = {
                "name": agent.name,
                "type": agent.agent_type.value,
                "version": agent.version,
                "identifier": agent.identifier,
                "attributes": agent.attributes,
            }

        # Build attribution matrix: which agent did what.
        attributions = []
        for rel in self.graph.list_relationships():
            if rel.relation_type.value == "wasAssociatedWith":
                activity = self.graph.get_activity(rel.source_id)
                agent = self.graph.get_agent(rel.target_id)
                # Skip dangling relationships whose endpoints are missing.
                if activity and agent:
                    attributions.append({
                        "activity_id": activity.id,
                        "activity_name": activity.name,
                        "agent_id": agent.id,
                        "agent_name": agent.name,
                        "timestamp": self._iso(activity.started_at),
                    })

        return {
            "agents": agents,
            "attributions": attributions,
            "total_agents": len(agents),
            "total_attributions": len(attributions),
        }

    def to_integrity_manifest(self) -> Dict[str, Any]:
        """Export integrity manifest for verification.

        Returns:
            Dict containing the graph root hash, validity flag, any invalid
            entity IDs, and per-entity content/schema hashes for re-checking.
        """
        is_valid, invalid_ids = self.graph.verify_integrity()

        return {
            "root_hash": self.graph.root_hash,
            "created_at": self.created_at,
            "is_valid": is_valid,
            "invalid_entity_ids": invalid_ids,
            "entity_hashes": {
                entity.id: {
                    "content_hash": entity.content_hash,
                    "schema_hash": entity.schema_hash,
                }
                for entity in self.graph.list_entities()
            },
            "verification_note": (
                "To verify: recompute content hashes and compare against this manifest. "
                "Any mismatch indicates data tampering."
            ),
        }

    def export(self, output_dir: str):
        """Export all accountability artifacts to a directory.

        Writes the seven bundle files (CASCADE JSON, PROV-O JSON-LD, PROV-N,
        activity JSONL, agents JSON, integrity manifest, Croissant metadata).
        The directory is created if it does not exist.

        Args:
            output_dir: Target directory for the artifact files.
        """
        import os
        os.makedirs(output_dir, exist_ok=True)

        # 1. CASCADE provenance JSON
        with open(os.path.join(output_dir, "cascade_provenance.json"), "w") as f:
            json.dump(self.graph.to_dict(), f, indent=2, default=str)

        # 2. W3C PROV-O JSON-LD
        with open(os.path.join(output_dir, "prov_o.jsonld"), "w") as f:
            json.dump(self.to_prov_o_jsonld(), f, indent=2, default=str)

        # 3. W3C PROV-N notation
        with open(os.path.join(output_dir, "prov_n.txt"), "w") as f:
            f.write(self.to_prov_n())

        # 4. Activity log (one JSON object per line for easy grep/audit)
        with open(os.path.join(output_dir, "activities.jsonl"), "w") as f:
            for activity in self.to_activity_log():
                f.write(json.dumps(activity, default=str) + "\n")

        # 5. Agent attributions
        with open(os.path.join(output_dir, "agents.json"), "w") as f:
            json.dump(self.to_agent_attributions(), f, indent=2, default=str)

        # 6. Integrity manifest
        with open(os.path.join(output_dir, "integrity_manifest.json"), "w") as f:
            json.dump(self.to_integrity_manifest(), f, indent=2, default=str)

        # 7. Croissant metadata (MLCommons); "local://" marks an offline export.
        exporter = CroissantExporter(self.graph)
        croissant_content = exporter.to_json(name="dataset", url="local://")
        with open(os.path.join(output_dir, "croissant.json"), "w") as f:
            f.write(croissant_content)

    def summary(self) -> Dict[str, Any]:
        """Return a summary of the accountability bundle (counts + file list)."""
        stats = self.graph.stats
        return {
            "bundle_created_at": self.created_at,
            "graph_name": self.graph.name,
            "root_hash": self.graph.root_hash,
            "entities": stats["entities"],
            "activities": stats["activities"],
            "agents": stats["agents"],
            "relationships": stats["relationships"],
            "files_included": [
                "cascade_provenance.json",
                "prov_o.jsonld",
                "prov_n.txt",
                "activities.jsonl",
                "agents.json",
                "integrity_manifest.json",
                "croissant.json",
            ],
        }


class HubIntegration:
    """
    Integration with HuggingFace Hub for dataset provenance.

    Stores complete accountability bundle:
    1. cascade_provenance.json - CASCADE native format
    2. prov_o.jsonld - W3C PROV-O JSON-LD (interoperable)
    3. prov_n.txt - W3C PROV-N notation (human readable)
    4. activities.jsonl - Activity log for audit
    5. agents.json - Agent attributions
    6. integrity_manifest.json - Hash verification
    7. croissant.json - MLCommons Croissant
    8. README.md - Human-readable provenance section
    """

    # Canonical filenames used inside the Hub dataset repository.
    PROVENANCE_FILENAME = "cascade_provenance.json"
    PROV_O_FILENAME = "prov_o.jsonld"
    PROV_N_FILENAME = "prov_n.txt"
    ACTIVITIES_FILENAME = "activities.jsonl"
    AGENTS_FILENAME = "agents.json"
    INTEGRITY_FILENAME = "integrity_manifest.json"
    CROISSANT_FILENAME = "croissant.json"

    def __init__(self, token: Optional[str] = None):
        """
        Initialize Hub integration.

        Args:
            token: HuggingFace API token (optional, uses cached token if not provided)
        """
        self.token = token

    def push_provenance(
        self,
        graph: "ProvenanceGraph",
        repo_id: str,
        commit_message: str = "Update provenance",
        private: bool = False,
        include_croissant: bool = True,
        full_accountability: bool = True,
    ) -> str:
        """
        Push complete accountability bundle to HuggingFace Hub.

        Creates the dataset repo if needed, then uploads all artifacts in a
        single commit so the bundle is always internally consistent.

        Args:
            graph: The provenance graph to push
            repo_id: HuggingFace repo ID (e.g., "username/dataset-name")
            commit_message: Commit message
            private: Whether the repo should be private
            include_croissant: Whether to include Croissant JSON-LD
            full_accountability: Whether to include full W3C PROV-O bundle

        Returns:
            URL of the pushed provenance
        """
        # Imported lazily so huggingface_hub is only required when pushing.
        from huggingface_hub import HfApi, CommitOperationAdd

        api = HfApi(token=self.token)

        # Ensure repo exists (no-op if it already does).
        api.create_repo(
            repo_id=repo_id,
            repo_type="dataset",
            private=private,
            exist_ok=True,
        )

        operations = []
        bundle = AccountabilityBundle(graph)

        # 1. CASCADE provenance JSON (native format)
        provenance_content = json.dumps(graph.to_dict(), indent=2, default=str)
        operations.append(CommitOperationAdd(
            path_in_repo=self.PROVENANCE_FILENAME,
            path_or_fileobj=provenance_content.encode("utf-8"),
        ))

        if full_accountability:
            # 2. W3C PROV-O JSON-LD (interoperable standard)
            prov_o_content = json.dumps(bundle.to_prov_o_jsonld(), indent=2, default=str)
            operations.append(CommitOperationAdd(
                path_in_repo=self.PROV_O_FILENAME,
                path_or_fileobj=prov_o_content.encode("utf-8"),
            ))

            # 3. W3C PROV-N notation (human readable)
            prov_n_content = bundle.to_prov_n()
            operations.append(CommitOperationAdd(
                path_in_repo=self.PROV_N_FILENAME,
                path_or_fileobj=prov_n_content.encode("utf-8"),
            ))

            # 4. Activity log (JSONL for easy grep/audit)
            activities = bundle.to_activity_log()
            activities_content = "\n".join(json.dumps(a, default=str) for a in activities)
            operations.append(CommitOperationAdd(
                path_in_repo=self.ACTIVITIES_FILENAME,
                path_or_fileobj=activities_content.encode("utf-8"),
            ))

            # 5. Agent attributions
            agents_content = json.dumps(bundle.to_agent_attributions(), indent=2, default=str)
            operations.append(CommitOperationAdd(
                path_in_repo=self.AGENTS_FILENAME,
                path_or_fileobj=agents_content.encode("utf-8"),
            ))

            # 6. Integrity manifest (for verification)
            integrity_content = json.dumps(bundle.to_integrity_manifest(), indent=2, default=str)
            operations.append(CommitOperationAdd(
                path_in_repo=self.INTEGRITY_FILENAME,
                path_or_fileobj=integrity_content.encode("utf-8"),
            ))

        # 7. Croissant JSON-LD (MLCommons standard)
        if include_croissant:
            exporter = CroissantExporter(graph)
            croissant_content = exporter.to_json(
                name=repo_id.split("/")[-1],
                url=f"https://huggingface.co/datasets/{repo_id}",
            )
            operations.append(CommitOperationAdd(
                path_in_repo=self.CROISSANT_FILENAME,
                path_or_fileobj=croissant_content.encode("utf-8"),
            ))

        # Commit all accountability artifacts atomically.
        api.create_commit(
            repo_id=repo_id,
            repo_type="dataset",
            operations=operations,
            commit_message=commit_message,
        )

        return f"https://huggingface.co/datasets/{repo_id}"

    def pull_provenance(self, repo_id: str) -> "Optional[ProvenanceGraph]":
        """
        Pull provenance from HuggingFace Hub.

        Best-effort: any failure (missing file, network error, auth) is
        reported and results in ``None`` rather than an exception.

        Args:
            repo_id: HuggingFace repo ID

        Returns:
            ProvenanceGraph if found, None otherwise
        """
        from huggingface_hub import hf_hub_download

        try:
            # Download provenance file
            local_path = hf_hub_download(
                repo_id=repo_id,
                filename=self.PROVENANCE_FILENAME,
                repo_type="dataset",
                token=self.token,
            )

            with open(local_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            return ProvenanceGraph.from_dict(data)

        except Exception as e:
            print(f"Could not pull provenance from {repo_id}: {e}")
            return None

    def get_dataset_provenance_url(self, repo_id: str) -> str:
        """Get URL to provenance file in Hub."""
        return f"https://huggingface.co/datasets/{repo_id}/blob/main/{self.PROVENANCE_FILENAME}"

    def update_dataset_card(
        self,
        repo_id: str,
        graph: "ProvenanceGraph",
    ) -> str:
        """
        Update dataset card with provenance summary.

        Adds/updates a marker-delimited provenance section in README.md:
        - Lineage information
        - Root hash
        - Entity/activity counts

        Args:
            repo_id: HuggingFace repo ID
            graph: Provenance graph

        Returns:
            URL of the updated dataset
        """
        from huggingface_hub import HfApi, hf_hub_download

        api = HfApi(token=self.token)

        # Build provenance section for README
        provenance_section = self._build_readme_section(graph)

        # Get current README; fall back to a minimal one if it can't be fetched.
        try:
            readme_path = hf_hub_download(
                repo_id=repo_id,
                filename="README.md",
                repo_type="dataset",
                token=self.token,
            )
            with open(readme_path, "r", encoding="utf-8") as f:
                current_readme = f.read()
        except Exception:
            # Narrowed from a bare `except:` so Ctrl-C / SystemExit still propagate.
            current_readme = f"# {repo_id.split('/')[-1]}\n\n"

        # Update or append the marker-delimited provenance section.
        marker_start = "<!-- CASCADE_PROVENANCE_START -->"
        marker_end = "<!-- CASCADE_PROVENANCE_END -->"

        if marker_start in current_readme:
            # Replace existing section in place (markers included).
            import re
            pattern = re.escape(marker_start) + r".*?" + re.escape(marker_end)
            new_readme = re.sub(
                pattern,
                f"{marker_start}\n{provenance_section}\n{marker_end}",
                current_readme,
                flags=re.DOTALL,
            )
        else:
            # Append section at the end of the README.
            new_readme = current_readme.rstrip() + f"\n\n{marker_start}\n{provenance_section}\n{marker_end}\n"

        # Push updated README
        api.upload_file(
            path_or_fileobj=new_readme.encode("utf-8"),
            path_in_repo="README.md",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Update provenance in README",
        )

        return f"https://huggingface.co/datasets/{repo_id}"

    def _build_readme_section(self, graph: "ProvenanceGraph") -> str:
        """Build the Markdown provenance section for the README.

        Summarizes integrity stats plus up to five entities, activities, and
        agents, then lists the accountability bundle files.
        """
        stats = graph.stats

        lines = [
            "## 🔗 Provenance & Accountability",
            "",
            "This dataset has CASCADE provenance tracking enabled with full W3C PROV-O compliance.",
            "",
            "### Integrity",
            "",
            f"| Metric | Value |",
            f"|--------|-------|",
            # NOTE(review): assumes root_hash is a non-None string — confirm
            # graphs always carry a root hash before the card is updated.
            f"| Root Hash | `{graph.root_hash[:16]}...` |",
            f"| Entities | {stats['entities']} |",
            f"| Activities | {stats['activities']} |",
            f"| Agents | {stats['agents']} |",
            f"| Relationships | {stats['relationships']} |",
            "",
        ]

        # Add lineage summary
        entities = graph.list_entities()
        if entities:
            lines.append("### Lineage")
            lines.append("")
            for entity in entities[:5]:  # Show first 5
                upstream = graph.get_lineage(entity.id, "upstream")
                if upstream:
                    lines.append(f"- **{entity.name}** derived from: {', '.join(upstream[:3])}")
                else:
                    lines.append(f"- **{entity.name}** (source)")
            if len(entities) > 5:
                lines.append(f"- ... and {len(entities) - 5} more entities")
            lines.append("")

        # Add activities summary
        activities = graph.list_activities()
        if activities:
            lines.append("### Activities")
            lines.append("")
            for activity in activities[:5]:
                duration = f" ({activity.duration:.2f}s)" if activity.duration else ""
                lines.append(f"- **{activity.name}** [{activity.activity_type.value}]{duration}")
            if len(activities) > 5:
                lines.append(f"- ... and {len(activities) - 5} more activities")
            lines.append("")

        # Add agents summary
        agents = graph.list_agents()
        if agents:
            lines.append("### Agents (Accountability)")
            lines.append("")
            for agent in agents[:5]:
                lines.append(f"- **{agent.name}** [{agent.agent_type.value}]")
            if len(agents) > 5:
                lines.append(f"- ... and {len(agents) - 5} more agents")
            lines.append("")

        # Accountability bundle files
        lines.extend([
            "### Accountability Bundle",
            "",
            "| File | Standard | Description |",
            "|------|----------|-------------|",
            f"| [{self.PROVENANCE_FILENAME}]({self.PROVENANCE_FILENAME}) | CASCADE | Native provenance format |",
            f"| [{self.PROV_O_FILENAME}]({self.PROV_O_FILENAME}) | W3C PROV-O | Interoperable JSON-LD |",
            f"| [{self.PROV_N_FILENAME}]({self.PROV_N_FILENAME}) | W3C PROV-N | Human-readable notation |",
            f"| [{self.ACTIVITIES_FILENAME}]({self.ACTIVITIES_FILENAME}) | JSONL | Activity audit log |",
            f"| [{self.AGENTS_FILENAME}]({self.AGENTS_FILENAME}) | JSON | Agent attributions |",
            f"| [{self.INTEGRITY_FILENAME}]({self.INTEGRITY_FILENAME}) | JSON | Hash verification manifest |",
            f"| [{self.CROISSANT_FILENAME}]({self.CROISSANT_FILENAME}) | MLCommons | Croissant metadata |",
            "",
        ])

        return "\n".join(lines)


def push_to_hub(
    graph: "ProvenanceGraph",
    repo_id: str,
    token: Optional[str] = None,
    private: bool = False,
) -> str:
    """
    Convenience function to push provenance to Hub.

    Thin wrapper around :meth:`HubIntegration.push_provenance` using its
    default options (full accountability bundle + Croissant metadata).

    Args:
        graph: Provenance graph to push
        repo_id: HuggingFace repo ID
        token: HF token (optional; falls back to the cached token)
        private: Whether repo should be private

    Returns:
        URL of the pushed provenance
    """
    hub = HubIntegration(token=token)
    return hub.push_provenance(graph, repo_id, private=private)


def pull_from_hub(repo_id: str, token: Optional[str] = None) -> "Optional[ProvenanceGraph]":
    """
    Convenience function to pull provenance from Hub.

    Thin wrapper around :meth:`HubIntegration.pull_provenance`; best-effort,
    so failures yield ``None`` rather than raising.

    Args:
        repo_id: HuggingFace repo ID
        token: HF token (optional; falls back to the cached token)

    Returns:
        ProvenanceGraph if found, None otherwise
    """
    hub = HubIntegration(token=token)
    return hub.pull_provenance(repo_id)