Optitransfer committed on
Commit
5781c22
·
verified ·
1 Parent(s): 72e48f7

Upload crdt_merge/datasets_ext.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. crdt_merge/datasets_ext.py +87 -0
crdt_merge/datasets_ext.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ HuggingFace Datasets integration for crdt-merge.
3
+
4
+ Merge two HF datasets directly by name or Dataset objects.
5
+ """
6
+
7
+ from __future__ import annotations
8
+ from typing import Any, Dict, List, Optional
9
+
10
+
11
def merge_datasets(
    dataset_a: Any,
    dataset_b: Any,
    key: Optional[str] = None,
    timestamp_col: Optional[str] = None,
    prefer: str = "latest",
    dedup: bool = True,
) -> Any:
    """
    Merge two HuggingFace Dataset objects using CRDT semantics.

    Both inputs are converted to pandas DataFrames, merged by the
    ``merge`` helper from ``.dataframe``, and the merged frame is
    converted back into a HuggingFace ``Dataset``.

    Args:
        dataset_a: HF Dataset object or dataset name (str). A string is
            loaded with ``load_dataset(name, split="train")``.
        dataset_b: HF Dataset object or dataset name (str). A string is
            loaded with ``load_dataset(name, split="train")``.
        key: Column to match rows on (forwarded to the merge helper).
        timestamp_col: Column with timestamps for LWW (forwarded to the
            merge helper).
        prefer: "latest", "a", or "b" (forwarded to the merge helper).
        dedup: Remove exact duplicates (forwarded to the merge helper).

    Returns:
        Merged HF Dataset. An anonymous (unnamed) DataFrame index is not
        carried over as a column; a named index is kept, as before.
    """
    from datasets import Dataset, load_dataset

    # Load if string names provided
    if isinstance(dataset_a, str):
        dataset_a = load_dataset(dataset_a, split="train")
    if isinstance(dataset_b, str):
        dataset_b = load_dataset(dataset_b, split="train")

    # Convert to pandas, merge, convert back
    from .dataframe import merge as df_merge

    df_a = dataset_a.to_pandas()
    df_b = dataset_b.to_pandas()

    merged_df = df_merge(
        df_a, df_b,
        key=key,
        timestamp_col=timestamp_col,
        prefer=prefer,
        dedup=dedup,
    )

    # With the default ``preserve_index=None`` any index that is not a plain
    # RangeIndex is written into the dataset as a column; for an unnamed
    # index that column is the spurious ``__index_level_0__``.  If the merge
    # helper returns such a frame (e.g. after concatenating or filtering
    # rows), drop the anonymous positional index.  A named index may carry
    # real data, so the library default is kept for it.
    has_named_index = any(name is not None for name in merged_df.index.names)
    return Dataset.from_pandas(
        merged_df,
        preserve_index=None if has_named_index else False,
    )
56
+
57
+
58
def dedup_dataset(
    dataset: Any,
    columns: Optional[List[str]] = None,
    method: str = "exact",
    threshold: float = 0.85,
) -> Any:
    """
    Deduplicate a HuggingFace Dataset.

    The dataset rows are materialised as plain dicts, passed to
    ``dedup_records`` from ``.dedup``, and the surviving rows are rebuilt
    into a new ``Dataset`` via ``Dataset.from_list``.

    Args:
        dataset: HF Dataset object or name. A string is loaded with
            ``load_dataset(name, split="train")``.
        columns: Columns to compare (None = all)
        method: "exact" or "fuzzy"
        threshold: Fuzzy similarity threshold

    Returns:
        Deduplicated Dataset; a one-line summary of the removal stats is
        stored in ``info.description``.
    """
    from datasets import Dataset, load_dataset
    from .dedup import dedup_records

    # Resolve a dataset name to its "train" split; pass objects through.
    source = (
        load_dataset(dataset, split="train")
        if isinstance(dataset, str)
        else dataset
    )

    # Copy every row into an independent dict before handing it over.
    rows = []
    for row in source:
        rows.append(dict(row))

    kept, removed = dedup_records(
        rows,
        columns=columns,
        method=method,
        threshold=threshold,
    )

    deduped = Dataset.from_list(kept)
    summary = f"Deduplicated: {removed} duplicates removed from {len(rows)} rows"
    deduped.info.description = summary
    return deduped