Danielfonseca1212 committed on
Commit
c287c5c
·
verified ·
1 Parent(s): b5798f0

Create elliptic.data.py

Browse files
Files changed (1) hide show
  1. elliptic.data.py +94 -0
elliptic.data.py ADDED
@@ -0,0 +1,94 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # elliptic_data.py — Loader do Elliptic Bitcoin Dataset via PyG
2
+ import torch
3
+ import numpy as np
4
+ import pandas as pd
5
+ from torch_geometric.datasets import EllipticBitcoinDataset
6
+ from torch_geometric.loader import NeighborLoader
7
+ from torch_geometric.transforms import NormalizeFeatures
8
+ import os
9
+
10
def carregar_elliptic(root='/tmp/elliptic', normalize=True):
    """Load the Elliptic Bitcoin dataset through PyTorch Geometric.

    Dataset figures as quoted by the original author (not verified by this
    function):
      - 203,769 nodes (Bitcoin transactions)
      - 234,355 edges (Bitcoin flows)
      - 166 features per node (94 local + 72 aggregated)
      - 2 classes: illicit (laundering) / licit
      - 49 timesteps (Jan 2017 - Sep 2018)
      - ~21% labeled, ~79% unknown

    Temporal split described by the original author (as in the paper):
      - Train: timesteps 1-34
      - Test:  timesteps 35-49

    Args:
        root: Directory handed to ``EllipticBitcoinDataset`` as its ``root``.
        normalize: When truthy, a ``NormalizeFeatures`` transform is attached
            to the dataset; otherwise no transform is used.

    Returns:
        A 2-tuple. On success ``(data, True)``, where ``data`` is element 0
        of the dataset. On any exception raised while building or indexing
        the dataset, ``(None, message)`` with ``message = str(exception)``.
        Callers must therefore check the first element for ``None``.
    """
    transform = None
    if normalize:
        transform = NormalizeFeatures()

    # Deliberately broad: every loading failure is reported to the caller as
    # a message instead of propagating.
    try:
        graph = EllipticBitcoinDataset(root=root, transform=transform)[0]
    except Exception as exc:
        return None, str(exc)
    return graph, True
34
+
35
+
36
def preparar_splits(data):
    """Attach labeled-only train/test masks to ``data`` and summarize them.

    Label encoding follows PyG's ``EllipticBitcoinDataset``:
    ``0`` = licit, ``1`` = illicit, ``2`` = unknown. The previous version of
    this function treated ``0`` as illicit, which swapped the licit/illicit
    counts and reported the licit share as the fraud rate.

    Args:
        data: Graph object exposing ``x``, ``edge_index``, ``y``,
            ``train_mask`` and ``test_mask``. The masks are combined with
            ``&``, so they are expected to be boolean tensors aligned with
            ``y``.

    Returns:
        ``(data, stats)``. ``data`` is the same object, mutated in place with
        two new attributes, ``train_mask_labeled`` and ``test_mask_labeled``
        (the input masks restricted to nodes whose label is not unknown).
        ``stats`` is a dict of plain ``int``/``float`` values with node, edge
        and feature counts, per-split class counts and per-split illicit
        rates. A rate is ``0.0`` when the split has no labeled nodes.
    """
    licit, illicit, unknown = 0, 1, 2

    # Restrict both splits to nodes that actually carry a label.
    labeled_mask = data.y != unknown
    train_mask = data.train_mask & labeled_mask
    test_mask = data.test_mask & labeled_mask

    def _resumir(mask):
        """Return (n_total, n_illicit, n_licit, illicit_rate) for one split."""
        y_split = data.y[mask]
        n_total = int(mask.sum())
        n_ilicito = int((y_split == illicit).sum())
        n_licito = int((y_split == licit).sum())
        # Guard the empty split: avoids a NaN rate leaking into the stats.
        taxa = n_ilicito / n_total if n_total else 0.0
        return n_total, n_ilicito, n_licito, taxa

    n_train, ilicito_train, licito_train, taxa_train = _resumir(train_mask)
    n_test, ilicito_test, licito_test, taxa_test = _resumir(test_mask)

    stats = {
        'n_nos': data.x.shape[0],
        'n_arestas': data.edge_index.shape[1],
        'n_features': data.x.shape[1],
        'n_rotulados': int(labeled_mask.sum()),
        'n_train': n_train,
        'n_test': n_test,
        'n_ilicito_train': ilicito_train,
        'n_licito_train': licito_train,
        'n_ilicito_test': ilicito_test,
        'n_licito_test': licito_test,
        'taxa_fraude_train': taxa_train,
        'taxa_fraude_test': taxa_test,
    }

    # Stored on the graph so the loader factory can pick them up later.
    data.train_mask_labeled = train_mask
    data.test_mask_labeled = test_mask

    return data, stats
73
+
74
+
75
def criar_loaders(data, num_neighbors=None, batch_size=512):
    """Build mini-batch ``NeighborLoader`` objects for the train and test splits.

    Neighbors are sampled per batch instead of using the full graph (the
    original author intended this for inductive GraphSAGE training).

    Args:
        data: Graph object that already carries ``train_mask_labeled`` and
            ``test_mask_labeled`` (set by ``preparar_splits``); they are
            passed to the loaders as ``input_nodes``.
        num_neighbors: Forwarded unchanged to ``NeighborLoader``. Defaults to
            ``[10, 5]`` when omitted; a fresh list is created per call so no
            mutable default is shared between calls.
        batch_size: Forwarded unchanged to ``NeighborLoader``.

    Returns:
        ``(train_loader, test_loader)``. The train loader shuffles its seed
        nodes; the test loader does not.

    Raises:
        AttributeError: If either labeled mask is missing from ``data``.
    """
    if num_neighbors is None:
        num_neighbors = [10, 5]

    # Fail early with an actionable message rather than deep inside a loader.
    ausentes = [
        nome
        for nome in ('train_mask_labeled', 'test_mask_labeled')
        if getattr(data, nome, None) is None
    ]
    if ausentes:
        raise AttributeError(
            "data is missing " + ", ".join(ausentes)
            + "; call preparar_splits(data) before criar_loaders(data)"
        )

    train_loader = NeighborLoader(
        data,
        num_neighbors=num_neighbors,
        batch_size=batch_size,
        input_nodes=data.train_mask_labeled,
        shuffle=True,
    )
    test_loader = NeighborLoader(
        data,
        num_neighbors=num_neighbors,
        batch_size=batch_size,
        input_nodes=data.test_mask_labeled,
        shuffle=False,
    )
    return train_loader, test_loader