dleemiller committed on
Commit
763dd75
·
verified ·
1 Parent(s): 3b6d8b5

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -44,20 +44,19 @@ model.eval()
44
 
45
  # Example: Predict word from swipe path
46
  from datasets import load_dataset
47
- from swipealot.data.dataset import normalize_coordinates, sample_path_points
48
 
49
  # Load sample
50
  dataset = load_dataset("futo-org/swipe.futo.org", split="test[:1]")
51
  item = dataset[0]
52
 
53
- # Preprocess path
54
  # 1. Normalize timestamps (x,y already normalized in futo dataset)
55
- normalized = normalize_coordinates(item["data"], item["canvas_width"], item["canvas_height"])
56
 
57
  # 2. Resample to fixed length (max_path_len=128)
58
  # - Pads with zeros if path < 128 points
59
  # - Interpolates if path > 128 points
60
- path_coords, _ = sample_path_points(normalized, processor.max_path_len)
61
  path = torch.tensor([path_coords], dtype=torch.float32)
62
 
63
  # Get predictions
@@ -196,3 +195,4 @@ outputs.last_hidden_state # [batch, seq_len, d_model] - Hidden representations
196
  ## License
197
 
198
  Apache 2.0
 
 
44
 
45
  # Example: Predict word from swipe path
46
  from datasets import load_dataset
 
47
 
48
  # Load sample
49
  dataset = load_dataset("futo-org/swipe.futo.org", split="test[:1]")
50
  item = dataset[0]
51
 
52
+ # Preprocess swipe path using processor methods
53
  # 1. Normalize timestamps (x,y already normalized in futo dataset)
54
+ normalized = processor.normalize_coordinates(item["data"], item["canvas_width"], item["canvas_height"])
55
 
56
  # 2. Resample to fixed length (max_path_len=128)
57
  # - Pads with zeros if path < 128 points
58
  # - Interpolates if path > 128 points
59
+ path_coords, _ = processor.sample_path_points(normalized, processor.max_path_len)
60
  path = torch.tensor([path_coords], dtype=torch.float32)
61
 
62
  # Get predictions
 
195
  ## License
196
 
197
  Apache 2.0
198
+
__pycache__/preprocessing.cpython-312.pyc ADDED
Binary file (4.58 kB). View file
 
conversion_metadata.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "original_checkpoint": "checkpoints/base_20251213_164813/best.pt",
3
  "original_config": "embedded_in_checkpoint",
4
- "converted_at": "2025-12-15 08:03:26.074041",
5
  "model_type": "base",
6
  "vocab_size": 43,
7
  "epoch": 38,
 
1
  {
2
  "original_checkpoint": "checkpoints/base_20251213_164813/best.pt",
3
  "original_config": "embedded_in_checkpoint",
4
+ "converted_at": "2025-12-15 08:28:11.703039",
5
  "model_type": "base",
6
  "vocab_size": 43,
7
  "epoch": 38,
preprocessing.py ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Standalone preprocessing utilities for swipe path data.
2
+
3
+ This module provides preprocessing functions for the SwipeALot model that are
4
+ completely standalone and don't require the full swipealot training package.
5
+ """
6
+
7
+ import numpy as np
8
+
9
+
10
def normalize_coordinates(
    data_points: list[dict], canvas_width: float, canvas_height: float
) -> list[dict]:
    """Clamp swipe coordinates to [0, 1] and min-max normalize timestamps.

    Args:
        data_points: Sequence of dicts, each with numeric 'x', 'y' and 't' keys.
        canvas_width: Unused; accepted only for signature compatibility.
        canvas_height: Unused; accepted only for signature compatibility.

    Returns:
        A new list of ``{"x", "y", "t"}`` dicts, one per input point and in the
        same order. ``x`` and ``y`` are clamped to [0, 1]; ``t`` is rescaled so
        the smallest timestamp maps to 0 and the largest to 1. An empty input
        yields an empty list. The input dicts are not modified.

    Raises:
        KeyError: If a point is missing one of the 'x', 'y' or 't' keys.

    Note:
        ``x`` and ``y`` are NOT divided by the canvas size: they are treated as
        already normalized and are only clamped to guard against values that
        fall slightly outside the unit square.
    """
    if not data_points:
        return []

    timestamps = [point["t"] for point in data_points]
    t_min = min(timestamps)
    t_max = max(timestamps)
    # When every timestamp is identical (including a single-point path) divide
    # by 1.0 so each normalized t becomes 0.0 instead of raising on a zero range.
    t_range = t_max - t_min if t_max > t_min else 1.0

    return [
        {
            "x": max(0.0, min(1.0, point["x"])),
            "y": max(0.0, min(1.0, point["y"])),
            "t": (point["t"] - t_min) / t_range,
        }
        for point in data_points
    ]
50
+
51
+
52
def sample_path_points(
    data_points: list[dict], max_len: int
) -> tuple[np.ndarray, np.ndarray]:
    """Resample or zero-pad a swipe path to exactly ``max_len`` points.

    Args:
        data_points: Sequence of dicts, each with numeric 'x', 'y' and 't' keys.
        max_len: Target number of points in the output.

    Returns:
        Tuple ``(coords, mask)`` where:
            - coords: float32 array of shape ``[max_len, 3]`` holding (x, y, t)
              per row. The shape is ``[max_len, 3]`` even when ``max_len`` is 0.
            - mask: int64 array of shape ``[max_len]``; 1 marks a real or
              interpolated point, 0 marks zero padding.

    Raises:
        KeyError: If a point is missing one of the 'x', 'y' or 't' keys.
        ValueError: If ``max_len`` is negative.

    Note:
        - Fewer points than ``max_len``: the path is kept and zero-padded at the end.
        - More points than ``max_len``: each of x, y, t is linearly interpolated
          over the point index at ``max_len`` evenly spaced positions, so the
          first and last points are preserved when ``max_len`` >= 2.
        - Exactly ``max_len`` points: the path is returned unchanged.
    """
    num_points = len(data_points)

    # Build the (N, 3) array once in float64 so interpolation runs at full
    # precision; reshape keeps three columns even when there are no points.
    raw = np.array(
        [[p["x"], p["y"], p["t"]] for p in data_points], dtype=np.float64
    ).reshape(-1, 3)

    if num_points > max_len:
        # Downsample: use the point index as the interpolation parameter.
        source_idx = np.arange(num_points)
        target_idx = np.linspace(0, num_points - 1, max_len)
        sampled = np.column_stack(
            [np.interp(target_idx, source_idx, raw[:, col]) for col in range(3)]
        )
        mask = np.ones(max_len, dtype=np.int64)
    else:
        # Keep the path as-is and fill any remaining rows with zeros.
        sampled = np.zeros((max_len, 3), dtype=np.float64)
        sampled[:num_points] = raw
        mask = np.zeros(max_len, dtype=np.int64)
        mask[:num_points] = 1

    return sampled.astype(np.float32), mask