Spaces:

HemanM
/

liveEvolutionEVO

Sleeping

App Files Files Community

HemanM committed on Aug 12

Commit

bf6c353

verified ·

1 Parent(s): 30b1fbb

Update data_utils.py

Browse files

Files changed (1) hide show

data_utils.py +82 -35

data_utils.py CHANGED Viewed

@@ -1,72 +1,119 @@
 # data_utils.py
 import numpy as np
 from datasets import load_dataset
-# --------- Hashing vectorizer (no sklearn) ----------
-def hash_vectorize(texts, n_features=4096, seed=1234):
-    rng = np.random.RandomState(seed)
-    # Simple 2-gram hashing
-    feats = np.zeros((len(texts), n_features), dtype=np.float32)
     for i, t in enumerate(texts):
-        t = (t or "").lower()
-        tokens = t.split()
-        for j, tok in enumerate(tokens):
-            h1 = (hash(tok) % n_features)
-            feats[i, h1] += 1.0
-            if j+1 < len(tokens):
-                bg = tok + "_" + tokens[j+1]
-                h2 = (hash(bg) % n_features)
-                feats[i, h2] += 1.0
         # L2 norm
-        nrm = np.linalg.norm(feats[i]) + 1e-8
-        feats[i] /= nrm
-    return feats
-# --------- PIQA loader (tiny subsets) ----------
-def load_piqa(subset=800, seed=42):
     ds = load_dataset("piqa")
     tr = ds["train"]
     va = ds["validation"]
     rng = np.random.RandomState(seed)
     idx_tr = rng.choice(len(tr), size=min(subset, len(tr)), replace=False)
-    idx_va = rng.choice(len(va), size=min(max(subset//4, 200), len(va)), replace=False)
     def pack(rows, idxs):
         X_text, y = [], []
         for k in idxs:
             p = rows[k]
-            stem = p["goal"] or ""
-            a, b = p["sol1"] or "", p["sol2"] or ""
-            # Make two rows (stem+opt, label is which is correct)
-            X_text += [stem + " " + a, stem + " " + b]
-            y += [1 if p["label"]==0 else 0, 1 if p["label"]==1 else 0]
         return X_text, np.array(y, dtype=np.int64)
     Xtr_txt, ytr = pack(tr, idx_tr)
     Xva_txt, yva = pack(va, idx_va)
     return Xtr_txt, ytr, Xva_txt, yva
-# --------- HellaSwag loader (tiny subsets) ----------
-def load_hellaswag(subset=800, seed=42):
     ds = load_dataset("hellaswag")
     tr = ds["train"]
     va = ds["validation"]
     rng = np.random.RandomState(seed)
     idx_tr = rng.choice(len(tr), size=min(subset, len(tr)), replace=False)
-    idx_va = rng.choice(len(va), size=min(max(subset//4, 200), len(va)), replace=False)
     def pack(rows, idxs):
         X_text, y = [], []
         for k in idxs:
             p = rows[k]
-            ctx = (p["ctx"] or "") + " " + (p["ctx_a"] or "")
-            label = int(p["label"])
-            # expand to 4 candidates; supervise one-vs-all over 4 rows
-            endings = p["endings"]
             for i, e in enumerate(endings):
-                X_text.append(ctx + " " + e)
-                y.append(1 if i==label else 0)
         return X_text, np.array(y, dtype=np.int64)
     Xtr_txt, ytr = pack(tr, idx_tr)

 # data_utils.py
+# Lightweight dataset loaders + simple hashing vectorizer (no sklearn)
+# Works on CPU-only Spaces and avoids heavy tokenizers.
+from typing import List, Tuple
 import numpy as np
 from datasets import load_dataset
+# -----------------------------
+# Hashing vectorizer (unigram + bigram)
+# -----------------------------
def hash_vectorize(texts: List[str], n_features: int = 4096, seed: int = 1234) -> np.ndarray:
    """
    Very fast, tokenizer-free hashing vectorizer (unigrams + bigrams).

    For each text:
    - Lowercases and splits on whitespace
    - Hashes every unigram and every adjacent bigram ("left_right") into one
      of ``n_features`` buckets and counts occurrences
    - L2-normalizes the resulting row

    Bucketing uses CRC32 rather than the builtin ``hash()``: ``hash()`` on
    ``str`` is salted per interpreter process, so the same token would land in
    a different column on every run and vectors could not be reused across
    processes.

    Args:
      texts: Input strings. ``None`` entries are skipped and yield an
        all-zero row.
      n_features: Number of hash buckets (columns of the output).
      seed: Accepted for backward compatibility; not used, because the
        hashing is fully deterministic.

    Returns:
      float32 array of shape ``(len(texts), n_features)``.
    """
    import zlib  # local import keeps this function self-contained

    def bucket(term: str) -> int:
        # "surrogatepass" lets lone surrogates from badly decoded text through
        # instead of raising UnicodeEncodeError.
        return zlib.crc32(term.encode("utf-8", "surrogatepass")) % n_features

    X = np.zeros((len(texts), n_features), dtype=np.float32)
    for i, t in enumerate(texts):
        if t is None:
            continue
        toks = t.lower().split()
        for tok in toks:
            X[i, bucket(tok)] += 1.0
        # Adjacent pairs form the bigrams; zip stops one short of the end.
        for left, right in zip(toks, toks[1:]):
            X[i, bucket(left + "_" + right)] += 1.0
        # L2 norm; the epsilon keeps empty rows at zero instead of NaN.
        norm = float(np.linalg.norm(X[i])) + 1e-8
        X[i] /= norm
    return X
+# -----------------------------
+# PIQA tiny subset loader
+# Produces pair-expanded binary rows for a quick proxy classifier.
+# -----------------------------
def load_piqa(subset: int = 800, seed: int = 42) -> Tuple[list, np.ndarray, list, np.ndarray]:
    """
    Load a small random subset of PIQA as pair-expanded binary rows.

    Args:
      subset: Number of training examples to sample (capped at the split
        size). The validation sample has ``max(subset // 4, 200)`` examples,
        also capped at the split size.
      seed: Seed for the ``RandomState`` that draws both samples.

    Returns:
      Xtr_txt, ytr, Xva_txt, yva
    Where:
      - For each original PIQA example, we emit TWO rows:
        [goal + sol1] with label 1 if sol1 is correct else 0
        [goal + sol2] with label 1 if sol2 is correct else 0
      - ``y*`` are int64 arrays aligned with the text lists.
    """
    ds = load_dataset("piqa")
    tr = ds["train"]
    va = ds["validation"]
    rng = np.random.RandomState(seed)
    # Draw order (train first, then validation) determines the samples.
    idx_tr = rng.choice(len(tr), size=min(subset, len(tr)), replace=False)
    idx_va = rng.choice(len(va), size=min(max(subset // 4, 200), len(va)), replace=False)

    def pack(rows, idxs):
        X_text, y = [], []
        for k in idxs:
            # rng.choice yields numpy integers; cast to a plain int so row
            # lookup does not depend on the dataset accepting NumPy scalars.
            p = rows[int(k)]
            stem = (p.get("goal") or "").strip()
            sol1 = (p.get("sol1") or "").strip()
            sol2 = (p.get("sol2") or "").strip()
            label = int(p.get("label", 0))
            # label == 0 marks sol1 as the correct solution, 1 marks sol2.
            X_text.append(f"{stem} {sol1}")
            y.append(1 if label == 0 else 0)
            X_text.append(f"{stem} {sol2}")
            y.append(1 if label == 1 else 0)
        return X_text, np.array(y, dtype=np.int64)

    Xtr_txt, ytr = pack(tr, idx_tr)
    Xva_txt, yva = pack(va, idx_va)
    return Xtr_txt, ytr, Xva_txt, yva
+# -----------------------------
+# HellaSwag tiny subset loader
+# Expands each example into 4 rows (one-vs-all), later regrouped into argmax.
+# -----------------------------
+def load_hellaswag(subset: int = 800, seed: int = 42) -> Tuple[list, np.ndarray, list, np.ndarray]:
+    """
+    Returns:
+      Xtr_txt, ytr, Xva_txt, yva
+    Where:
+      - For each original example, we emit FOUR rows:
+        [context + ending_i] with label 1 if i is correct else 0
+    """
     ds = load_dataset("hellaswag")
     tr = ds["train"]
     va = ds["validation"]
     rng = np.random.RandomState(seed)
     idx_tr = rng.choice(len(tr), size=min(subset, len(tr)), replace=False)
+    idx_va = rng.choice(len(va), size=min(max(subset // 4, 200), len(va)), replace=False)
     def pack(rows, idxs):
         X_text, y = [], []
         for k in idxs:
             p = rows[k]
+            # Some variants have keys like 'ctx' + 'ctx_a'; fall back defensively.
+            ctx = f"{(p.get('ctx') or '')} {(p.get('ctx_a') or '')}".strip()
+            endings = p.get("endings") or []
+            label = int(p.get("label", 0))
             for i, e in enumerate(endings):
+                X_text.append(f"{ctx} {e}".strip())
+                y.append(1 if i == label else 0)
         return X_text, np.array(y, dtype=np.int64)
     Xtr_txt, ytr = pack(tr, idx_tr)