File size: 1,860 Bytes
ccbe063
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env bash
# Minimal end-to-end demo on synthetic A3M + FI matrix.
# Produces:
#   demo_out/mosaic/   -- 12 mosaic subsets
#   demo_out/gradient/ -- 12 gradient subsets
# Strict mode: abort on command failure, on unset variables, and on a
# failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# Anchor all output next to this script rather than the caller's working
# directory: resolve the script's own directory to an absolute path first.
script_dir="$(dirname "${BASH_SOURCE[0]}")"
HERE="$(cd "${script_dir}" && pwd)"
OUT="${HERE}/demo_out"
mkdir -p "${OUT}"

# 1. Generate synthetic inputs (200 random "sequences", L=60, random FI).
#
# DEMO_OUT has to be exported BEFORE the interpreter is started: the embedded
# script reads its output directory from the environment, and the build steps
# further down read the very same files back from ${OUT}. Exporting it after
# the heredoc would leave the generator writing to a different,
# working-directory-relative location.
export DEMO_OUT="${OUT}"
python - <<'PY'
import os
import numpy as np
from pathlib import Path

# Strict lookup on purpose: a missing DEMO_OUT should fail loudly instead of
# silently writing somewhere the rest of the demo does not look.
OUT = Path(os.environ["DEMO_OUT"])
OUT.mkdir(parents=True, exist_ok=True)

# Fixed seed so repeated runs write identical inputs.
rng = np.random.default_rng(0)
N, L = 200, 60
alphabet = np.array(list("ACDEFGHIKLMNPQRSTVWY-"))
seqs = rng.choice(alphabet, size=(N, L))

a3m_path = OUT / "synthetic.a3m"
with open(a3m_path, "w") as f:
    # Leading "#<L>\t1" header line, then one ">tag" + sequence pair per row.
    f.write(f"#{L}\t1\n")
    for i, row in enumerate(seqs):
        # The first record is tagged "query"; the rest are numbered.
        tag = "query" if i == 0 else f"seq{i:04d}"
        f.write(f">{tag}\n{''.join(row)}\n")

# Synthetic FI matrix: random but with a few high-variance columns.
fi = rng.normal(loc=0.0, scale=0.3, size=(N, L)).astype(np.float64)
# Pick L // 5 distinct columns and add wider noise to them.
hv_cols = rng.choice(L, size=L // 5, replace=False)
fi[:, hv_cols] += rng.normal(loc=0.0, scale=1.2, size=(N, len(hv_cols)))
np.save(OUT / "synthetic_fi.npy", fi)

print(f"wrote {a3m_path}")
print(f"wrote {OUT/'synthetic_fi.npy'}  shape={fi.shape}")
PY

# 2-3. Build the mosaic subsets, then the gradient subsets.
# Both invocations share the same inputs, subset count, subset size and seed;
# only the method name and the matching output directory differ.
input_args=(
    --a3m "${OUT}/synthetic.a3m"
    --fi "${OUT}/synthetic_fi.npy"
)
for method in mosaic gradient; do
    sf-cluster build \
        "${input_args[@]}" \
        --method "${method}" \
        --n-subsets 12 \
        --subset-size 32 \
        --seed 20260422 \
        --out "${OUT}/${method}"
done

# Blank separator line, then tell the user where the results are.
printf '\nDone. Inspect %s/mosaic and %s/gradient.\n' "${OUT}" "${OUT}"