| import re |
| import glob |
| import numpy as np |
| import pandas as pd |
|
|
# Log file scanned for per-token hit counts
# (presumably written by the extraction step -- confirm against the producer).
LOG_FILE = "extract_log.txt"

# Maps token id -> N, parsed from lines containing "Saved token <id> (N=<count>)".
token_counts = {}
pattern = re.compile(r"Saved token (\d+) \(N=(\d+)\)")

# Decode explicitly instead of relying on the locale default, and replace
# undecodable bytes rather than raising: the pattern is pure ASCII, so stray
# bytes elsewhere in the log must not abort the scan with UnicodeDecodeError.
with open(LOG_FILE, "r", encoding="utf-8", errors="replace") as f:
    for line in f:
        m = pattern.search(line)
        if m:
            # A token logged more than once keeps the count from its last line.
            token_counts[int(m.group(1))] = int(m.group(2))

print(f"Found {len(token_counts)} tokens with counts from log.")

# One summary record (dict) per token; filled by the PWM loop further down.
rows = []
|
|
def pwm_entropy(pwm, eps=1e-8):
    """Return the mean per-position Shannon entropy of a PWM, in bits.

    Args:
        pwm: (L, 4) array-like of mean one-hot probabilities (an ndarray or
            a nested sequence). Each row is re-normalised to sum to one
            before the entropy is computed, so unnormalised rows (e.g. raw
            counts) are accepted as well.
        eps: Small constant added to the row sums and inside the logarithm
            to avoid division by zero and ``log2(0)``.

    Returns:
        The Shannon entropy of each row, averaged over all rows.
    """
    # Coerce so that plain lists work too; ndarray input passes through as is.
    pwm = np.asarray(pwm)
    row_totals = pwm.sum(axis=1, keepdims=True)
    probs = pwm / (row_totals + eps)
    # Zero-probability entries contribute 0 * log2(eps) == 0 to the sum.
    per_position = -np.sum(probs * np.log2(probs + eps), axis=1)
    return per_position.mean()
|
|
# Column layout of the summary table. Passing it to the DataFrame constructor
# keeps the sort below valid even when no token files were found (an empty
# frame would otherwise have no columns and sort_values would raise KeyError).
summary_columns = ["token_id", "N_hits", "pwm_entropy_bits", "avg_phyloP"]

# Compiled once instead of on every loop iteration.
pwm_name_pattern = re.compile(r"token(\d+)_pwm\.npy")

# Sorted so that files are processed in a deterministic order.
for pwm_path in sorted(glob.glob("token*_pwm.npy")):
    m = pwm_name_pattern.search(pwm_path)
    if not m:
        # The glob also matches names whose middle part is not purely digits.
        continue
    token_digits = m.group(1)
    t = int(token_digits)

    pwm = np.load(pwm_path)
    # Reuse the digits exactly as they appear in the PWM filename so that
    # zero-padded ids (e.g. token007_pwm.npy) resolve to token007_phy.npy
    # rather than to a non-existent token7_phy.npy.
    phy = np.load(f"token{token_digits}_phy.npy")

    H = pwm_entropy(pwm)
    avg_phy = float(phy.mean())
    # None when the log contained no "Saved token" line for this token.
    N_hits = token_counts.get(t)

    rows.append({
        "token_id": t,
        "N_hits": N_hits,
        "pwm_entropy_bits": H,
        "avg_phyloP": avg_phy,
    })

df = pd.DataFrame(rows, columns=summary_columns)
# Lowest entropy first; ties are broken by higher average phyloP.
df = df.sort_values(["pwm_entropy_bits", "avg_phyloP"], ascending=[True, False])

print(df.head(20))

df.to_csv("token_summary.tsv", sep="\t", index=False)
print("\nSaved summary to token_summary.tsv")
|
|
|
|