import re
import glob
import numpy as np
import pandas as pd
LOG_FILE = "extract_log.txt"

# 1. Parse N hits from log file
# token_counts maps token id -> N, taken from log lines of the form
# "Saved token <id> (N=<count>)".
token_counts = {}
pattern = re.compile(r"Saved token (\d+) \(N=(\d+)\)")

# The pattern is pure ASCII, so bytes that do not decode as UTF-8 elsewhere
# in the log are replaced instead of aborting the whole parse.
with open(LOG_FILE, "r", encoding="utf-8", errors="replace") as f:
    for line in f:
        m = pattern.search(line)
        if m:
            t = int(m.group(1))
            n = int(m.group(2))
            # A token logged more than once keeps its most recent count.
            token_counts[t] = n

print(f"Found {len(token_counts)} tokens with counts from log.")
# 2. For each token, load PWM + phyloP, compute entropy + avg phyloP
# Accumulates one summary dict per token (appended by the loop over the
# token*_pwm.npy files) before being converted to a DataFrame.
rows = []
def pwm_entropy(pwm, eps=1e-8):
    """
    Return the mean per-position Shannon entropy of a PWM, in bits.

    pwm: (L, 4) array (or array-like, e.g. nested lists) of mean one-hot
         probs; each row is re-normalized to sum to ~1 before use.
    eps: small constant added to the row sums and inside the logarithm so
         that all-zero rows and zero entries do not produce division by
         zero or log2(0).
    returns: mean Shannon entropy across positions, in bits

    An all-zero row normalizes to all zeros and therefore contributes an
    entropy of 0 to the mean.
    """
    # Accept any array-like; arrays pass through unchanged (no copy, same dtype).
    pwm = np.asarray(pwm)
    p = pwm / (pwm.sum(axis=1, keepdims=True) + eps)  # normalize safety
    H = -np.sum(p * np.log2(p + eps), axis=1)  # (L,)
    return H.mean()
# Sorted so the processing order does not depend on directory listing order.
for pwm_path in sorted(glob.glob("token*_pwm.npy")):
    # token ID from filename; fullmatch rejects names that merely contain
    # the pattern somewhere (e.g. "tokenA_token5_pwm.npy").
    m = re.fullmatch(r"token(\d+)_pwm\.npy", pwm_path)
    if not m:
        continue
    token_digits = m.group(1)
    t = int(token_digits)

    pwm = np.load(pwm_path)  # expected shape (L, 4)
    # Reuse the digits exactly as written in the PWM filename so zero-padded
    # ids ("token007_pwm.npy") resolve to "token007_phy.npy", not "token7_phy.npy".
    phy = np.load(f"token{token_digits}_phy.npy")  # expected shape (L,)

    H = pwm_entropy(pwm)
    avg_phy = float(phy.mean())
    # None when the token never appeared in the log.
    N_hits = token_counts.get(t, None)

    rows.append({
        "token_id": t,
        "N_hits": N_hits,
        "pwm_entropy_bits": H,
        "avg_phyloP": avg_phy,
    })

# Columns are named explicitly so that an empty `rows` (no matching files)
# still yields a frame with the expected columns instead of making
# sort_values raise KeyError.
df = pd.DataFrame(
    rows,
    columns=["token_id", "N_hits", "pwm_entropy_bits", "avg_phyloP"],
)
# Lowest-entropy tokens first; ties broken by highest average phyloP.
df = df.sort_values(["pwm_entropy_bits", "avg_phyloP"], ascending=[True, False])
print(df.head(20))

df.to_csv("token_summary.tsv", sep="\t", index=False)
print("\nSaved summary to token_summary.tsv")
|