File size: 1,578 Bytes
b46126b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
import re
import glob
import numpy as np
import pandas as pd

LOG_FILE = "extract_log.txt"

# 1. Parse N hits from log file
# Each matching log line carries a token ID and its hit count N; if the same
# token appears on several lines, the last occurrence is the one kept.
pattern = re.compile(r"Saved token (\d+) \(N=(\d+)\)")
token_counts = {}

with open(LOG_FILE, "r") as log_fh:
    # Lazily scan every line; lines that do not match yield None.
    hits = (pattern.search(log_line) for log_line in log_fh)
    for hit in hits:
        if hit is None:
            continue
        token_str, count_str = hit.groups()
        token_counts[int(token_str)] = int(count_str)

print(f"Found {len(token_counts)} tokens with counts from log.")

# 2. For each token, load PWM + phyloP, compute entropy + avg phyloP
rows = []

def pwm_entropy(pwm, eps=1e-8):
    """
    Mean per-position Shannon entropy of a position weight matrix, in bits.

    pwm: (L, 4) array of mean one-hot probs; each row is renormalized to
         sum to (approximately) one before the entropy is taken.
    eps: small constant added to the row totals and inside the log so that
         all-zero rows and zero entries do not produce division-by-zero or
         log(0).
    returns: the entropy averaged across the L positions.
    """
    # Row totals, kept 2-D so they broadcast against the rows of `pwm`.
    row_totals = pwm.sum(axis=1, keepdims=True) + eps
    probs = pwm / row_totals

    # -sum(p * log2(p)) per row -> one entropy value per position, shape (L,)
    weighted_logs = probs * np.log2(probs + eps)
    per_position = -np.sum(weighted_logs, axis=1)

    return per_position.mean()

# Build one summary row per PWM file found in the working directory, then
# write the table sorted by entropy (ascending) and average phyloP
# (descending).
#
# Only filenames of the exact form token<digits>_pwm.npy are processed; any
# other name that happens to satisfy the glob is skipped. The pattern is
# compiled once here rather than on every loop iteration.
PWM_NAME_RE = re.compile(r"token(\d+)_pwm\.npy")

# Declared up front so the DataFrame has these columns even when no PWM file
# is found; otherwise sort_values() below would raise KeyError on an empty,
# column-less frame.
SUMMARY_COLUMNS = ["token_id", "N_hits", "pwm_entropy_bits", "avg_phyloP"]

# glob order is filesystem-dependent; sorting keeps runs reproducible.
for pwm_path in sorted(glob.glob("token*_pwm.npy")):
    # token ID from filename
    m = PWM_NAME_RE.fullmatch(pwm_path)
    if not m:
        continue
    token_digits = m.group(1)
    t = int(token_digits)

    pwm = np.load(pwm_path)       # (L, 4)
    # Reuse the digits exactly as they appear in the PWM filename so that a
    # zero-padded ID (e.g. token007_pwm.npy) resolves to token007_phy.npy
    # rather than token7_phy.npy. A missing companion file still raises
    # FileNotFoundError from np.load.
    phy = np.load(f"token{token_digits}_phy.npy")  # (L,)

    H = pwm_entropy(pwm)
    avg_phy = float(phy.mean())
    # None when the log had no "Saved token" line for this token.
    N_hits = token_counts.get(t)

    rows.append({
        "token_id": t,
        "N_hits": N_hits,
        "pwm_entropy_bits": H,
        "avg_phyloP": avg_phy,
    })

df = pd.DataFrame(rows, columns=SUMMARY_COLUMNS)
df = df.sort_values(["pwm_entropy_bits", "avg_phyloP"], ascending=[True, False])

print(df.head(20))

df.to_csv("token_summary.tsv", sep="\t", index=False)
print("\nSaved summary to token_summary.tsv")