import sys
sys.path.append("..")
from dataset import load_data
from Bio.pairwise2 import align
def calculate_similarity_score(seq_gen, seq_tem):
    """
    Run a global pairwise alignment between the two sequences and return
    the RAW best alignment score (globalxx: identity match scoring, no
    gap penalties).

    NOTE(review): the original (Chinese) docstring claimed the score was
    normalized by sequence length; no normalization happens here — the
    caller (compute_intra_dataset_diversity) divides by the query length
    itself.
    """
    score = align.globalxx(seq_gen, seq_tem)[0].score
    return score
def read_data():
    """
    Load the training workbook and map each template to its variant list.

    Returns:
        dict: {template_sequence: [variant_sequence, ...]} where each
        variant list excludes the template itself.
    """
    data = load_data("../dataset/train.xlsx")
    for template, variants in data.items():
        # Compare by value (`!=`), not identity (`is not`): equal strings
        # are not guaranteed to be the same object in CPython, so the
        # identity test could leak the template into its own variant list.
        data[template] = [seq for seq in variants.keys() if seq != template]
    return data
# visualize_peptide_dataset.py
import os
from collections import Counter, defaultdict
from joblib import Parallel, delayed
import numpy as np
import matplotlib.pyplot as plt
def extract_templates_and_variants(data_dict):
    """
    Split the dataset mapping into its template keys and the mapping itself.

    Args:
        data_dict: {template_L: [variant_with_D1, variant_with_D2, ...]}

    Returns:
        tuple: (templates, variants_map) — templates is the list of
        template sequences (uppercase L-type); variants_map maps each
        template to its variant sequences (which may contain lowercase
        d-residues). The map is returned as-is, not copied.
    """
    return list(data_dict), data_dict
def compute_length_distribution(templates):
    """Return the length of every template sequence, in input order."""
    return list(map(len, templates))
def compute_amino_acid_composition(templates):
    """
    Compute normalized residue frequencies across all templates.

    Returns:
        aa_list_sorted: amino acids ordered by descending frequency,
            ties broken alphabetically for stability.
        freqs_sorted: matching normalized frequencies (counts / total).
    Note: only uppercase letters are expected in templates.
    """
    counts = Counter()
    for seq in templates:
        counts.update(seq)
    residue_total = sum(counts.values())
    if residue_total == 0:
        return [], []
    # Sorting by raw count descending is equivalent to sorting by
    # frequency descending (the denominator is a shared constant);
    # alphabetical tie-break keeps the ordering deterministic.
    ordered = sorted(counts.items(), key=lambda item: (-item[1], item[0]))
    aa_list_sorted = [aa for aa, _ in ordered]
    freqs_sorted = [cnt / residue_total for _, cnt in ordered]
    return aa_list_sorted, freqs_sorted
def compute_d_substitution_counts(templates, variants_map):
    """
    Count the D-type residues (lowercase letters) in every variant.

    Args:
        templates: iterable of template sequences.
        variants_map: dict template -> list of variant sequences.

    Returns:
        d_counts: list of int, one lowercase-residue count per variant,
        across all variants of all templates. Templates missing from the
        map contribute nothing.
    """
    return [
        sum(ch.islower() for ch in variant)
        for template in templates
        for variant in variants_map.get(template, [])
    ]
def compute_intra_dataset_diversity(templates, n_jobs=-1):
    """
    For each template, compute the maximum similarity to any other template.
    Similarity uses calculate_similarity(seq_gen, seq_tem) from dataset_more.py,
    which normalizes by len(seq_gen). Here we follow your definition and
    treat seq_gen as the query template and seq_tem as the other template.

    Args:
        templates: list of template sequences.
        n_jobs: passed to joblib.Parallel (-1 = all cores).

    Returns:
        max_sims: list of floats, one per template.
    Memory-efficient version with normalization by query length.
    Maintains consistency with original calculate_similarity behavior.
    """
    n = len(templates)
    if n <= 1:
        # With no "other" templates, similarity is defined as 1.0.
        return [1.0] * n
    # Enumerate every unordered (i, j) pair that needs an alignment.
    pairs = [(i, j) for i in range(n) for j in range(i + 1, n)]
    # Compute the raw (unnormalized) similarity scores in parallel.
    similarities = Parallel(n_jobs=n_jobs)(
        delayed(calculate_similarity_score)(templates[i], templates[j])
        for i, j in pairs
    )
    # Fold each pair's score into both templates' running maxima.
    # Each score must be normalized twice, because the denominator
    # differs depending on which sequence plays the role of the query.
    max_sims = [-np.inf] * n
    for (i, j), sim_score in zip(pairs, similarities):
        # Normalize treating i as the query sequence.
        sim_i_query = sim_score / len(templates[i])
        max_sims[i] = max(max_sims[i], sim_i_query)
        # Normalize treating j as the query sequence.
        sim_j_query = sim_score / len(templates[j])
        max_sims[j] = max(max_sims[j], sim_j_query)
    return max_sims
def choose_bins(data, kind="auto", min_bins=10, max_bins=50):
    """
    Robust histogram bin-count selection via the Freedman-Diaconis rule,
    clipped to [min_bins, max_bins].

    Args:
        data: sequence of numeric values; empty input returns 10.
        kind: reserved for alternative strategies; currently ignored
            (kept for backward compatibility with existing callers).
        min_bins: inclusive lower clamp on the returned bin count.
        max_bins: inclusive upper clamp on the returned bin count.

    Returns:
        int: number of histogram bins.
    """
    if not data:
        return 10
    data_arr = np.asarray(data)
    q75, q25 = np.percentile(data_arr, [75, 25])
    # Floor the IQR so the FD bin width is always strictly positive —
    # this also means the original `bin_width <= 0` fallback branch
    # (sqrt rule) was unreachable, so it has been removed.
    iqr = max(q75 - q25, 1e-9)
    bin_width = 2 * iqr / (len(data_arr) ** (1 / 3))
    data_range = data_arr.max() - data_arr.min()
    # All-identical data gives a zero range -> ceil(0) = 0 FD bins,
    # which the clamp below lifts to min_bins.
    bins = int(np.ceil(data_range / bin_width))
    return max(min_bins, min(max_bins, bins))
def plot_distributions(templates, variants_map, figsize=(10, 12), save_path=None, show=True):
    """
    Render a 2x2 summary figure of the peptide dataset.

    Panels:
        1) histogram of template lengths,
        2) bar chart of amino-acid composition (frequency, descending),
        3) histogram of D-substitution counts per variant (integer bins),
        4) histogram of each template's max similarity to the others.

    Args:
        templates: list of template sequences.
        variants_map: dict template -> list of variant sequences.
        figsize: matplotlib figure size (width, height) in inches.
        save_path: if given, the figure is saved to this path.
        show: if True, plt.show() is called before the figure is closed.
    """
    # Compute metrics
    lengths = compute_length_distribution(templates)
    aa_list, aa_freqs = compute_amino_acid_composition(templates)
    d_counts = compute_d_substitution_counts(templates, variants_map)
    max_sims = compute_intra_dataset_diversity(templates)
    # Prepare figure with 4 vertically stacked subplots
    plt.figure(figsize=figsize)
    plt.style.use('seaborn-v0_8-whitegrid')
    plt.subplots_adjust(hspace=0.5)
    # 1) Length distribution (histogram)
    plt.subplot(2, 2, 1)
    if lengths:
        bins_len = choose_bins(lengths)
        plt.hist(lengths, bins=bins_len, color="#4C78A8", edgecolor="white")
    else:
        # Empty dataset: still draw the (empty) axes so the layout holds.
        plt.hist([], bins=10, color="#4C78A8", edgecolor="white")
    plt.title("Length Distribution", fontsize=16, fontweight='bold')
    plt.xlabel("Length", fontsize=14)
    plt.ylabel("Count", fontsize=14)
    # 2) Amino acid composition distribution (sorted descending)
    plt.subplot(2, 2, 2)
    if aa_list:
        # Use histogram-like bar chart since composition is categorical
        x = np.arange(len(aa_list))
        plt.bar(x, aa_freqs, color="#F58518", edgecolor="white")
        plt.xticks(x, aa_list, fontsize=9)
    else:
        plt.bar([], [])
    plt.title("Amino Acid Composition", fontsize=16, fontweight='bold')
    plt.xlabel("Amino Acid", fontsize=14)
    plt.ylabel("Frequency", fontsize=14)
    # 3) D-substitution count distribution (across variants)
    plt.subplot(2, 2, 3)
    if d_counts:
        bins_d = np.arange(0, max(d_counts) + 2) - 0.5  # centered integer bins
        plt.hist(d_counts, bins=bins_d, color="#E45756", edgecolor="white")
        plt.xticks(range(0, max(d_counts) + 1, 2))
    else:
        plt.hist([], bins=np.arange(-0.5, 1.5), color="#E45756", edgecolor="white")
        plt.xticks([0])
    plt.xlabel("Number of D substitutions per parent", fontsize=14)
    plt.ylabel("Count", fontsize=14)
    plt.title("D-type Substitution Count", fontsize=16, fontweight='bold')
    # 4) Intra-dataset diversity: distribution of max similarity to others
    plt.subplot(2, 2, 4)
    if max_sims:
        # Square-root rule for bin count, clamped to [10, 40].
        bins_sim = min(40, max(10, int(len(max_sims) ** 0.5)))
        # Similarity can be negative with mismatch penalties; show full range
        plt.hist(max_sims, bins=bins_sim, color="#72B7B2", edgecolor="white")
        plt.xlim(min(max_sims) - 0.05, max(max_sims) + 0.05)
    else:
        plt.hist([], bins=10, color="#72B7B2", edgecolor="white")
    plt.xlabel("Max similarity to other templates", fontsize=14)
    plt.ylabel("Count", fontsize=14)
    plt.title("Intra-dataset Diversity", fontsize=16, fontweight='bold')
    if save_path:
        plt.savefig(save_path, bbox_inches="tight")
    if show:
        plt.show()
    plt.close()
def main():
    """Entry point: load the dataset and render the four summary plots."""
    # Read data using your provided function
    templates, variants_map = extract_templates_and_variants(read_data())
    plot_distributions(
        templates,
        variants_map,
        figsize=(14, 7),
        save_path="dataset_more.svg",
        show=True,
    )


if __name__ == "__main__":
    main()