import sys

sys.path.append("..")

from dataset import load_data
from Bio.pairwise2 import align


def calculate_similarity_score(seq_gen, seq_tem):
    """
    Run a global (globalxx) alignment of the two sequences and return the
    raw alignment score; callers normalize it by the query length
    (score / len(query)).
    """
    return align.globalxx(seq_gen, seq_tem)[0].score


def read_data():
    data = load_data("../dataset/train.xlsx")
    for k, v in data.items():
        # Keep every variant key except the template itself; compare by
        # value (!=), not identity (is not), since equal strings need not
        # be the same object.
        data[k] = [i for i in v.keys() if i != k]
    return data


# visualize_peptide_dataset.py
from collections import Counter

from joblib import Parallel, delayed
import numpy as np
import matplotlib.pyplot as plt


def extract_templates_and_variants(data_dict):
    """
    data_dict: {template_L: [variant_with_D1, variant_with_D2, ...]}

    Returns:
        templates: list of template sequences (uppercase L-type)
        variants_map: dict template -> list of variant sequences
            (may contain lowercase d-residues)
    """
    templates = list(data_dict.keys())
    variants_map = data_dict
    return templates, variants_map


def compute_length_distribution(templates):
    return [len(t) for t in templates]


def compute_amino_acid_composition(templates):
    """
    Returns:
        aa_list_sorted: amino acids sorted by frequency, descending
        freqs_sorted: corresponding normalized frequencies

    Note: only uppercase letters are expected in templates.
    """
    counter = Counter()
    total = 0
    for t in templates:
        counter.update(t)
        total += len(t)
    if total == 0:
        return [], []
    # Normalize counts to frequencies
    aa_freq = {aa: cnt / total for aa, cnt in counter.items()}
    # Sort by frequency desc, then alphabetically for stability
    aa_list_sorted = sorted(aa_freq.keys(), key=lambda x: (-aa_freq[x], x))
    freqs_sorted = [aa_freq[aa] for aa in aa_list_sorted]
    return aa_list_sorted, freqs_sorted


def compute_d_substitution_counts(templates, variants_map):
    """
    For each variant of each template, count the D-type residues
    (lowercase letters).

    Returns:
        d_counts: list of int counts across all variants of all templates.
    """
    d_counts = []
    for tpl in templates:
        for v in variants_map.get(tpl, []):
            d_counts.append(sum(1 for c in v if c.islower()))
    return d_counts


def compute_intra_dataset_diversity(templates, n_jobs=-1):
    """
    For each template, compute the maximum similarity to any other template.

    Similarity is the raw globalxx score from calculate_similarity_score,
    normalized by the query template's length. Each unordered pair is
    therefore scored once but normalized twice (once per query).

    Returns:
        max_sims: list of floats, one per template.
    """
    n = len(templates)
    if n <= 1:
        return [1.0] * n

    # Enumerate every unordered (i, j) pair once
    pairs = [(i, j) for i in range(n) for j in range(i + 1, n)]

    # Compute the raw (unnormalized) alignment scores in parallel
    similarities = Parallel(n_jobs=n_jobs)(
        delayed(calculate_similarity_score)(templates[i], templates[j])
        for i, j in pairs
    )

    # A pair shares one raw score, but the denominator depends on which
    # template is treated as the query, so normalize separately for i and j
    max_sims = [-np.inf] * n
    for (i, j), sim_score in zip(pairs, similarities):
        # i as the query sequence
        max_sims[i] = max(max_sims[i], sim_score / len(templates[i]))
        # j as the query sequence
        max_sims[j] = max(max_sims[j], sim_score / len(templates[j]))
    return max_sims
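# --- Illustrative example (a minimal sketch; the toy peptides below are
# made up, not taken from the dataset) ---
# One globalxx score is shared by a pair of templates, but the normalized
# similarity depends on which sequence is treated as the query. This mirrors
# the double normalization in compute_intra_dataset_diversity above.
def _demo_normalization_asymmetry():
    a, b = "ACDEFGHIK", "ACDEFG"  # toy sequences; b is a prefix of a
    raw = calculate_similarity_score(a, b)  # globalxx counts matches: 6.0 here
    print(raw / len(a))  # a as query: 6 / 9 ~= 0.667
    print(raw / len(b))  # b as query: 6 / 6 = 1.0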
""" if not data: return 10 n = len(data) # Freedman–Diaconis rule as a guide data_arr = np.asarray(data) q75, q25 = np.percentile(data_arr, [75, 25]) iqr = max(q75 - q25, 1e-9) bin_width = 2 * iqr / (len(data_arr) ** (1 / 3)) if bin_width <= 0: bins = min(max_bins, max(min_bins, int(np.sqrt(n)))) else: bins = int(np.ceil((data_arr.max() - data_arr.min()) / bin_width)) bins = max(min_bins, min(max_bins, bins)) return bins def plot_distributions(templates, variants_map, figsize=(10, 12), save_path=None, show=True): # Compute metrics lengths = compute_length_distribution(templates) aa_list, aa_freqs = compute_amino_acid_composition(templates) d_counts = compute_d_substitution_counts(templates, variants_map) max_sims = compute_intra_dataset_diversity(templates) # Prepare figure with 4 vertically stacked subplots plt.figure(figsize=figsize) plt.style.use('seaborn-v0_8-whitegrid') plt.subplots_adjust(hspace=0.5) # 1) Length distribution (histogram) plt.subplot(2, 2, 1) if lengths: bins_len = choose_bins(lengths) plt.hist(lengths, bins=bins_len, color="#4C78A8", edgecolor="white") else: plt.hist([], bins=10, color="#4C78A8", edgecolor="white") plt.title("Length Distribution", fontsize=16, fontweight='bold') plt.xlabel("Length", fontsize=14) plt.ylabel("Count", fontsize=14) # 2) Amino acid composition distribution (sorted descending) plt.subplot(2, 2, 2) if aa_list: # Use histogram-like bar chart since composition is categorical x = np.arange(len(aa_list)) plt.bar(x, aa_freqs, color="#F58518", edgecolor="white") plt.xticks(x, aa_list, fontsize=9) else: plt.bar([], []) plt.title("Amino Acid Composition", fontsize=16, fontweight='bold') plt.xlabel("Amino Acid", fontsize=14) plt.ylabel("Frequency", fontsize=14) # 3) D-substitution count distribution (across variants) plt.subplot(2, 2, 3) if d_counts: bins_d = np.arange(0, max(d_counts) + 2) - 0.5 # centered integer bins plt.hist(d_counts, bins=bins_d, color="#E45756", edgecolor="white") plt.xticks(range(0, max(d_counts) + 1, 2)) else: plt.hist([], bins=np.arange(-0.5, 1.5), color="#E45756", edgecolor="white") plt.xticks([0]) plt.xlabel("Number of D substitutions per parent", fontsize=14) plt.ylabel("Count", fontsize=14) plt.title("D-type Substitution Count", fontsize=16, fontweight='bold') # 4) Intra-dataset diversity: distribution of max similarity to others plt.subplot(2, 2, 4) if max_sims: bins_sim = min(40, max(10, int(len(max_sims) ** 0.5))) # Similarity can be negative with mismatch penalties; show full range plt.hist(max_sims, bins=bins_sim, color="#72B7B2", edgecolor="white") plt.xlim(min(max_sims) - 0.05, max(max_sims) + 0.05) else: plt.hist([], bins=10, color="#72B7B2", edgecolor="white") plt.xlabel("Max similarity to other templates", fontsize=14) plt.ylabel("Count", fontsize=14) plt.title("Intra-dataset Diversity", fontsize=16, fontweight='bold') if save_path: plt.savefig(save_path, bbox_inches="tight") if show: plt.show() plt.close() def main(): # Read data using your provided function data = read_data() templates, variants_map = extract_templates_and_variants(data) plot_distributions(templates, variants_map, figsize=(14, 7), save_path="dataset_more.svg", show=True) if __name__ == "__main__": main()