Spaces:

Valmbd
/

Petimot

Running

App Files Files Community

Vlmbd committed on May 19, 2025

Commit

f1804f1

unverified ·

1 Parent(s): 0a329d8

split script

Browse files

Files changed (1) hide show

match_with_train_list_commented.py +289 -0

match_with_train_list_commented.py ADDED Viewed

	@@ -0,0 +1,289 @@

+#!/usr/bin/env python3
+# Author: Elodie Laine
+# Usage: python script.py
+# Purpose: This script performs train-validation-evaluation splits based on sequence-identity clustering
+#          for the petimot tool, ensuring non-redundant datasets with controlled sequence similarity.
+import sys
+import random
def getQueries(fname):
    """
    Parse a file of protein-query pairs into a dictionary mapping proteins to their queries.
    Each non-blank line is split on underscores: the first field is the protein and the
    second field is the query (any further underscore-separated fields are ignored).
    Blank or whitespace-only lines are skipped.
    Args:
        fname (str): Path to input file with format "protein_query" on each line
    Returns:
        dict: Dictionary where keys are proteins and values are lists of queries (in file order)
    Raises:
        ValueError: If a non-blank line contains no underscore separator
    """
    d = {}
    with open(fname, "r") as fIN:
        # Stream the file instead of loading every line into memory first
        for lineno, line in enumerate(fIN, start=1):
            fields = line.strip().split("_")
            # "".split("_") yields [""]: a blank line (e.g. trailing newline) carries no pair
            if fields == [""]:
                continue
            if len(fields) < 2:
                raise ValueError(
                    "%s, line %d: expected 'protein_query', got %r"
                    % (fname, lineno, line.strip())
                )
            d.setdefault(fields[0], []).append(fields[1])
    return d
def getMatchHigherLevel(fnameDB, dEval, dTrainVal):
    """
    Match higher-level clusters with their corresponding lower-level collections.
    Each non-blank line of the database file is one cluster: whitespace-separated chain
    identifiers, the first of which is used as the cluster representative (dictionary key).
    Blank lines are skipped. If the same representative appears on several lines, the
    last line wins.
    Args:
        fnameDB (str): Path to the cluster database file
        dEval (dict): Dictionary of evaluation set proteins and their queries
        dTrainVal (dict): Dictionary of training and validation sets proteins and their queries
    Returns:
        dict: Dictionary where keys are higher-level cluster representatives and values are tuples of
              ([train-val proteins], [eval proteins]) that belong to that cluster. A protein present
              in both input dictionaries is listed in both lists.
    """
    d = {}
    with open(fnameDB, "r") as fIN:
        for line in fIN:
            prots = line.split()
            # A blank line has no representative; indexing prots[0] would fail
            if not prots:
                continue
            trainval_members = []
            eval_members = []
            for p in prots:
                # The two membership tests are independent on purpose:
                # a chain may belong to both sets
                if p in dTrainVal:
                    trainval_members.append(p)
                if p in dEval:
                    eval_members.append(p)
            d[prots[0]] = (trainval_members, eval_members)
    return d
def splitHigherLevel(dEns, random_seed=42):
    """
    Split higher-level clusters into training, validation, and evaluation sets.
    Clusters that contain both train/val and eval proteins are counted in the printed
    statistics but end up in neither dtrainval nor deval_strict.
    The function reseeds the global `random` module and prints split statistics.
    Args:
        dEns (dict): Dictionary mapping higher-level clusters to a tuple
                     ([train-val proteins], [eval proteins])
        random_seed (int): Random seed for reproducibility
    Returns:
        tuple: (dtrainval, lval, ltrain, deval_strict) where:
               - dtrainval: dictionary of clusters with proteins in train/val sets but not in eval
               - lval: list of clusters selected for validation
               - ltrain: list of clusters selected for training
               - deval_strict: dictionary of clusters with proteins only in eval set
    """
    random.seed(random_seed)
    # Clusters that have at least one protein in any set
    dall = {k: v[0] for k, v in dEns.items() if (len(v[1]) + len(v[0])) > 0}
    # Clusters with proteins only in train/val sets (not in eval)
    dtrainval = {k: v[0] for k, v in dEns.items() if len(v[1]) == 0 and len(v[0]) > 0}
    # Clusters with proteins only in eval set (not in train/val)
    deval_strict = {k: v[1] for k, v in dEns.items() if len(v[1]) > 0 and len(v[0]) == 0}
    # Clusters with at least one protein in eval set (may overlap with train/val)
    deval_relax = {k: v[1] for k, v in dEns.items() if len(v[1]) > 0}
    # Validation size is 10% of (train/val-only + eval-only) clusters, but the
    # validation clusters are drawn from the train/val-only clusters alone, so the
    # count is capped there: random.sample raises ValueError when asked for more
    # items than the population holds.
    n_val = min(int((len(dtrainval) + len(deval_strict)) / 10), len(dtrainval))
    # Randomly sample cluster positions for validation
    ival = random.sample(range(len(dtrainval)), n_val)
    myKeys = list(dtrainval.keys())
    # Validation keeps the sampled order; training keeps the dictionary order
    lval = [myKeys[i] for i in ival]
    val_positions = set(ival)  # O(1) membership instead of scanning the list
    ltrain = [key for i, key in enumerate(myKeys) if i not in val_positions]
    # Print statistics about the split
    print("Number of higher-order collections with members in train, val or eval:", len(dall))
    print("Number of higher-order collections with members in train or val and nothing from eval:", len(dtrainval))
    print("Number of higher-order collections with members in eval:", len(deval_relax))
    print("Number of higher-order collections with members in eval and nothing from train-val:", len(deval_strict))
    print("Sample validation clusters:", lval[1:5])
    print("Number of higher-order collections in the new validation set:", len(lval))
    print("Sample training clusters:", ltrain[1:5])
    print("Number of higher-order collections in the new training set:", len(ltrain))
    return dtrainval, lval, ltrain, deval_strict
def reduceRedundancyInEval(dEns, dEval, random_seed=42):
    """
    Keep a single evaluation protein, with a single query, per higher-level cluster.
    For every cluster that has at least one evaluation protein, one of those proteins
    is drawn at random and only its first query is retained. A "warning!!" line is
    printed when the drawn protein had more than one query.
    The function reseeds the global `random` module.
    Args:
        dEns (dict): Dictionary mapping higher-level clusters to a tuple
                     ([train-val proteins], [eval proteins])
        dEval (dict): Dictionary of evaluation set proteins and their queries
        random_seed (int): Random seed for reproducibility
    Returns:
        dict: Dictionary of selected evaluation proteins, each mapped to a one-element query list
    """
    random.seed(random_seed)
    reduced = {}
    for members in dEns.values():
        eval_members = members[1]
        # Clusters without any evaluation protein contribute nothing
        if len(eval_members) == 0:
            continue
        # Draw exactly one representative protein for this cluster
        chosen = random.sample(eval_members, 1)[0]
        queries = dEval[chosen]
        if len(queries) > 1:
            # Extra queries of the chosen protein are dropped; make that visible
            print("warning!!", chosen, queries)
        reduced[chosen] = [queries[0]]
    return reduced
def sampleLowerLevel(dtrainval, myL, myD, random_seed=42):
    """
    Select up to five queries per higher-level cluster, spread over its proteins.
    At most five proteins are used per cluster (drawn at random when the cluster has
    five or more; otherwise all of them, in their stored order). Each selected protein
    contributes its first few queries according to a fixed quota table.
    The function reseeds the global `random` module.
    Args:
        dtrainval (dict): Dictionary mapping higher-level clusters to their train/val proteins
        myL (list): List of higher-level clusters to process
        myD (dict): Dictionary mapping proteins to their queries
        random_seed (int): Random seed for reproducibility
    Returns:
        dict: Dictionary of selected proteins and their retained queries
    """
    random.seed(random_seed)
    max_proteins = 5
    # Row i gives the per-protein query quotas when i + 1 proteins are selected:
    # 1 protein -> 5 queries; 2 proteins -> 3 and 2 queries; ...; 5 proteins -> 1 each
    quota_table = ([5], [3, 2], [2, 2, 1], [2, 1, 1, 1], [1, 1, 1, 1, 1])
    selection = {}
    for cluster in myL:
        members = dtrainval[cluster]
        count = min(len(members), max_proteins)
        if count == max_proteins:
            # Five or more proteins available: draw five at random
            chosen = random.sample(members, count)
        else:
            # Fewer than five: keep them all
            chosen = members
        # Pair each chosen protein with its quota and keep that many leading queries
        for protein, quota in zip(chosen, quota_table[count - 1]):
            selection[protein] = myD[protein][:quota]
    return selection
def writeDico(dEns, fname="match_eval.csv"):
    """
    Write cluster matching information to a CSV file.
    One line is written per cluster with a non-empty member sequence, in the form
    "cluster,member_count,member1-member2-...". Clusters with no members are skipped.
    Args:
        dEns (dict): Dictionary mapping clusters to a sequence of protein names (strings)
        fname (str): Output file name; defaults to "match_eval.csv", the path that was
                     previously hard-coded
    """
    with open(fname, "w") as fOUT:
        for k, members in dEns.items():
            n = len(members)
            if n > 0:
                fOUT.write(k + "," + str(n) + "," + "-".join(members) + "\n")
def write_queries(d, fname):
    """
    Write one "protein_query" line per (protein, query) pair.
    Args:
        d (dict): Dictionary mapping proteins to their queries
        fname (str): Output file name (overwritten if it exists)
    """
    with open(fname, "w") as fOUT:
        for protein, queries in d.items():
            # Same "protein_query" layout that getQueries reads back
            fOUT.writelines(protein + "_" + query + "\n" for query in queries)
def write_values(d, fname):
    """
    Write every element of every value list, one per line, without the keys.
    Args:
        d (dict): Dictionary mapping keys to lists of strings
        fname (str): Output file name (overwritten if it exists)
    """
    with open(fname, "w") as fOUT:
        for values in d.values():
            fOUT.writelines(value + "\n" for value in values)
if __name__ == "__main__":
    # Cluster table shared by every matching step below
    # (30% sequence identity, 80% coverage threshold)
    clusterDB = "rewrited_clusterDB_30_80.tsv"

    # Inputs: evaluation list, train+validation list, full training list
    dEval = getQueries("eval_list.txt")
    dTrainVal = getQueries("train_val_list.txt")
    dTrain = getQueries("full_train_list.txt")

    # Assign eval and train/val proteins to clusters, then split at the cluster level
    dEns = getMatchHigherLevel(clusterDB, dEval, dTrainVal)
    dtrainval, lval, ltrain, deval_strict = splitHigherLevel(dEns)

    # Non-redundant validation list
    write_queries(sampleLowerLevel(dtrainval, lval, dTrainVal), "val_nr_list_12_05.txt")

    # Non-redundant training list
    write_queries(sampleLowerLevel(dtrainval, ltrain, dTrainVal), "train_nr_list_12_05.txt")

    # Non-redundant evaluation list: one protein and one query per cluster
    deval = reduceRedundancyInEval(dEns, dEval)
    write_queries(deval, "eval_nr_list_12_05.txt")

    print("Number of strict evaluation clusters:", len(deval_strict))

    # Strict evaluation list: reduced eval proteins whose cluster holds no protein
    # from the full training list. Only the eval-only clusters of the split are used;
    # the train/val part of the result is discarded.
    dEns2 = getMatchHigherLevel(clusterDB, deval, dTrain)
    deval_strict = splitHigherLevel(dEns2)[3]
    write_values(deval_strict, "eval_strict_list_12_05.txt")

    # Even stricter evaluation list: same procedure against the train+validation list
    dEns3 = getMatchHigherLevel(clusterDB, deval, dTrainVal)
    deval_strict = splitHigherLevel(dEns3)[3]
    write_values(deval_strict, "eval_even_stricter_list_12_05.txt")