# Spaces: Build error
# File size: 7,885 Bytes
# Revision ids listed by the file viewer: 0d00ce9 6d06b0d 7005c0b 77c4d8b b71d37c
import csv
import numpy as np
from collections import defaultdict
from scipy.optimize import linear_sum_assignment
import os
def load_clusters(path, sep="|||"):
    """Read a cluster assignment file and group its tokens by cluster id.

    Every line is stripped of surrounding whitespace and split on *sep*.
    The first field is taken as the token and the last field as the cluster
    id; any fields in between are ignored. Lines that yield fewer than two
    fields (including blank lines) are skipped.

    Args:
        path: Path of the UTF-8 encoded text file to read.
        sep: Field separator used inside each line. Defaults to ``"|||"``.

    Returns:
        A ``defaultdict(set)`` mapping each cluster id (as a string) to the
        set of distinct tokens assigned to that cluster.
    """
    cluster_to_tokens = defaultdict(set)
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split(sep)
            if len(parts) < 2:
                continue
            # NOTE: a token that itself contains `sep` (or whose trailing
            # characters merge with the separator, e.g. "||" before "|||")
            # is split at the wrong place; such tokens are not handled here.
            token, cluster_id = parts[0], parts[-1]
            cluster_to_tokens[cluster_id].add(token)
    return cluster_to_tokens
def compute_jaccard_matrix(clusters_a, clusters_b):
    """Compute the pairwise Jaccard similarity between two clusterings.

    Args:
        clusters_a: Mapping from cluster id to a set of tokens.
        clusters_b: Mapping from cluster id to a set of tokens.

    Returns:
        A tuple ``(matrix, a_keys, b_keys)`` where ``a_keys`` and ``b_keys``
        are the cluster ids of each mapping in iteration order and
        ``matrix[i, j]`` is ``|A & B| / |A | B|`` for the sets stored under
        ``a_keys[i]`` and ``b_keys[j]``. A pair of two empty sets scores 0.0.
    """
    a_keys = list(clusters_a)
    b_keys = list(clusters_b)
    matrix = np.zeros((len(a_keys), len(b_keys)))
    for i, ca in enumerate(a_keys):
        # The row's set does not change while scanning the columns.
        set_a = clusters_a[ca]
        for j, cb in enumerate(b_keys):
            set_b = clusters_b[cb]
            intersection = len(set_a & set_b)
            # |A | B| = |A| + |B| - |A & B|; avoids building the union set.
            union = len(set_a) + len(set_b) - intersection
            if union > 0:
                matrix[i, j] = intersection / union
            # else: the cell keeps the 0.0 it was initialised with.
    return matrix, a_keys, b_keys
# Dictionary mapping perturbation names to their descriptions.
# The description is looked up by perturbation name and written to the
# "Description" column of the CSI summary CSV.
perturbation_descriptions = {
    "Scope Modification": "Identifies variables in complex scopes and moves them to unrelated blocks.",
    "Log Modification": "Adds logging statements to blocks of code for tracking execution flow.",
    "Operator Modification": "Modifies boolean expressions by negating them in various contexts.",
    "Pointer Modification": "Add C style pointer to the code.",
    "POS finetuned": "Clusters based on finetuned POS codebert model",
    "Random Modification": "Permutes statements within basic blocks, allowing different execution orders.",
    # NOTE(review): this text describes a switch-to-if conversion, which does
    # not match the "Try Catch" name — confirm the intended description.
    "Try Catch Modification": "Converts switch statements into equivalent if statements.",
    "Unused Statement Modification": "Inserts unused statements into blocks of code for testing/debugging.",
    "Exact Name Class Variable Modification": "Renames classes and variables to a specific randomly generated name.",
    "Casing Class Variable Modification": "Generates lexical variations of class and variable names with different casing.",
    "Onecase Modification": "Generates lexical variations of class and variable names with just 1 letter uppercase either for class name or variable name.",
    "Example Modification": "Generates lexical variations of class and variable names with Example being the class name and example being the variable name or vice versa.",
    "Finetuned on compile error": "Clusters based on finetuned codebert model on compile errors",
    "Finetuned on language classification": "Clusters based on finetuned codebert model on language classification",
}
def compute_and_log_csi(file_orig, file_pert, perturbation_name, output_csv="results/csi_summary.csv"):
    """Compute the Cluster Sensitivity Index (CSI) of a perturbation and log it.

    Both cluster files are loaded with ``load_clusters``. Clusters of the two
    files are matched one-to-one so that the summed Jaccard similarity of the
    matched pairs is maximal, and the CSI is ``1 - mean(matched Jaccard)``.
    The result is printed and appended as one row to *output_csv*; a header
    row is written first when the CSV file does not exist yet.

    Args:
        file_orig: Path of the cluster file of the original (unperturbed) run.
        file_pert: Path of the cluster file of the perturbed run.
        perturbation_name: Label written to the CSV and used to look up the
            description in ``perturbation_descriptions``.
        output_csv: Path of the CSV file the result row is appended to.

    Returns:
        A tuple ``(avg_jaccard, csi)``.

    Raises:
        ValueError: If the two files contain a different number of clusters,
            or if neither file contains any cluster (the CSI is undefined).
    """
    clusters_orig = load_clusters(file_orig)
    clusters_pert = load_clusters(file_pert)
    if len(clusters_orig) != len(clusters_pert):
        raise ValueError(
            f"Cluster count mismatch: {len(clusters_orig)} (original) "
            f"vs {len(clusters_pert)} (perturbed)"
        )
    if not clusters_orig:
        # Averaging over zero matched pairs would produce NaN and log it.
        raise ValueError("No clusters found in the input files; CSI is undefined")

    jaccard_matrix, _, _ = compute_jaccard_matrix(clusters_orig, clusters_pert)
    # linear_sum_assignment minimises total cost, so negate the similarities
    # to obtain the assignment with the maximum total Jaccard similarity.
    row_ind, col_ind = linear_sum_assignment(-jaccard_matrix)
    avg_jaccard = np.mean(jaccard_matrix[row_ind, col_ind])
    csi = 1.0 - avg_jaccard

    print(f"Perturbation: {perturbation_name}")
    print(f" Average Jaccard Similarity: {avg_jaccard:.4f}")
    print(f" Cluster Sensitivity Index (CSI): {csi:.4f}")

    # Append to CSV. os.makedirs("") raises, so only create the parent
    # directory when the path actually has one.
    output_dir = os.path.dirname(output_csv)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    file_exists = os.path.isfile(output_csv)
    with open(output_csv, mode="a", newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        if not file_exists:
            writer.writerow(["Perturbation", "Average Jaccard", "CSI", "Description"])
        description = perturbation_descriptions.get(perturbation_name, "No description available")
        writer.writerow([perturbation_name, avg_jaccard, csi, description])
    return avg_jaccard, csi
# Example usage: each run compares a baseline clustering with the clustering
# obtained under one perturbation (or fine-tuned model) and appends the
# result to results/csi_summary.csv. The runs execute in the order listed.
_BASELINE_CLUSTERS = "codenet_4000_del_15000/Java/layer12/kmeans/clusters-kmeans-350.txt"
_PERTURBED_CLUSTERS = "codenet_4000_{}/java/layer12/kmeans/clusters-kmeans-350.txt"

# (original cluster file, perturbed dataset suffix, perturbation name)
_CSI_RUNS = [
    (_BASELINE_CLUSTERS, "scope_error", "Scope Modification"),
    (_BASELINE_CLUSTERS, "log", "Log Modification"),
    (_BASELINE_CLUSTERS, "operator", "Operator Modification"),
    (_BASELINE_CLUSTERS, "pointer", "Pointer Modification"),
    (_BASELINE_CLUSTERS, "POS", "POS finetuned"),
    (_BASELINE_CLUSTERS, "random", "Random Modification"),
    (_BASELINE_CLUSTERS, "trycatch", "Try Catch Modification"),
    (_BASELINE_CLUSTERS, "unusedStatement", "Unused Statement Modification"),
    (_BASELINE_CLUSTERS, "exactNameClassVariable", "Exact Name Class Variable Modification"),
    (_BASELINE_CLUSTERS, "CasingClassVariable", "Casing Class Variable Modification"),
    (_BASELINE_CLUSTERS, "Onecase", "Onecase Modification"),
    # NOTE(review): this run reads its "original" clusters from the Example
    # dataset's "Java" directory rather than from the shared baseline; the two
    # paths differ only in letter case — confirm this is intended.
    (
        "codenet_4000_Example/Java/layer12/kmeans/clusters-kmeans-350.txt",
        "Example",
        "Example Modification",
    ),
    # These two runs previously passed the same file as both the original and
    # the perturbed clustering, which always yields CSI = 0. They now use the
    # shared baseline, like the "POS finetuned" run.
    (_BASELINE_CLUSTERS, "finetuned_compile_error", "Finetuned on compile error"),
    (_BASELINE_CLUSTERS, "finetuned_language_classification", "Finetuned on language classification"),
]

for _orig_path, _dataset, _name in _CSI_RUNS:
    compute_and_log_csi(
        _orig_path,
        _PERTURBED_CLUSTERS.format(_dataset),
        perturbation_name=_name,
        output_csv="results/csi_summary.csv",
    )
# You can now call compute_and_log_csi again and again for other perturbations!