Spaces:
Build error
Build error
| # -*- coding: utf-8 -*- | |
| """ | |
| Martinez-Gil, J. (2025). Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks. | |
| International Journal of Software Engineering and Knowledge Engineering, 35(05), 657–678. | |
| """ | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from sklearn.decomposition import PCA | |
| from transformers import RobertaTokenizer, RobertaModel | |
| import torch | |
| import gradio as gr | |
| from io import BytesIO | |
| from PIL import Image | |
# GraphCodeBERT checkpoint on the Hugging Face Hub; files are cached under
# the local "models/" directory.
_CHECKPOINT = "microsoft/graphcodebert-base"
_CACHE_DIR = "models/"

tokenizer = RobertaTokenizer.from_pretrained(_CHECKPOINT, cache_dir=_CACHE_DIR)
model = RobertaModel.from_pretrained(_CHECKPOINT, cache_dir=_CACHE_DIR)
# Default sorting-algorithm snippets pre-filled into the two code editors.
# They are kept as valid, properly indented Python so that what the user sees
# (and what gets tokenized) is real, runnable code.
default_code_1 = """def bubble_sort(arr):
    n = len(arr)
    for i in range(n):
        for j in range(0, n-i-1):
            if arr[j] > arr[j+1]:
                arr[j], arr[j+1] = arr[j+1], arr[j]
    return arr"""

default_code_2 = """def quick_sort(arr, low, high):
    if low < high:
        pi = partition(arr, low, high)
        quick_sort(arr, low, pi - 1)
        quick_sort(arr, pi + 1, high)

def partition(arr, low, high):
    i = (low - 1)
    pivot = arr[high]
    for j in range(low, high):
        if arr[j] <= pivot:
            i += 1
            arr[i], arr[j] = arr[j], arr[i]
    arr[i+1], arr[high] = arr[high], arr[i+1]
    return (i + 1)"""
def get_token_embeddings(code):
    """Encode a code snippet with GraphCodeBERT and return per-token vectors.

    The snippet is tokenized (truncated to at most 512 tokens) and passed
    through the module-level ``model`` with gradient tracking disabled.

    Args:
        code: Source code text to embed.

    Returns:
        A ``(token_embeddings, tokens)`` pair: the model's last hidden state
        as a NumPy array with the leading batch axis squeezed out, and the
        token strings recovered from the encoded input ids.
    """
    encoded = tokenizer(
        code,
        return_tensors="pt",
        max_length=512,
        truncation=True,
        padding=True,
    )

    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state

    vectors = torch.squeeze(hidden, 0).cpu().numpy()
    token_ids = torch.squeeze(encoded["input_ids"])
    return vectors, tokenizer.convert_ids_to_tokens(token_ids)
def compare_algorithms(code1, code2):
    """Plot a 2-D PCA projection of the token embeddings of two snippets.

    Both snippets are embedded with ``get_token_embeddings``; the two sets of
    token vectors are stacked and projected together with a single PCA fit so
    they share the same axes. Tokens of the first snippet are drawn in red,
    tokens of the second in blue.

    Args:
        code1: Source text of the first snippet.
        code2: Source text of the second snippet.

    Returns:
        A PIL image containing the rendered scatter plot (PNG-encoded).
    """
    # Imported locally so this function brings its own dependency into scope.
    from matplotlib.figure import Figure

    emb1, tokens1 = get_token_embeddings(code1)
    emb2, _ = get_token_embeddings(code2)

    # Rows [0, split) of the stacked matrix belong to the first snippet.
    split = len(tokens1)
    stacked = np.concatenate([emb1, emb2], axis=0)
    coords = PCA(n_components=2).fit_transform(stacked)

    # Use a standalone Figure rather than the global pyplot state machine:
    # this handler runs on server worker threads, where pyplot is not
    # thread-safe, and a pyplot figure would stay registered (leak) if
    # rendering raised before plt.close() was reached.
    fig = Figure(figsize=(6, 5), dpi=150)
    ax = fig.subplots()
    ax.scatter(coords[:split, 0], coords[:split, 1],
               color='red', label="Code 1", s=20)
    ax.scatter(coords[split:, 0], coords[split:, 1],
               color='blue', label="Code 2", s=20)
    ax.legend()
    # The PCA axes carry no interpretable units, so hide ticks and grid.
    ax.set_xticks([])
    ax.set_yticks([])
    ax.grid(False)

    buf = BytesIO()
    fig.savefig(buf, format='png', bbox_inches='tight')
    buf.seek(0)
    return Image.open(buf)
# Markdown rendered below the demo: citation and link to the source repository.
_ARTICLE_MD = """
**Citation**
Martinez-Gil, J. (2025). *Augmenting the Interpretability of GraphCodeBERT for Code Similarity Tasks.* International Journal of Software Engineering and Knowledge Engineering, 35(05), 657–678.
**GitHub Repository**
[View Source on GitHub](https://github.com/jorge-martinez-gil/graphcodebert-interpretability)
"""

# Two Python code editors, pre-filled with the default sorting snippets.
_code_editors = [
    gr.Code(language="python", value=snippet, label=caption)
    for snippet, caption in (
        (default_code_1, "Code 1"),
        (default_code_2, "Code 2"),
    )
]

# Gradio UI: two code inputs in, one PCA scatter image out.
interface = gr.Interface(
    fn=compare_algorithms,
    inputs=_code_editors,
    outputs=gr.Image(type="pil", label="Token Embedding PCA"),
    title="GraphCodeBERT Token Embedding Comparison",
    description="Edit or paste two Python code snippets. This tool compares their token-level embeddings using GraphCodeBERT and PCA.",
    article=_ARTICLE_MD,
)

# Start the web app only when executed as a script.
if __name__ == "__main__":
    interface.launch()