"""Gradio app: flag similar student Python submissions via CodeT5p embeddings.

Each submission is embedded with Salesforce/codet5p-110m-embedding and
compared (cosine similarity) against all previously stored submissions.
"""

import gradio as gr
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "Salesforce/codet5p-110m-embedding"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
model.eval()  # disable dropout so repeated embeddings of the same code are identical


def get_embedding(code: str) -> np.ndarray:
    """Return a (1, dim) float embedding for *code*.

    Parameters
    ----------
    code : source text to embed.

    Returns
    -------
    np.ndarray of shape (1, embedding_dim), suitable for np.vstack and
    sklearn's cosine_similarity.
    """
    inputs = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        # NOTE(review): CodeT5p was trained with 512-token inputs; confirm the
        # checkpoint's position embeddings actually support 4096 tokens.
        max_length=4096,
        padding=True,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    if torch.is_tensor(outputs):
        # BUG FIX: the codet5p-*-embedding checkpoints (trust_remote_code)
        # return the pooled embedding tensor directly from forward(); the
        # original `outputs.last_hidden_state` raised AttributeError here.
        embedding = outputs
    else:
        # Fallback for plain encoder outputs: mask-aware mean pooling so
        # padding tokens do not dilute the average (the original divided by
        # the full sequence length regardless of padding).
        hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).type_as(hidden)
        embedding = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    if embedding.dim() == 1:
        embedding = embedding.unsqueeze(0)  # normalize to (1, dim)
    return embedding.numpy()


# In-memory store of all prior submissions (per-process; resets on restart).
stored_embeddings: list[np.ndarray] = []
stored_codes: list[str] = []


def analyze(code: str) -> str:
    """Embed *code*, report its similarity to prior submissions, then store it.

    Returns a human-readable report; thresholds 0.9 / 0.75 / 0.5 bucket the
    best cosine similarity into copied / same-method / partial / unique.
    """
    # No `global` needed: the lists are mutated in place, never rebound.
    emb = get_embedding(code)
    text = ""
    if not stored_embeddings:
        text += "First submission stored. No comparisons yet.\n"
    else:
        all_embs = np.vstack(stored_embeddings)
        sims = cosine_similarity(emb, all_embs)[0]
        best = float(np.max(sims))
        idx = int(np.argmax(sims))  # index of the closest prior submission
        text += f"Most similar previous submission similarity: {best:.3f}\n"
        if best > 0.9:
            text += "⚠ Very high similarity — likely same approach / copied\n"
        elif best > 0.75:
            text += "🔁 Same algorithmic structure / same method\n"
        elif best > 0.5:
            text += "🟡 Partial similarity\n"
        else:
            text += "🟢 Unique solution\n"
    stored_embeddings.append(emb)
    stored_codes.append(code)
    return text


gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=12, label="Student Python Code"),
    outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Detection (4096 tokens)"),
    title="CodeT5p – Code Embedding Similarity (4096-token input)",
).launch()