# Codet5p / app.py
# (Hugging Face Space page header preserved as comments:
#  "Kishoreuses5's picture / Update app.py / dd83864 verified")
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
# Load the CodeT5+ embedding checkpoint. trust_remote_code=True is required
# because this checkpoint ships its own model/tokenizer classes.
model_name = "Salesforce/codet5p-110m-embedding"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
def get_embedding(code):
    """Return a (1, D) numpy embedding for a source-code string.

    Tokenizes *code*, runs it through the embedding model without gradient
    tracking, and returns the pooled embedding as a numpy array suitable
    for cosine-similarity comparison.
    """
    inputs = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        # NOTE(review): 4096 exceeds the 512 positions this checkpoint was
        # trained with — inputs longer than the model's true limit may
        # error or degrade; confirm against the model config.
        max_length=4096,
        padding=True,
    )
    with torch.no_grad():
        outputs = model(**inputs)
    # Bug fix: the codet5p-110m-embedding remote-code model returns the
    # pooled embedding tensor directly from forward(), so the original
    # `outputs.last_hidden_state` raised AttributeError. Handle the tensor
    # return, and keep mean pooling as a fallback for plain encoder outputs.
    if isinstance(outputs, torch.Tensor):
        embedding = outputs if outputs.dim() == 2 else outputs.mean(dim=1)
    else:
        embedding = outputs.last_hidden_state.mean(dim=1)
    return embedding.numpy()
# In-memory history of this session's submissions; embeddings are (1, D)
# numpy arrays, codes the raw source strings at matching indices.
stored_embeddings = []
stored_codes = []


def _verdict(best):
    """Map the best cosine similarity to a one-line human-readable verdict."""
    if best > 0.9:
        # Encoding fix: "β€”" was mojibake for an em dash; "πŸ”" below for 🔍.
        return "⚠ Very high similarity — likely same approach / copied\n"
    if best > 0.75:
        return "🔍 Same algorithmic structure / same method\n"
    if best > 0.5:
        return "🟑 Partial similarity\n"
    return "🟒 Unique solution\n"


def analyze(code):
    """Compare *code* against all previous submissions and store it.

    Returns a short report: the best cosine similarity to any earlier
    submission plus a verdict line, or a notice for the first submission.
    Side effect: appends the embedding and source to the module-level
    history lists. (No `global` needed — the lists are mutated in place,
    never rebound.)
    """
    emb = get_embedding(code)
    if not stored_embeddings:
        text = "First submission stored. No comparisons yet.\n"
    else:
        all_embs = np.vstack(stored_embeddings)
        sims = cosine_similarity(emb, all_embs)[0]
        best = float(np.max(sims))
        text = f"Most similar previous submission similarity: {best:.3f}\n"
        text += _verdict(best)
    stored_embeddings.append(emb)
    stored_codes.append(code)
    return text
# Build the UI: one code textbox in, one report textbox out, then serve it.
demo = gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=12, label="Student Python Code"),
    outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Detection (4096 tokens)"),
    title="CodeT5p – Code Embedding Similarity (4096-token input)",
)
demo.launch()