Spaces:
Sleeping
Sleeping
File size: 1,949 Bytes
7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd dd83864 7e042dd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 |
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity
# CodeT5+ 110M embedding checkpoint; trust_remote_code is required because
# the embedding head is defined by custom modeling code on the Hub.
MODEL_NAME = "Salesforce/codet5p-110m-embedding"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
def get_embedding(code):
    """Embed a code string and return it as a (1, dim) numpy array.

    The input is truncated to 4096 tokens. Inference runs under
    ``torch.no_grad`` since no gradients are needed.
    """
    inputs = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        max_length=4096,  # <-- 4096 TOKENS
        padding=True
    )
    with torch.no_grad():
        outputs = model(**inputs)
    # NOTE(review): the CodeT5+ embedding checkpoint's custom modeling code
    # returns the pooled embedding tensor directly, so the original
    # `outputs.last_hidden_state` access raises AttributeError. Support both:
    # mean-pool a standard encoder output, otherwise use the tensor as-is.
    if hasattr(outputs, "last_hidden_state"):
        embedding = outputs.last_hidden_state.mean(dim=1)
    else:
        embedding = outputs
        if embedding.dim() == 1:
            # Keep the (1, dim) shape expected by cosine_similarity callers.
            embedding = embedding.unsqueeze(0)
    # detach/cpu are no-ops here but make the conversion robust.
    return embedding.detach().cpu().numpy()
# Module-level history of all submissions seen so far (grows unboundedly
# for the lifetime of the process; no persistence across restarts).
stored_embeddings: list = []  # one (1, dim) numpy embedding per submission
stored_codes: list = []  # raw source text of each submission, parallel list
def analyze(code):
    """Compare a submission against all previously stored ones.

    Embeds *code*, reports the highest cosine similarity to any earlier
    submission with a tiered verdict, then stores the new embedding and
    source text for future comparisons.

    Returns the report as a newline-terminated string.
    """
    # No `global` needed: the lists are mutated in place, never rebound.
    emb = get_embedding(code)
    lines = []
    if not stored_embeddings:
        lines.append("First submission stored. No comparisons yet.")
    else:
        all_embs = np.vstack(stored_embeddings)
        sims = cosine_similarity(emb, all_embs)[0]
        best = float(np.max(sims))
        lines.append(f"Most similar previous submission similarity: {best:.3f}")
        # Heuristic thresholds for flagging copied / shared approaches.
        if best > 0.9:
            lines.append("⚠ Very high similarity — likely same approach / copied")
        elif best > 0.75:
            lines.append("🔁 Same algorithmic structure / same method")
        elif best > 0.5:
            lines.append("🟡 Partial similarity")
        else:
            lines.append("🟢 Unique solution")
    stored_embeddings.append(emb)
    stored_codes.append(code)
    return "\n".join(lines) + "\n"
# Wire the analyzer into a simple two-textbox Gradio UI and start serving.
demo = gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=12, label="Student Python Code"),
    outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Detection (4096 tokens)"),
    title="CodeT5p – Code Embedding Similarity (4096-token input)",
)
demo.launch()
|