Update app.py
app.py CHANGED
@@ -4,63 +4,73 @@ import numpy as np
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# CodeT5+ embedding checkpoint; trust_remote_code loads its custom model class from the Hub
model_name = "Salesforce/codet5p-110m-embedding"

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True
)

model = AutoModel.from_pretrained(
    model_name,
    trust_remote_code=True
)
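As written, the model runs on CPU, which is what a free Space provides. Moving it to a GPU is a small change; a minimal sketch, assuming a CUDA device is present (inputs built in get_embedding below would then need the same .to(device) move, and the result a .cpu() before .numpy()):

# optional: use a GPU when one is available
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(device)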
def get_embedding(code):
    inputs = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        max_length=4096,  # accept inputs up to 4096 tokens
        padding=True
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # mean pooling over the token dimension for a fixed-size embedding
    embedding = outputs.last_hidden_state.mean(dim=1)

    return embedding.numpy()
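One caveat: the remote code some embedding checkpoints ship returns a pooled embedding tensor directly rather than an output object with last_hidden_state. A defensive sketch that handles both cases (get_embedding_safe is a hypothetical helper, not part of this Space):

def get_embedding_safe(code):
    inputs = tokenizer(code, return_tensors="pt", truncation=True,
                       max_length=4096, padding=True)
    with torch.no_grad():
        out = model(**inputs)
    if torch.is_tensor(out):
        # already pooled: shape (batch, hidden_dim) or (hidden_dim,)
        emb = out if out.dim() == 2 else out.unsqueeze(0)
    else:
        emb = out.last_hidden_state.mean(dim=1)
    return emb.numpy()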
# in-memory stores shared across all calls (cleared when the Space restarts)
stored_embeddings = []
stored_codes = []
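Because these lists live at module level, every visitor to the Space shares one pool of submissions. If per-session storage were wanted instead, Gradio's gr.State could hold the list; a minimal sketch of that alternative wiring (analyze_session is a hypothetical variant that would replace analyze and the gr.Interface call below):

def analyze_session(code, embs):
    # embs: this session's stored embeddings, held in gr.State
    emb = get_embedding(code)
    if len(embs) == 0:
        msg = "First submission this session."
    else:
        sims = cosine_similarity(emb, np.vstack(embs))[0]
        msg = f"Best similarity this session: {float(np.max(sims)):.3f}"
    embs.append(emb)
    return msg, embs

# wiring: gr.Interface(fn=analyze_session,
#                      inputs=[gr.Textbox(lines=12), gr.State([])],
#                      outputs=[gr.Textbox(lines=12), gr.State()])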
def analyze(code):
    global stored_embeddings, stored_codes

    emb = get_embedding(code)

    text = ""

    if len(stored_embeddings) == 0:
        text += "First submission stored. No comparisons yet.\n"
    else:
        all_embs = np.vstack(stored_embeddings)
        sims = cosine_similarity(emb, all_embs)[0]

        best = float(np.max(sims))
        idx = int(np.argmax(sims))

        text += f"Most similar previous submission similarity: {best:.3f}\n"
        text += f"Most similar code index: {idx}\n\n"

        if best > 0.9:
            text += "⚠ Very high similarity → likely same approach / copied\n"
        elif best > 0.75:
            text += "🟠 Same algorithmic structure / same method\n"
        elif best > 0.5:
            text += "🟡 Partial similarity\n"
        else:
            text += "🟢 Unique solution\n"

    stored_embeddings.append(emb)
    stored_codes.append(code)

    return text
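Before wiring up the interface, analyze can be exercised directly; the snippets below are illustrative, and the second call should report high similarity to the first:

# quick local check (hypothetical snippets)
print(analyze("def add(a, b):\n    return a + b"))
print(analyze("def total(x, y):\n    return x + y"))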
gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=12, label="Student Python Code"),
    outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Detection (4096 tokens)"),
    title="CodeT5p – Code Embedding Similarity (4096-token input)"
).launch()