File size: 1,949 Bytes
7e042dd
 
 
 
 
 
 
 
dd83864
 
 
 
 
 
 
 
 
7e042dd
 
 
 
 
 
dd83864
 
7e042dd
dd83864
7e042dd
 
dd83864
 
7e042dd
dd83864
7e042dd
 
dd83864
 
7e042dd
 
dd83864
7e042dd
 
 
dd83864
7e042dd
dd83864
 
7e042dd
dd83864
 
7e042dd
dd83864
7e042dd
 
dd83864
7e042dd
dd83864
 
 
 
 
 
7e042dd
dd83864
7e042dd
dd83864
 
7e042dd
dd83864
7e042dd
 
 
dd83864
 
 
7e042dd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
import gradio as gr
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

# Embedding checkpoint for source code. trust_remote_code is required:
# the repository ships a custom model class (CodeT5+ embedding head)
# that is not part of the transformers library itself.
model_name = "Salesforce/codet5p-110m-embedding"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(model_name, trust_remote_code=True)

def get_embedding(code):
    """Embed a code string and return a (1, dim) numpy array.

    Tokenizes ``code`` (truncated to 4096 tokens), runs the model without
    gradient tracking, and returns one embedding row per input — here
    always a single row, matching what ``analyze`` stacks and compares.
    """
    inputs = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        max_length=4096,   # long-input budget; T5-style relative positions permit >512 tokens
        padding=True
    )

    with torch.no_grad():
        outputs = model(**inputs)

    # BUG FIX: the codet5p-110m-embedding checkpoint's custom forward
    # returns the pooled embedding tensor directly (see its model card),
    # so the previous `outputs.last_hidden_state` raised AttributeError
    # on a plain Tensor. Use the tensor as-is when that is what we got;
    # otherwise fall back to mean pooling over token hidden states.
    if torch.is_tensor(outputs):
        embedding = outputs
    else:
        embedding = outputs.last_hidden_state.mean(dim=1)

    # .cpu() is a no-op on CPU but keeps this correct if the model is
    # ever moved to an accelerator.
    return embedding.cpu().numpy()

# In-memory corpus of prior submissions, kept as two parallel lists
# (index i of each refers to the same submission). Not persisted:
# contents are lost on restart.
stored_embeddings = []  # one (1, dim) numpy array per submission
stored_codes = []  # the raw source text of each submission

def analyze(code):
    """Compare a submission against all stored ones, then store it.

    Embeds ``code``, reports the highest cosine similarity to any previous
    submission with a tiered verdict (>0.9 copied, >0.75 same method,
    >0.5 partial, else unique), appends the new embedding/source to the
    module-level stores, and returns the report text.
    """
    # No `global` needed: the lists are mutated in place, never rebound.
    emb = get_embedding(code)

    # Collect report fragments and join once at the end instead of
    # repeated string concatenation.
    parts = []

    if not stored_embeddings:
        parts.append("First submission stored. No comparisons yet.\n")
    else:
        all_embs = np.vstack(stored_embeddings)
        sims = cosine_similarity(emb, all_embs)[0]

        best = float(np.max(sims))

        parts.append(f"Most similar previous submission similarity: {best:.3f}\n")

        if best > 0.9:
            parts.append("⚠ Very high similarity — likely same approach / copied\n")
        elif best > 0.75:
            parts.append("🔁 Same algorithmic structure / same method\n")
        elif best > 0.5:
            parts.append("🟡 Partial similarity\n")
        else:
            parts.append("🟢 Unique solution\n")

    # Store only after comparing, so a submission is never matched
    # against itself.
    stored_embeddings.append(emb)
    stored_codes.append(code)

    return "".join(parts)

# Single-textbox UI: paste student code, get the similarity report back.
demo = gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=12, label="Student Python Code"),
    outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Detection (4096 tokens)"),
    title="CodeT5p – Code Embedding Similarity (4096-token input)",
)
demo.launch()