Kishoreuses5 committed on
Commit
dd83864
·
verified ·
1 Parent(s): 7e042dd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +38 -28
app.py CHANGED
@@ -4,63 +4,73 @@ import numpy as np
4
  from transformers import AutoTokenizer, AutoModel
5
  from sklearn.metrics.pairwise import cosine_similarity
6
 
7
- # Load CodeT5p embedding model
8
  model_name = "Salesforce/codet5p-110m-embedding"
9
 
10
- tokenizer = AutoTokenizer.from_pretrained(model_name)
11
- model = AutoModel.from_pretrained(model_name)
 
 
 
 
 
 
 
12
 
13
  def get_embedding(code):
14
  inputs = tokenizer(
15
  code,
16
  return_tensors="pt",
17
  truncation=True,
18
- max_length=512
 
19
  )
 
20
  with torch.no_grad():
21
  outputs = model(**inputs)
 
 
22
  embedding = outputs.last_hidden_state.mean(dim=1)
 
23
  return embedding.numpy()
24
 
25
- student_codes = [] # stored embeddings
26
- raw_codes = [] # original text
27
 
28
  def analyze(code):
29
- global student_codes, raw_codes
30
 
31
  emb = get_embedding(code)
32
 
33
- result = ""
34
 
35
- if len(student_codes) == 0:
36
- result += "First submission stored. No comparison yet.\n"
37
  else:
38
- all_embeddings = np.vstack(student_codes)
39
- sims = cosine_similarity(emb, all_embeddings)[0]
40
 
41
- max_sim = float(np.max(sims))
42
  idx = int(np.argmax(sims))
43
 
44
- result += f"Most similar previous submission score: {max_sim:.3f}\n"
45
- result += f"Most similar code index: {idx}\n\n"
46
 
47
- if max_sim > 0.9:
48
- result += "⚠ Very high similarity — likely same approach / plagiarism\n"
49
- elif max_sim > 0.7:
50
- result += "🔍 Same structure / same algorithm\n"
51
- elif max_sim > 0.5:
52
- result += "🟡 Partially similar approach\n"
53
  else:
54
- result += "🟢 Unique solution style\n"
55
 
56
- student_codes.append(emb)
57
- raw_codes.append(code)
58
 
59
- return result
60
 
61
  gr.Interface(
62
  fn=analyze,
63
- inputs=gr.Textbox(lines=10, label="Student Python Code"),
64
- outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Analysis"),
65
- title="CodeT5p — Code Similarity & Approach Clustering"
66
  ).launch()
 
4
  from transformers import AutoTokenizer, AutoModel
5
  from sklearn.metrics.pairwise import cosine_similarity
6
 
 
7
  model_name = "Salesforce/codet5p-110m-embedding"
8
 
9
+ tokenizer = AutoTokenizer.from_pretrained(
10
+ model_name,
11
+ trust_remote_code=True
12
+ )
13
+
14
+ model = AutoModel.from_pretrained(
15
+ model_name,
16
+ trust_remote_code=True
17
+ )
18
 
19
  def get_embedding(code):
20
  inputs = tokenizer(
21
  code,
22
  return_tensors="pt",
23
  truncation=True,
24
+ max_length=4096, # <-- 4096 TOKENS
25
+ padding=True
26
  )
27
+
28
  with torch.no_grad():
29
  outputs = model(**inputs)
30
+
31
+ # mean pooling for embedding
32
  embedding = outputs.last_hidden_state.mean(dim=1)
33
+
34
  return embedding.numpy()
35
 
36
+ stored_embeddings = []
37
+ stored_codes = []
38
 
39
  def analyze(code):
40
+ global stored_embeddings, stored_codes
41
 
42
  emb = get_embedding(code)
43
 
44
+ text = ""
45
 
46
+ if len(stored_embeddings) == 0:
47
+ text += "First submission stored. No comparisons yet.\n"
48
  else:
49
+ all_embs = np.vstack(stored_embeddings)
50
+ sims = cosine_similarity(emb, all_embs)[0]
51
 
52
+ best = float(np.max(sims))
53
  idx = int(np.argmax(sims))
54
 
55
+ text += f"Most similar previous submission similarity: {best:.3f}\n"
 
56
 
57
+ if best > 0.9:
58
+ text += "⚠ Very high similarity — likely same approach / copied\n"
59
+ elif best > 0.75:
60
+ text += "🔍 Same algorithmic structure / same method\n"
61
+ elif best > 0.5:
62
+ text += "🟡 Partial similarity\n"
63
  else:
64
+ text += "🟢 Unique solution\n"
65
 
66
+ stored_embeddings.append(emb)
67
+ stored_codes.append(code)
68
 
69
+ return text
70
 
71
  gr.Interface(
72
  fn=analyze,
73
+ inputs=gr.Textbox(lines=12, label="Student Python Code"),
74
+ outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Detection (4096 tokens)"),
75
+ title="CodeT5p – Code Embedding Similarity (4096-token input)"
76
  ).launch()