# Hugging Face Space: CodeT5p code-embedding similarity demo
import gradio as gr
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

# CodeT5+ embedding checkpoint. trust_remote_code is required because the
# model class is defined in the checkpoint repository, not in transformers.
MODEL_NAME = "Salesforce/codet5p-110m-embedding"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
def get_embedding(code):
    """Return a (1, dim) numpy embedding for a source-code string.

    Tokenizes ``code``, runs the embedding model without gradients, and
    reduces the output to a single vector per input.
    """
    inputs = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        # NOTE(review): CodeT5+ checkpoints are commonly trained with a
        # 512-token context; confirm the remote-code model really supports
        # 4096 positions before advertising this limit in the UI.
        max_length=4096,
        padding=True,
    )
    with torch.no_grad():
        outputs = model(**inputs)
    if torch.is_tensor(outputs):
        # The codet5p-*-embedding remote code returns the pooled
        # (batch, dim) embedding tensor directly — there is no
        # .last_hidden_state attribute on a plain tensor.
        embedding = outputs
    else:
        # Fallback for standard encoder outputs: mean-pool over the
        # sequence dimension.
        embedding = outputs.last_hidden_state.mean(dim=1)
    # .cpu() keeps this safe if the model is ever moved to an accelerator.
    return embedding.cpu().numpy()
# Module-level state: one embedding / one code snippet per past submission.
# Grows without bound for the lifetime of the process (acceptable for a demo).
stored_embeddings = []
stored_codes = []

def analyze(code):
    """Compare a submission against all stored ones, then store it.

    Returns a human-readable report. Similarity is the cosine similarity
    between code embeddings; the thresholds below are heuristic buckets,
    not calibrated probabilities.
    """
    # No `global` needed: the lists are mutated in place, never rebound.
    emb = get_embedding(code)
    text = ""
    if len(stored_embeddings) == 0:
        text += "First submission stored. No comparisons yet.\n"
    else:
        all_embs = np.vstack(stored_embeddings)
        sims = cosine_similarity(emb, all_embs)[0]
        best = float(np.max(sims))
        text += f"Most similar previous submission similarity: {best:.3f}\n"
        if best > 0.9:
            text += "❌ Very high similarity — likely same approach / copied\n"
        elif best > 0.75:
            text += "🔁 Same algorithmic structure / same method\n"
        elif best > 0.5:
            text += "🟡 Partial similarity\n"
        else:
            text += "🟢 Unique solution\n"
    stored_embeddings.append(emb)
    stored_codes.append(code)
    return text
# Wire the analyzer into a single-textbox Gradio UI and serve it.
gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=12, label="Student Python Code"),
    outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Detection (4096 tokens)"),
    title="CodeT5p — Code Embedding Similarity (4096-token input)",
).launch()