"""Gradio app: flag similar student Python submissions via CodeT5p embeddings.

Each submission is embedded with Salesforce/codet5p-110m-embedding and
compared (cosine similarity) against all previously stored submissions.
"""

import gradio as gr
import numpy as np
import torch
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModel, AutoTokenizer

MODEL_NAME = "Salesforce/codet5p-110m-embedding"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModel.from_pretrained(MODEL_NAME, trust_remote_code=True)
model.eval()  # disable dropout so repeated embeddings of the same code are identical


def get_embedding(code: str) -> np.ndarray:
    """Return a (1, dim) float embedding for *code*.

    Parameters
    ----------
    code : source text to embed.

    Returns
    -------
    np.ndarray of shape (1, embedding_dim), suitable for np.vstack and
    sklearn's cosine_similarity.
    """
    inputs = tokenizer(
        code,
        return_tensors="pt",
        truncation=True,
        # NOTE(review): CodeT5p was trained with 512-token inputs; confirm the
        # checkpoint's position embeddings actually support 4096 tokens.
        max_length=4096,
        padding=True,
    )
    with torch.no_grad():
        outputs = model(**inputs)

    if torch.is_tensor(outputs):
        # BUG FIX: the codet5p-*-embedding checkpoints (trust_remote_code)
        # return the pooled embedding tensor directly from forward(); the
        # original `outputs.last_hidden_state` raised AttributeError here.
        embedding = outputs
    else:
        # Fallback for plain encoder outputs: mask-aware mean pooling so
        # padding tokens do not dilute the average (the original divided by
        # the full sequence length regardless of padding).
        hidden = outputs.last_hidden_state
        mask = inputs["attention_mask"].unsqueeze(-1).type_as(hidden)
        embedding = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1e-9)

    if embedding.dim() == 1:
        embedding = embedding.unsqueeze(0)  # normalize to (1, dim)
    return embedding.numpy()


# In-memory store of all prior submissions (per-process; resets on restart).
stored_embeddings: list[np.ndarray] = []
stored_codes: list[str] = []


def analyze(code: str) -> str:
    """Embed *code*, report its similarity to prior submissions, then store it.

    Returns a human-readable report; thresholds 0.9 / 0.75 / 0.5 bucket the
    best cosine similarity into copied / same-method / partial / unique.
    """
    # No `global` needed: the lists are mutated in place, never rebound.
    emb = get_embedding(code)
    text = ""
    if not stored_embeddings:
        text += "First submission stored. No comparisons yet.\n"
    else:
        all_embs = np.vstack(stored_embeddings)
        sims = cosine_similarity(emb, all_embs)[0]
        best = float(np.max(sims))
        idx = int(np.argmax(sims))  # index of the closest prior submission
        text += f"Most similar previous submission similarity: {best:.3f}\n"
        if best > 0.9:
            text += "⚠ Very high similarity — likely same approach / copied\n"
        elif best > 0.75:
            text += "🔁 Same algorithmic structure / same method\n"
        elif best > 0.5:
            text += "🟡 Partial similarity\n"
        else:
            text += "🟢 Unique solution\n"
    stored_embeddings.append(emb)
    stored_codes.append(code)
    return text


gr.Interface(
    fn=analyze,
    inputs=gr.Textbox(lines=12, label="Student Python Code"),
    outputs=gr.Textbox(lines=12, label="Code Similarity / Approach Detection (4096 tokens)"),
    title="CodeT5p – Code Embedding Similarity (4096-token input)",
).launch()