# Hugging Face Space: compute token-level entropy (NLL) of source code under a causal LM.
import os
import math
import spaces
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM
MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-4B")
#MODEL_ID = os.getenv("MODEL_ID", "bigcode/starcoder2-3b")
def load_model():
    """Load the tokenizer and causal LM named by MODEL_ID.

    Returns:
        tuple: (tokenizer, model), with the model in eval mode and placed
        by ``device_map="auto"``.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Batching requires a pad token; fall back to EOS when the checkpoint
    # does not define one.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    lm.eval()
    return tok, lm
# Load once at import time so all request handlers share a single model instance.
TOKENIZER, MODEL = load_model()
@spaces.GPU
def compute_entropy(code: str):
    """Score `code` with per-token negative log-likelihood (NLL) under MODEL.

    Args:
        code: Source code text to evaluate.

    Returns:
        tuple: (summary, rows) where summary is a human-readable string of
        aggregate statistics (average/total NLL in nats and bits) and rows is
        a list of [token, nll_nats, prob] per evaluated token. On empty or
        too-short input, returns (message, None).
    """
    if not code or not code.strip():
        return "Please paste some source code.", None

    with torch.no_grad():
        # The model was loaded with device_map="auto", so it is already
        # placed. Move the INPUTS to the model's device instead of moving the
        # model: torch.device("cuda") != torch.device("cuda:0"), so the old
        # equality check always mis-fired, and .to() can raise on
        # accelerate-dispatched models.
        device = next(MODEL.parameters()).device

        enc = TOKENIZER(code, return_tensors="pt")
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)

        # Need at least 2 tokens to have one next-token prediction.
        if input_ids.shape[1] < 2:
            return "Input is too short to compute token-level entropy.", None

        outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask)
        # Shift: logits at position i predict the token at position i+1,
        # so the first token has no prediction and is excluded.
        shift_logits = outputs.logits[:, :-1, :]
        shift_labels = input_ids[:, 1:]
        # Cast to float32 before log_softmax for numerical stability when
        # the model runs in fp16/bf16.
        log_probs = torch.log_softmax(shift_logits.float(), dim=-1)
        # Gather the log-probability assigned to the true next token.
        true_log_probs = log_probs.gather(2, shift_labels.unsqueeze(-1)).squeeze(-1)
        nll = -true_log_probs  # negative log-likelihood (nats)

        nll_list = nll.squeeze(0).cpu().tolist()
        label_ids = shift_labels.squeeze(0).cpu().tolist()

    tokens = TOKENIZER.convert_ids_to_tokens(label_ids)
    rows = [[tok, float(v), float(math.exp(-v))] for tok, v in zip(tokens, nll_list)]

    avg_nll = sum(nll_list) / len(nll_list)
    # Total "entropy" here is the summed per-token NLL; bits = nats / ln(2).
    total_nats = sum(nll_list)
    summary = (
        f"Tokens evaluated: {len(nll_list)}\n"
        f"Average NLL (nats): {avg_nll:.4f}\n"
        f"Average NLL (bits): {avg_nll / math.log(2):.4f}\n"
        f"Total entropy (nats): {total_nats:.4f}\n"
        f"Total entropy (bits): {total_nats / math.log(2):.4f}"
    )
    return summary, rows
def build_app():
    """Construct the Gradio Blocks UI and return it (not launched)."""
    with gr.Blocks(title="Entropy for Source Code") as ui:
        gr.Markdown(
            f"""
# Source Code Entropy ({MODEL_ID})
Paste code below to compute token-level negative log-likelihood (NLL).
The table shows each token's NLL and probability under the model.
"""
        )
        code_box = gr.Textbox(
            label="Source Code",
            lines=16,
            placeholder="Paste your source code here...",
        )
        run_btn = gr.Button("Compute Entropy")
        summary_box = gr.Textbox(label="Summary", lines=4)
        nll_table = gr.Dataframe(
            headers=["token", "nll_nats", "prob"],
            datatype=["str", "number", "number"],
            label="Token-level NLL",
        )
        # Wire the button to the scoring function: one text input,
        # two outputs (summary text + per-token table).
        run_btn.click(fn=compute_entropy, inputs=[code_box], outputs=[summary_box, nll_table])
        gr.Markdown(
            """
Notes:
- NLL is computed for next-token prediction and excludes the first token.
- Large inputs may take time to process depending on hardware.
"""
        )
    return ui
# Build the UI at import time so Spaces / `gradio app.py` can find `app`.
app = build_app()
if __name__ == "__main__":
    # Direct execution: start the Gradio server.
    app.launch()