Spaces:
Sleeping
Sleeping
| import os | |
| import math | |
| import spaces | |
| import torch | |
| import gradio as gr | |
| from transformers import AutoTokenizer, AutoModelForCausalLM | |
| MODEL_ID = os.getenv("MODEL_ID", "Qwen/Qwen3-4B") | |
| #MODEL_ID = os.getenv("MODEL_ID", "bigcode/starcoder2-3b") | |
def load_model():
    """Load the tokenizer and causal LM named by MODEL_ID.

    Returns:
        (tokenizer, model): the tokenizer (with a pad token guaranteed)
        and the model in eval mode, placed by accelerate's device_map.
    """
    tok = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    # Some checkpoints ship without a pad token; reuse EOS so that
    # batched encoding never fails.
    if tok.pad_token is None:
        tok.pad_token = tok.eos_token
    lm = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    lm.eval()
    return tok, lm
# Load once at import time so every request reuses the same weights.
TOKENIZER, MODEL = load_model()
# NOTE(review): `spaces` is imported but was never used; on a ZeroGPU Space
# the CUDA work must run inside a @spaces.GPU-decorated function, otherwise
# the app fails when it tries to touch the GPU. Decorating here is a no-op
# on classic (always-on) GPU hardware.
@spaces.GPU
def compute_entropy(code: str):
    """Score `code` with token-level negative log-likelihood under MODEL.

    Args:
        code: source text to evaluate.

    Returns:
        (summary, rows): a human-readable summary string and a list of
        [token, nll_nats, prob] rows — one per predicted token (the first
        token has no prediction and is excluded). On invalid/too-short
        input returns (error_message, None).
    """
    if not code or not code.strip():
        return "Please paste some source code.", None
    with torch.no_grad():
        # The model was loaded with device_map="auto", so it already sits on
        # its device. Move the *inputs* to the model rather than the model to
        # a guessed device: the old check compared torch.device("cuda") with
        # "cuda:0" (always unequal), so MODEL.to() fired on every call, and
        # .to() on an accelerate-dispatched model can raise.
        device = next(MODEL.parameters()).device
        enc = TOKENIZER(code, return_tensors="pt")
        input_ids = enc["input_ids"].to(device)
        attention_mask = enc.get("attention_mask")
        if attention_mask is not None:
            attention_mask = attention_mask.to(device)
        # Need at least 2 tokens to compute next-token NLL.
        if input_ids.shape[1] < 2:
            return "Input is too short to compute token-level entropy.", None
        outputs = MODEL(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        # Shift for next-token prediction: position t predicts token t+1.
        shift_logits = logits[:, :-1, :]
        shift_labels = input_ids[:, 1:]
        log_probs = torch.log_softmax(shift_logits, dim=-1)
        # Gather the log-probability assigned to the true next token.
        true_log_probs = log_probs.gather(2, shift_labels.unsqueeze(-1)).squeeze(-1)
        nll = -true_log_probs  # negative log-likelihood (nats)
        nll_list = nll.squeeze(0).detach().cpu().tolist()
        label_ids = shift_labels.squeeze(0).detach().cpu().tolist()
        tokens = TOKENIZER.convert_ids_to_tokens(label_ids)
        rows = []
        for tok, nll_val in zip(tokens, nll_list):
            prob = math.exp(-nll_val)
            rows.append([tok, float(nll_val), float(prob)])
        # nll_list is non-empty here (input_ids.shape[1] >= 2 above).
        avg_nll = sum(nll_list) / len(nll_list)
        avg_bits = avg_nll / math.log(2)
        # Total entropy in bits is the summed per-token NLL converted from nats.
        total_bits = sum(nll_list) / math.log(2)
        # Total entropy in nats is simply the sum of per-token NLL (nats).
        total_nats = sum(nll_list)
        summary = (
            f"Tokens evaluated: {len(nll_list)}\n"
            f"Average NLL (nats): {avg_nll:.4f}\n"
            f"Average NLL (bits): {avg_bits:.4f}\n"
            f"Total entropy (nats): {total_nats:.4f}\n"
            f"Total entropy (bits): {total_bits:.4f}"
        )
        return summary, rows
def build_app():
    """Assemble and return the Gradio Blocks UI (does not launch it)."""
    with gr.Blocks(title="Entropy for Source Code") as demo:
        # Header; interpolates the active model id so users know what scored them.
        gr.Markdown(
            f"""
            # Source Code Entropy ({MODEL_ID})
            Paste code below to compute token-level negative log-likelihood (NLL).
            The table shows each token's NLL and probability under the model.
            """
        )
        # Input area for the code to score.
        code = gr.Textbox(
            label="Source Code",
            lines=16,
            placeholder="Paste your source code here...",
        )
        btn = gr.Button("Compute Entropy")
        # Outputs: aggregate stats plus a per-token breakdown table.
        summary = gr.Textbox(label="Summary", lines=4)
        table = gr.Dataframe(
            headers=["token", "nll_nats", "prob"],
            datatype=["str", "number", "number"],
            label="Token-level NLL",
        )
        # Wire the button to the scoring function: one input, two outputs.
        btn.click(fn=compute_entropy, inputs=[code], outputs=[summary, table])
        gr.Markdown(
            """
            Notes:
            - NLL is computed for next-token prediction and excludes the first token.
            - Large inputs may take time to process depending on hardware.
            """
        )
    return demo
# Build the UI at import time so Spaces / `gradio app.py` can discover it;
# launch only when executed directly as a script.
app = build_app()
if __name__ == "__main__":
    app.launch()