| """ |
Hugging Face Space — CodeT5-large encoder for the pseudoscore-x backend.
| |
| Exposes a Gradio API at /encode that: |
| - tokenises text (with the same <criterion> / <score> special tokens |
| the notebook used) |
| - runs the FROZEN encoder forward pass |
| - returns last_hidden_state (float16, base64-encoded), the attention |
| mask, and the cleaned subword tokens used for signal extraction |
| |
| Designed for the FREE CPU tier on HF Spaces. The encoder weights load |
| once at Space startup; subsequent requests are just forward passes. |
| |
| Call from Python: |
| from gradio_client import Client |
| client = Client("YOUR_USERNAME/pseudoscorex-encoder") |
| out = client.predict("hello world", api_name="/encode") |
| """ |
| import base64 |
| import os |
|
|
| import gradio as gr |
| import numpy as np |
| import torch |
| from transformers import AutoTokenizer, AutoModelForSeq2SeqLM |
|
|
|
|
# Model name and tokenisation limit are configurable via environment variables,
# so the Space can be repointed (e.g. to codet5-base) without a code change.
ENCODER_NAME = os.getenv("ENCODER_NAME", "Salesforce/codet5-large")
MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))


print(f"[boot] Loading tokenizer: {ENCODER_NAME}")
tokenizer = AutoTokenizer.from_pretrained(ENCODER_NAME)
# Same special tokens the training notebook used. These MUST be added before
# resize_token_embeddings() below so the embedding matrix gains rows for them.
tokenizer.add_tokens(["<criterion>", "<score>"], special_tokens=True)


print(f"[boot] Loading encoder: {ENCODER_NAME}")
# Load the full seq2seq model, keep only the encoder stack, and free the rest.
full_model = AutoModelForSeq2SeqLM.from_pretrained(ENCODER_NAME)
encoder = full_model.encoder
encoder.resize_token_embeddings(len(tokenizer))  # rows for the 2 new tokens
encoder.eval()  # inference mode (disables dropout)
for p in encoder.parameters():
    p.requires_grad = False  # frozen: forward passes only, never trained here
del full_model  # drop the decoder so only the encoder stays resident in RAM
print("[boot] Encoder ready.")
|
|
|
|
# Token strings treated as structural/special by downstream signal extraction.
# NOTE(review): not referenced anywhere in this file — presumably imported by
# the backend, otherwise dead code; verify. The empty-string entry looks like
# an encoding casualty (possibly the "▁" subword marker) — confirm intent.
SPECIAL_TOKENS = {"", "<s>", "</s>", "<pad>", "<criterion>", "<score>"}
|
|
|
|
def _decode_clean_tokens(text: str) -> list[str]:
    """Tokenise *text* and return its subword tokens stripped of markers.

    Mirrors model/signals.py::decode_clean_tokens on the server: special
    tokens are dropped, and subword-boundary markers — "▁" (U+2581, the
    SentencePiece word-start marker) and "Ġ" (U+0120, the byte-level BPE
    space marker used by CodeT5's Roberta-style tokenizer) — are removed
    from each surviving token.

    Bug fix: both marker literals had been corrupted by an encoding
    round-trip (they appeared as Thai mojibake, "โ" and "ฤ "); restored
    to the real "▁" and "Ġ" characters so marker stripping works again.
    """
    ids = tokenizer(text, max_length=MAX_LENGTH, truncation=True)["input_ids"]
    toks = tokenizer.convert_ids_to_tokens(ids)
    special = set(tokenizer.all_special_tokens)
    clean = []
    for t in toks:
        # Skip special tokens and tokens that are pure whitespace/markers.
        if t in special or t.strip() in ["", "\u2581"]:
            continue
        cleaned = t.replace("\u2581", "").replace("\u0120", "").strip()
        if cleaned:
            clean.append(cleaned)
    return clean
|
|
|
|
@torch.no_grad()
def encode(text: str):
    """Run the frozen encoder over *text* and package the result for JSON.

    Returns a dict with:
      hidden_b64      base64 of the float16 last_hidden_state bytes
      shape           [seq_len, hidden_dim] of that array
      attention_mask  per-position mask values for the encoded sequence
      clean_tokens    marker-stripped subwords for signal extraction

    Raises:
      gr.Error: if *text* is not a non-empty string.
    """
    if not (isinstance(text, str) and text.strip()):
        raise gr.Error("text must be a non-empty string")

    batch = tokenizer(
        text,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # Single-item batch: take row 0 and downcast to float16 for the wire.
    states = encoder(**batch).last_hidden_state[0]
    compact = states.cpu().numpy().astype(np.float16)

    payload = {
        "hidden_b64": base64.b64encode(compact.tobytes()).decode("ascii"),
        "shape": list(compact.shape),
        "attention_mask": batch["attention_mask"][0].cpu().tolist(),
        "clean_tokens": _decode_clean_tokens(text),
    }
    return payload
|
|
|
|
| |
# Gradio UI. The Blocks app doubles as the API surface: the button's
# api_name="encode" exposes the /encode endpoint for gradio_client callers.
with gr.Blocks(title="pseudoscore-x encoder") as demo:
    gr.Markdown(
        "# pseudoscore-x encoder\n"
        "CodeT5-large encoder with `<criterion>` and `<score>` special tokens.\n"
        "Use the **/encode** API endpoint from your backend."
    )
    # Bug fix: the placeholder's ellipsis was encoding-corrupted ("โฆ");
    # restored to a proper "…" character.
    inp = gr.Textbox(label="Text", lines=4, placeholder="Paste text to encode…")
    out = gr.JSON(label="Encoded output")
    btn = gr.Button("Encode")
    btn.click(fn=encode, inputs=inp, outputs=out, api_name="encode")


if __name__ == "__main__":
    # Small queue keeps the free CPU tier responsive under bursty traffic;
    # 0.0.0.0:7860 is the standard HF Spaces binding.
    demo.queue(max_size=8).launch(server_name="0.0.0.0", server_port=7860)
|
|