"""
Hugging Face Space — CodeT5-large encoder for the pseudoscore-x backend.

Exposes a Gradio API at /encode that:
  - tokenises text (with the same added special tokens the notebook used)
  - runs the FROZEN encoder forward pass
  - returns last_hidden_state (float16, base64-encoded), the attention mask,
    and the cleaned subword tokens used for signal extraction

Designed for the FREE CPU tier on HF Spaces. The encoder weights load once
at Space startup; subsequent requests are just forward passes.

Call from Python:
    from gradio_client import Client
    client = Client("YOUR_USERNAME/pseudoscorex-encoder")
    out = client.predict("hello world", api_name="/encode")
"""

import base64
import os

import gradio as gr
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Both settings can be overridden through environment variables.
ENCODER_NAME = os.getenv("ENCODER_NAME", "Salesforce/codet5-large")
MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))

# ── Boot: load tokenizer + frozen encoder once ─────────────────────────────
print(f"[boot] Loading tokenizer: {ENCODER_NAME}")
tokenizer = AutoTokenizer.from_pretrained(ENCODER_NAME)
# NOTE(review): both token strings are empty as written here — confirm they
# match the special tokens the notebook registered.
tokenizer.add_tokens(["", ""], special_tokens=True)

# The tokenizer is not modified after this point, so the set of special
# tokens is built once here instead of on every request.
_ALL_SPECIAL_TOKENS = frozenset(tokenizer.all_special_tokens)

print(f"[boot] Loading encoder: {ENCODER_NAME}")
full_model = AutoModelForSeq2SeqLM.from_pretrained(ENCODER_NAME)
encoder = full_model.encoder
# Keep the embedding table in step with the enlarged tokenizer vocabulary.
encoder.resize_token_embeddings(len(tokenizer))
encoder.eval()
for p in encoder.parameters():
    p.requires_grad = False
del full_model  # decoder unused
print("[boot] Encoder ready.")

# Not read by any code in this file; kept so the module's names stay the same.
SPECIAL_TOKENS = {"", "", "", "", "", ""}


def _decode_clean_tokens(text: str) -> list:
    """Mirrors model/signals.py::decode_clean_tokens on the server.

    Tokenises ``text`` (truncated to MAX_LENGTH, no padding) and returns the
    subword tokens as strings, with special tokens dropped and the "▁" / "Ġ"
    word-boundary markers and surrounding whitespace stripped. Tokens that
    are empty after cleaning are omitted.
    """
    ids = tokenizer(text, max_length=MAX_LENGTH, truncation=True)["input_ids"]
    clean = []
    for tok in tokenizer.convert_ids_to_tokens(ids):
        if tok in _ALL_SPECIAL_TOKENS or tok.strip() in ("", "▁"):
            continue
        cleaned = tok.replace("▁", "").replace("Ġ", "").strip()
        if cleaned:
            clean.append(cleaned)
    return clean


@torch.no_grad()
def encode(text: str) -> dict:
    """Run the frozen encoder on ``text``.

    Returns a JSON-serialisable dict:
        {
          "hidden_b64": base64 of the float16 hidden-state bytes (arr.tobytes()),
          "shape": [seq_len, hidden_dim],
          "attention_mask": [int, ...],   # length = seq_len
          "clean_tokens": [str, ...],     # for signal extraction
        }

    The input is padded to MAX_LENGTH, so seq_len always equals MAX_LENGTH;
    use ``attention_mask`` to tell real positions from padding.

    Raises:
        gr.Error: if ``text`` is not a string or is empty / whitespace-only.
    """
    if not isinstance(text, str) or not text.strip():
        raise gr.Error("text must be a non-empty string")

    batch = tokenizer(
        text,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    hidden = encoder(**batch).last_hidden_state  # (1, seq_len, hidden_dim)
    # Drop the batch axis and downcast to float16 to halve the payload size.
    arr = hidden[0].cpu().numpy().astype(np.float16)  # (seq_len, hidden_dim)

    return {
        "hidden_b64": base64.b64encode(arr.tobytes()).decode("ascii"),
        "shape": list(arr.shape),
        "attention_mask": batch["attention_mask"][0].cpu().tolist(),
        "clean_tokens": _decode_clean_tokens(text),
    }


# ── Gradio UI + API ────────────────────────────────────────────────────────
with gr.Blocks(title="pseudoscore-x encoder") as demo:
    gr.Markdown(
        "# pseudoscore-x encoder\n"
        "CodeT5-large encoder with `` and `` special tokens.\n"
        "Use the **/encode** API endpoint from your backend."
    )
    inp = gr.Textbox(label="Text", lines=4, placeholder="Paste text to encode…")
    out = gr.JSON(label="Encoded output")
    btn = gr.Button("Encode")
    # api_name="encode" is what exposes this handler as the /encode endpoint.
    btn.click(fn=encode, inputs=inp, outputs=out, api_name="encode")


if __name__ == "__main__":
    demo.queue(max_size=8).launch(server_name="0.0.0.0", server_port=7860)