File size: 3,988 Bytes
9b75985
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
"""
Hugging Face Space โ€” CodeT5-large encoder for the pseudoscore-x backend.

Exposes a Gradio API at /encode that:
  - tokenises text (with the same <criterion> / <score> special tokens
    the notebook used)
  - runs the FROZEN encoder forward pass
  - returns last_hidden_state (float16, base64-encoded), the attention
    mask, and the cleaned subword tokens used for signal extraction

Designed for the FREE CPU tier on HF Spaces. The encoder weights load
once at Space startup; subsequent requests are just forward passes.

Call from Python:
    from gradio_client import Client
    client = Client("YOUR_USERNAME/pseudoscorex-encoder")
    out = client.predict("hello world", api_name="/encode")
"""
import base64
import os

import gradio as gr
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


# Model + tokenisation config, overridable via Space environment variables.
ENCODER_NAME = os.getenv("ENCODER_NAME", "Salesforce/codet5-large")
MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))  # tokeniser truncation/padding length

# โ”€โ”€ Boot: load tokenizer + frozen encoder once โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Runs at module import (Space startup); every request afterwards is a
# plain forward pass through the already-loaded encoder.
print(f"[boot] Loading tokenizer: {ENCODER_NAME}")
tokenizer = AutoTokenizer.from_pretrained(ENCODER_NAME)
# Register the two task-specific markers so they tokenise as single units.
tokenizer.add_tokens(["<criterion>", "<score>"], special_tokens=True)

print(f"[boot] Loading encoder: {ENCODER_NAME}")
full_model = AutoModelForSeq2SeqLM.from_pretrained(ENCODER_NAME)
encoder = full_model.encoder
# Grow the embedding matrix to cover the two tokens added above.
encoder.resize_token_embeddings(len(tokenizer))
encoder.eval()
# Freeze: this Space only serves inference, never trains.
for p in encoder.parameters():
    p.requires_grad = False
del full_model  # decoder unused
print("[boot] Encoder ready.")


# NOTE(review): SPECIAL_TOKENS is not referenced anywhere in this file —
# _decode_clean_tokens filters on tokenizer.all_special_tokens instead.
# Presumably kept to mirror the client-side signals module; confirm and
# either wire it in or drop it.
SPECIAL_TOKENS = {"", "<s>", "</s>", "<pad>", "<criterion>", "<score>"}


def _decode_clean_tokens(text: str) -> list[str]:
    """Server-side mirror of model/signals.py::decode_clean_tokens.

    Tokenises *text* (truncated to MAX_LENGTH), discards the tokenizer's
    special tokens and empty / marker-only pieces, and strips the subword
    prefix markers from the surviving pieces.
    """
    encoded = tokenizer(text, max_length=MAX_LENGTH, truncation=True)
    pieces = tokenizer.convert_ids_to_tokens(encoded["input_ids"])
    specials = set(tokenizer.all_special_tokens)

    kept: list[str] = []
    for piece in pieces:
        # Skip special tokens and pieces that are only whitespace/markers.
        if piece in specials:
            continue
        if piece.strip() in ["", "โ–"]:
            continue
        stripped = piece.replace("โ–", "").replace("ฤ ", "").strip()
        if stripped:
            kept.append(stripped)
    return kept


@torch.no_grad()
def encode(text: str):
    """Run the frozen encoder over *text* and package the result.

    Returns a JSON-serialisable dict:
      {
        "hidden_b64": <base64 string of float16 array>,
        "shape":      [seq_len, hidden_dim],
        "attention_mask": [int, ...],   # length = seq_len
        "clean_tokens":   [str, ...],   # for signal extraction
      }

    Raises:
        gr.Error: if *text* is not a non-empty string.
    """
    if not (isinstance(text, str) and text.strip()):
        raise gr.Error("text must be a non-empty string")

    batch = tokenizer(
        text,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
        return_tensors="pt",
    )
    # Single forward pass; gradients disabled by the decorator.
    last_hidden = encoder(**batch).last_hidden_state  # (1, seq_len, 1024)

    # float16 halves the payload before base64 encoding.
    states = last_hidden[0].cpu().numpy().astype(np.float16)  # (seq_len, 1024)
    payload = {
        "hidden_b64": base64.b64encode(states.tobytes()).decode("ascii"),
        "shape": list(states.shape),
        "attention_mask": batch["attention_mask"][0].cpu().tolist(),
        "clean_tokens": _decode_clean_tokens(text),
    }
    return payload


# โ”€โ”€ Gradio UI + API โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€โ”€
# Minimal browser UI; the intended consumer is the /encode API endpoint,
# which api_name="encode" exposes for gradio_client callers.
with gr.Blocks(title="pseudoscore-x encoder") as demo:
    gr.Markdown(
        "# pseudoscore-x encoder\n"
        "CodeT5-large encoder with `<criterion>` and `<score>` special tokens.\n"
        "Use the **/encode** API endpoint from your backend."
    )
    text_box = gr.Textbox(label="Text", lines=4, placeholder="Paste text to encodeโ€ฆ")
    result_view = gr.JSON(label="Encoded output")
    encode_btn = gr.Button("Encode")
    encode_btn.click(fn=encode, inputs=text_box, outputs=result_view, api_name="encode")

if __name__ == "__main__":
    # Small request queue keeps the free CPU tier responsive under bursts.
    demo.queue(max_size=8).launch(server_name="0.0.0.0", server_port=7860)