Nanny7 committed on
Commit
9b75985
·
0 Parent(s):

encoder space

Browse files
Files changed (3) hide show
  1. README.md +44 -0
  2. app.py +110 -0
  3. requirements.txt +6 -0
README.md ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Pseudoscorex Encoder
3
+ emoji: 🧮
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.44.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ # pseudoscore-x encoder
13
+
14
+ CodeT5-large encoder with `<criterion>` and `<score>` special tokens added
15
+ (matching the training notebook). Returns per-token hidden states for the
16
+ backend's scoring head.
17
+
18
+ ## API
19
+
20
+ ```python
21
+ from gradio_client import Client
22
+
23
+ client = Client("YOUR_USERNAME/pseudoscorex-encoder")
24
+ out = client.predict("hello world", api_name="/encode")
25
+ # out = {
26
+ # "hidden_b64": "<base64 float16 array>",
27
+ # "shape": [512, 1024],
28
+ # "attention_mask": [...],
29
+ # "clean_tokens": [...],
30
+ # }
31
+ ```
32
+
33
+ ## Decoding hidden states
34
+
35
+ ```python
36
+ import base64, numpy as np
37
+ arr = np.frombuffer(base64.b64decode(out["hidden_b64"]), dtype=np.float16)
38
+ arr = arr.reshape(out["shape"]) # (seq_len, 1024)
39
+ ```
40
+
41
+ ## Hardware
42
+
43
+ Runs on the free CPU tier. Encoder is loaded once at boot and weights are
44
+ frozen, so each request is just a forward pass.
app.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face Space — CodeT5-large encoder for the pseudoscore-x backend.
3
+
4
+ Exposes a Gradio API at /encode that:
5
+ - tokenises text (with the same <criterion> / <score> special tokens
6
+ the notebook used)
7
+ - runs the FROZEN encoder forward pass
8
+ - returns last_hidden_state (float16, base64-encoded), the attention
9
+ mask, and the cleaned subword tokens used for signal extraction
10
+
11
+ Designed for the FREE CPU tier on HF Spaces. The encoder weights load
12
+ once at Space startup; subsequent requests are just forward passes.
13
+
14
+ Call from Python:
15
+ from gradio_client import Client
16
+ client = Client("YOUR_USERNAME/pseudoscorex-encoder")
17
+ out = client.predict("hello world", api_name="/encode")
18
+ """
19
+ import base64
20
+ import os
21
+
22
+ import gradio as gr
23
+ import numpy as np
24
+ import torch
25
+ from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
26
+
27
+
28
# Checkpoint id and tokenised sequence length; each can be overridden through
# the environment variable of the same name.
ENCODER_NAME = os.getenv("ENCODER_NAME", "Salesforce/codet5-large")
MAX_LENGTH = int(os.getenv("MAX_LENGTH", "512"))

# ── Boot: load tokenizer + frozen encoder once ──────────────────────────────
# Everything below executes at import time, i.e. once per process.
print(f"[boot] Loading tokenizer: {ENCODER_NAME}")
tokenizer = AutoTokenizer.from_pretrained(ENCODER_NAME)
# Register the two marker tokens as special tokens.
tokenizer.add_tokens(["<criterion>", "<score>"], special_tokens=True)

print(f"[boot] Loading encoder: {ENCODER_NAME}")
full_model = AutoModelForSeq2SeqLM.from_pretrained(ENCODER_NAME)
encoder = full_model.encoder
# Resize the embedding table to the tokenizer's current vocabulary size.
encoder.resize_token_embeddings(len(tokenizer))
encoder.eval()
# Inference only: disable gradient tracking on every encoder parameter.
encoder.requires_grad_(False)
del full_model  # decoder unused
print("[boot] Encoder ready.")


# Token strings treated as special. Not referenced elsewhere in the visible
# part of this file.
SPECIAL_TOKENS = {"", "<s>", "</s>", "<pad>", "<criterion>", "<score>"}
48
+
49
+
50
def _decode_clean_tokens(text: str):
    """Mirrors model/signals.py::decode_clean_tokens on the server.

    Tokenises ``text`` (truncated to ``MAX_LENGTH`` ids), converts the ids
    back to subword strings, and returns them with the tokenizer's special
    tokens dropped and word-boundary markers stripped.

    Args:
        text: Raw input text.

    Returns:
        A list of the non-empty cleaned subword strings, in tokenisation
        order.
    """
    ids = tokenizer(text, max_length=MAX_LENGTH, truncation=True)["input_ids"]
    # NOTE(review): only tokenizer.all_special_tokens is consulted here; confirm
    # whether tokens registered through add_tokens() are meant to be filtered.
    special = set(tokenizer.all_special_tokens)

    # The boundary markers are written as escapes so the literals cannot be
    # corrupted by an encoding round-trip: U+2581 is the SentencePiece
    # word-start marker, U+0120 the byte-level BPE leading-space marker.
    stripped = (
        tok.replace("\u2581", "").replace("\u0120", "").strip()
        for tok in tokenizer.convert_ids_to_tokens(ids)
        if tok not in special
    )
    # Tokens that were only whitespace or only a marker collapse to "" above
    # and are dropped here.
    return [piece for piece in stripped if piece]
63
+
64
+
65
@torch.no_grad()
def encode(text: str):
    """
    Run the frozen encoder over ``text`` and package the result for JSON.

    The text is tokenised to exactly MAX_LENGTH positions (truncated or
    padded), passed through the encoder, and the hidden states of the single
    batch item are cast to float16 and base64-encoded.

    Returns a JSON-serialisable dict:
      {
        "hidden_b64":     <base64 string of the float16 array bytes>,
        "shape":          [seq_len, hidden_dim],
        "attention_mask": [int, ...],   # length = seq_len
        "clean_tokens":   [str, ...],   # for signal extraction
      }

    Raises gr.Error when ``text`` is not a string or is blank.
    """
    if not (isinstance(text, str) and text.strip()):
        raise gr.Error("text must be a non-empty string")

    batch = tokenizer(
        text,
        return_tensors="pt",
        padding="max_length",
        truncation=True,
        max_length=MAX_LENGTH,
    )

    # Batch size is 1, so index 0 selects the (seq_len, hidden_dim) matrix.
    states = encoder(**batch).last_hidden_state[0]
    half = states.cpu().numpy().astype(np.float16)

    payload = base64.b64encode(half.tobytes()).decode("ascii")
    dims = list(half.shape)
    mask = batch["attention_mask"][0].cpu().tolist()
    return {
        "hidden_b64": payload,
        "shape": dims,
        "attention_mask": mask,
        "clean_tokens": _decode_clean_tokens(text),
    }
95
+
96
+
97
+ # ── Gradio UI + API ────────────────────────────────────────────────────────
98
# Single-page UI: one text box, one JSON output, one button. The click
# handler is also published as the named API endpoint "encode".
with gr.Blocks(title="pseudoscore-x encoder") as demo:
    gr.Markdown(
        "# pseudoscore-x encoder\n"
        "CodeT5-large encoder with `<criterion>` and `<score>` special tokens.\n"
        "Use the **/encode** API endpoint from your backend."
    )
    # The trailing ellipsis (U+2026) is written as an escape so the
    # placeholder cannot be corrupted by an encoding round-trip.
    inp = gr.Textbox(
        label="Text",
        lines=4,
        placeholder="Paste text to encode\u2026",
    )
    out = gr.JSON(label="Encoded output")
    btn = gr.Button("Encode")
    btn.click(fn=encode, inputs=inp, outputs=out, api_name="encode")

if __name__ == "__main__":
    # Queue holds at most 8 pending requests; serve on all interfaces, port 7860.
    demo.queue(max_size=8).launch(server_name="0.0.0.0", server_port=7860)
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ transformers==4.38.2
2
+ sentencepiece
3
+ torch>=2.0,<3.0
4
+ gradio>=4.36
5
+ numpy
6
+ protobuf