Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re, numpy as np, torch, gradio as gr
|
| 2 |
+
from transformers import AutoTokenizer, AutoModelForCausalLM
|
| 3 |
+
from scipy.ndimage import uniform_filter1d
|
| 4 |
+
|
| 5 |
+
MODEL = "facebook/xglm-564M"  # upgrade to xglm-2.9B if you get a GPU Space
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# One-time load at import: tokenizer + causal LM, moved to DEVICE in eval mode.
# NOTE(review): runs at module import, so the Space blocks until the download
# and weight load finish.
tok = AutoTokenizer.from_pretrained(MODEL)
lm = AutoModelForCausalLM.from_pretrained(MODEL).to(DEVICE).eval()
|
| 10 |
+
|
| 11 |
+
def split_words(text: str):
    """Tokenize *text* into word chunks and single punctuation marks.

    A token is either a maximal run of word characters or one non-word,
    non-space character, so punctuation appears as its own list item.
    Order follows the original text.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return [match.group(0) for match in token_pattern.finditer(text)]
|
| 13 |
+
|
| 14 |
+
@torch.inference_mode()
def word_surprisal(text: str):
    """Compute per-word surprisal (summed subword NLL) under the causal LM.

    Tokenizes *text* with offset mapping, does one forward pass, and sums
    each subword token's negative log-likelihood into the word whose
    character span fully contains the token's span.

    Returns:
        (words, w_scores): the word list from split_words() and an aligned
        float array of summed surprisal scores (0.0 for words that
        received no token).
    """
    enc = tok(text, return_tensors="pt", return_offsets_mapping=True)
    ids = enc["input_ids"].to(DEVICE)
    # Per-token (start, end) character offsets into *text*.
    offs = enc["offset_mapping"][0].tolist()

    out = lm(ids)
    # Causal shift: logits at position t predict token t+1.
    logits = out.logits[:, :-1, :]
    targets = ids[:, 1:]
    logp = torch.log_softmax(logits, dim=-1)
    ll = logp.gather(-1, targets.unsqueeze(-1)).squeeze(-1)  # [1, T-1]
    nll = (-ll).squeeze(0).cpu().numpy()
    # Offsets aligned with the shifted targets (the first token gets no score).
    sub_offs = offs[1:]

    words = split_words(text)
    # char spans for words
    spans = []
    pos = 0
    for w in words:
        # Words came from *text* itself, so find() from pos always succeeds.
        start = text.find(w, pos)
        spans.append((start, start+len(w)))
        pos = start+len(w)

    w_scores = np.zeros(len(words), dtype=float)
    for s,(a,b) in zip(nll, sub_offs):
        if a==b: continue  # zero-width offset => special token; skip
        for i,(ws,we) in enumerate(spans):
            # NOTE(review): a token counts only if its span sits fully inside
            # a word span; tokens whose offsets include leading whitespace
            # (common with SentencePiece offset mappings) would be silently
            # dropped — confirm against this tokenizer's offsets.
            if a>=ws and b<=we:
                w_scores[i] += float(s); break
    return words, w_scores
|
| 44 |
+
|
| 45 |
+
def robust_threshold(scores, k=2.5):
    """Return an outlier cutoff of median + k * MAD over *scores*.

    Uses the median absolute deviation (padded by 1e-8 so a constant
    score vector still yields a finite threshold). An empty input gets
    a huge sentinel so that nothing is ever flagged.
    """
    if len(scores) == 0:
        return 1e9
    center = float(np.median(scores))
    spread = float(np.median(np.abs(scores - center))) + 1e-8
    return center + k * spread
|
| 51 |
+
|
| 52 |
+
def infer(text, k=2.5, smooth=2):
    """Score words in *text*, flag surprisal outliers, render results.

    Args:
        text: input sentence(s).
        k: outlier cutoff multiplier (threshold = median + k * MAD).
        smooth: moving-average window over word scores; <= 1 disables.

    Returns:
        (html, table): HTML string with flagged words wrapped in a red
        <mark>, and a list of (word, score) pairs for the dataframe.
    """
    words, scores = word_surprisal(text)
    if smooth and smooth > 1:
        # Light smoothing so one noisy subword doesn't flag a lone word.
        scores = uniform_filter1d(scores, size=smooth, mode="nearest")
    thr = robust_threshold(scores, k=k)
    # Set instead of list: O(1) membership in the render loop below.
    flagged = {i for i, s in enumerate(scores) if s >= thr}

    out = []
    for i, w in enumerate(words):
        out.append(f"<mark style='background:#ffb3b3'>{w}</mark>" if i in flagged else w)
    # BUG FIX: the original replaced " ." with " ." and " ," with " ,"
    # (both no-ops). Reattach punctuation to the preceding word as intended.
    md = " ".join(out).replace(" .", ".").replace(" ,", ",")
    table = [(w, float(s)) for w, s in zip(words, scores)]
    return md, table
|
| 65 |
+
|
| 66 |
+
# Gradio UI: one textbox + two tuning sliders in, highlighted HTML and a
# per-word score table out. (Textbox label "Cümle girin" is Turkish for
# "Enter a sentence" — kept verbatim as it is user-facing.)
demo = gr.Interface(
    fn=infer,
    inputs=[
        gr.Textbox(lines=3, label="Cümle girin"),
        gr.Slider(1.5, 4.0, value=2.5, step=0.1, label="Threshold (median + k*MAD)"),
        gr.Slider(1, 7, value=2, step=1, label="Smoothing window (words)")
    ],
    outputs=[
        gr.HTML(label="Highlighted"),
        gr.Dataframe(headers=["Word","Score"], label="Word surprisal (lower=more normal)")]
    ,
    title="Odd-Word Detector — XGLM (autoregressive)",
    description="Autoregressive next-token likelihood per word; flags unusually unlikely words."
)
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
    # Start the Gradio server when executed as a script (Spaces entry point).
    demo.launch()
|