batheand committed on
Commit
1de6f95
·
verified ·
1 Parent(s): a93692a

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re, numpy as np, torch, gradio as gr
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM
3
+ from scipy.ndimage import uniform_filter1d
4
+
5
# Model checkpoint and device selection for the surprisal scorer.
MODEL = "facebook/xglm-564M" # upgrade to xglm-2.9B if you get a GPU Space
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Loaded once at import time (downloads weights on first run).
# eval() disables dropout so scores are deterministic.
tok = AutoTokenizer.from_pretrained(MODEL)
lm = AutoModelForCausalLM.from_pretrained(MODEL).to(DEVICE).eval()
10
+
11
def split_words(text: str):
    """Split *text* into word tokens and individual punctuation marks.

    A token is either a maximal run of word characters or a single
    character that is neither a word character nor whitespace.
    """
    token_pattern = re.compile(r"\w+|[^\w\s]", re.UNICODE)
    return token_pattern.findall(text)
13
+
14
@torch.inference_mode()
def word_surprisal(text: str):
    """Return (words, scores): per-word summed surprisal under the LM.

    Tokenizes *text*, runs one forward pass of the causal LM, computes
    each subword token's negative log-likelihood, and sums it into the
    word whose character span fully contains that token's span.
    Higher score = less likely word.
    """
    # Offset mapping requires a fast tokenizer; gives (start, end) char
    # spans per subword token.
    enc = tok(text, return_tensors="pt", return_offsets_mapping=True)
    ids = enc["input_ids"].to(DEVICE)
    offs = enc["offset_mapping"][0].tolist()

    out = lm(ids)
    # Shift for autoregressive scoring: logits at position t predict
    # token t+1, so drop the last logit and the first target.
    logits = out.logits[:, :-1, :]
    targets = ids[:, 1:]
    logp = torch.log_softmax(logits, dim=-1)
    ll = logp.gather(-1, targets.unsqueeze(-1)).squeeze(-1) # [1, T-1]
    nll = (-ll).squeeze(0).cpu().numpy()  # surprisal per predicted token
    sub_offs = offs[1:]  # offsets aligned with `targets` (first token dropped)

    words = split_words(text)
    # char spans for words
    spans = []
    pos = 0
    for w in words:
        # `pos` advances past each match so repeated words map to
        # successive occurrences rather than the first one.
        start = text.find(w, pos)
        spans.append((start, start+len(w)))
        pos = start+len(w)

    w_scores = np.zeros(len(words), dtype=float)
    for s,(a,b) in zip(nll, sub_offs):
        if a==b: continue  # empty (0, 0) spans, e.g. special tokens — skip
        for i,(ws,we) in enumerate(spans):
            # NOTE(review): requires the token span to sit fully inside a
            # word span. Tokens that straddle a word boundary, or whose
            # offsets include a leading space (tokenizer-dependent), are
            # silently dropped — TODO confirm for XGLM's tokenizer.
            if a>=ws and b<=we:
                w_scores[i] += float(s); break
    return words, w_scores
44
+
45
def robust_threshold(scores, k=2.5):
    """Outlier cutoff at median + k * MAD (median absolute deviation).

    Robust to a few extreme values, unlike mean/std. For empty input a
    huge sentinel is returned so that nothing gets flagged.
    """
    if len(scores) == 0:
        return 1e9
    center = float(np.median(scores))
    deviations = np.abs(scores - center)
    # Tiny epsilon keeps the threshold strictly above the median even
    # when MAD is exactly zero (e.g. constant scores).
    spread = float(np.median(deviations)) + 1e-8
    return center + k * spread
51
+
52
def infer(text, k=2.5, smooth=2):
    """Highlight unusually unlikely ("odd") words in *text*.

    Args:
        text: Input sentence.
        k: Multiplier for the median + k*MAD outlier threshold.
        smooth: Moving-average window in words; values <= 1 disable smoothing.

    Returns:
        (html, table): HTML with flagged words wrapped in <mark>, plus a
        list of (word, score) rows for the dataframe output.
    """
    words, scores = word_surprisal(text)
    if smooth and smooth > 1:
        # Light smoothing so a single noisy token doesn't dominate.
        scores = uniform_filter1d(scores, size=smooth, mode="nearest")
    thr = robust_threshold(scores, k=k)
    # Set instead of list: O(1) membership checks in the loop below.
    flagged = {i for i, s in enumerate(scores) if s >= thr}

    out = []
    for i, w in enumerate(words):
        out.append(f"<mark style='background:#ffb3b3'>{w}</mark>" if i in flagged else w)
    # BUG FIX: the original replace(" .", " .") / replace(" ,", " ,")
    # replaced each string with itself (no-ops). The intent — reattach
    # punctuation to the preceding word — is restored here.
    md = " ".join(out).replace(" .", ".").replace(" ,", ",")
    table = [(w, float(s)) for w, s in zip(words, scores)]
    return md, table
65
+
66
# Gradio UI: one textbox plus two tuning sliders in; highlighted HTML and
# a per-word score table out.
demo = gr.Interface(
    fn=infer,
    inputs=[
        # "Cümle girin" is Turkish for "Enter a sentence" (runtime label,
        # kept as-is).
        gr.Textbox(lines=3, label="Cümle girin"),
        gr.Slider(1.5, 4.0, value=2.5, step=0.1, label="Threshold (median + k*MAD)"),
        gr.Slider(1, 7, value=2, step=1, label="Smoothing window (words)")
    ],
    outputs=[
        gr.HTML(label="Highlighted"),
        gr.Dataframe(headers=["Word","Score"], label="Word surprisal (lower=more normal)")]
    ,
    title="Odd-Word Detector — XGLM (autoregressive)",
    description="Autoregressive next-token likelihood per word; flags unusually unlikely words."
)

# Launch the web app when run as a script (Spaces entry point).
if __name__ == "__main__":
    demo.launch()