OpenTransformer committed on
Commit a0ef586 · verified · 1 Parent(s): ba4f115

Upload app.py with huggingface_hub

Files changed (1)
  1. app.py +455 -0
app.py ADDED
@@ -0,0 +1,455 @@
+ #!/usr/bin/env python3
+ """
+ Markov Chain Language Model — Interactive Demo
+ OpenTransformers Ltd | Part of AGILLM Research
+
+ Classical n-gram LM with Modified Kneser-Ney smoothing.
+ GPU hash tables (sorted int64 + searchsorted) for parallel inference.
+ Runs on CPU for HF Spaces compatibility.
+ """
+
+ import os, sys, math, time, pickle, gc
+ from pathlib import Path
+ from collections import defaultdict
+ from typing import Dict, List, Optional, Tuple
+
+ import torch
+ import torch.nn.functional as F
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+
+ # ─── Force CPU for HF Spaces ───
+ DEV = torch.device("cpu")
+
+ # ─── Tokenizer ───
+ TOKENIZER_ID = os.environ.get("TOKENIZER_ID", "gpt2")
+
+ def _load_tokenizer():
+     from transformers import AutoTokenizer, logging as hf_log
+     hf_log.set_verbosity_error()
+     t = AutoTokenizer.from_pretrained(TOKENIZER_ID, use_fast=True)
+     if t.pad_token is None:
+         t.add_special_tokens({"pad_token": "<|pad|>"})
+     return t
+
+ tok = _load_tokenizer()
+ VOCAB = max(tok.get_vocab().values()) + 1
+ EOS = tok.eos_token_id if tok.eos_token_id is not None else tok.sep_token_id
+
+ # ─── FNV-1a Hashing ───
+ FNV_OFFSET = 14695981039346656037
+ FNV_PRIME = 1099511628211
+ MASK64 = (1 << 64) - 1
+ INT64_MAX = (1 << 63) - 1
+ INT64_WRAP = 1 << 64
+ FNV_OFFSET_S = FNV_OFFSET - INT64_WRAP
+
+ def _hash_ngram_batch_gpu(contexts: torch.Tensor) -> torch.Tensor:
+     B, N = contexts.shape
+     h = torch.full((B,), FNV_OFFSET_S, dtype=torch.int64, device=contexts.device)
+     for i in range(N):
+         h = h ^ contexts[:, i]
+         h = h * FNV_PRIME
+     return h
+
+
+ class GPUHashTable:
+     """Immutable hash table using sorted int64 keys + searchsorted."""
+
+     def __init__(self):
+         self.hashes: Optional[torch.Tensor] = None
+         self.counts: Optional[torch.Tensor] = None
+         self.continuation_counts: Optional[torch.Tensor] = None
+         self.total: int = 0
+         self.size: int = 0
+
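+     # Worked example with hypothetical values: if hashes = [3, 7, 9] (sorted)
+     # and counts = [5, 2, 1], then batch_lookup(torch.tensor([7, 4])) returns
+     # tensor([2, 0]): searchsorted gives insertion indices [1, 1]; hashes[1] == 7
+     # matches the first query but not the second, which falls back to zero.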
+     def batch_lookup(self, hashes: torch.Tensor) -> torch.Tensor:
+         if self.size == 0:
+             return torch.zeros_like(hashes)
+         idx = torch.searchsorted(self.hashes, hashes).clamp(0, self.size - 1)
+         found = (self.hashes[idx] == hashes)
+         return torch.where(found, self.counts[idx], torch.zeros_like(hashes))
+
+     def batch_lookup_continuations(self, hashes: torch.Tensor) -> torch.Tensor:
+         if self.size == 0 or self.continuation_counts is None:
+             return torch.zeros_like(hashes)
+         idx = torch.searchsorted(self.hashes, hashes).clamp(0, self.size - 1)
+         found = (self.hashes[idx] == hashes)
+         return torch.where(found, self.continuation_counts[idx], torch.zeros_like(hashes))
+
+     def memory_bytes(self) -> int:
+         total = 0
+         for t in [self.hashes, self.counts, self.continuation_counts]:
+             if t is not None:
+                 total += t.nelement() * t.element_size()
+         return total
+
+
+ def _hash_ngram_py(ngram: tuple) -> int:
+     h = FNV_OFFSET
+     for t in ngram:
+         h ^= (t & MASK64)
+         h = (h * FNV_PRIME) & MASK64
+     return h if h <= INT64_MAX else h - INT64_WRAP
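+ # Tables are built with _hash_ngram_py but queried with _hash_ngram_batch_gpu,
+ # so the two must agree bit-for-bit (signed int64 multiplication wraps mod 2**64,
+ # matching the masked unsigned arithmetic above). Illustrative sanity check:
+ #   assert _hash_ngram_py((1, 2, 3)) == _hash_ngram_batch_gpu(
+ #       torch.tensor([[1, 2, 3]], dtype=torch.int64)).item()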
+
+
+ class MarkovLM:
+     """N-gram LM with Modified Kneser-Ney smoothing."""
+
+     def __init__(self, max_order: int = 5):
+         self.max_order = max_order
+         self.cpu_counts: List[Dict] = []
+         self.total_tokens = 0
+         self.tokens_trained = 0
+         self.gpu_ngram_tables: List[Optional[GPUHashTable]] = [None] * max_order
+         self.gpu_context_tables: List[Optional[GPUHashTable]] = [None] * max_order
+         self.frozen = False
+         self.discounts: List[Tuple[float, float, float]] = [(0.5, 1.0, 1.5)] * max_order
+         self.gpu_unigram_probs: Optional[torch.Tensor] = None
+
+     def freeze(self, device=DEV, prune_threshold: int = 1):
+         print(f"[freeze] Building hash tables on {device}...")
+         t0 = time.time()
+
+         for order in range(self.max_order):
+             d = self.cpu_counts[order]
+             if not d:
+                 self.gpu_ngram_tables[order] = GPUHashTable()
+                 self.gpu_context_tables[order] = GPUHashTable()
+                 continue
+
+             # Build ngram table
+             entries = []
+             for ctx, nexts in d.items():
+                 for next_tok, cnt in nexts.items():
+                     if cnt >= prune_threshold or order <= 1:
+                         key = ctx + (next_tok,)
+                         h = _hash_ngram_py(key)
+                         entries.append((h, cnt))
+
+             gt = GPUHashTable()
+             if entries:
+                 entries.sort(key=lambda x: x[0])
+                 gt.hashes = torch.tensor([e[0] for e in entries], dtype=torch.int64, device=device)
+                 gt.counts = torch.tensor([e[1] for e in entries], dtype=torch.int64, device=device)
+                 gt.total = sum(e[1] for e in entries)
+                 gt.size = len(entries)
+             self.gpu_ngram_tables[order] = gt
+
+             # Build context table
+             ctx_entries = []
+             for ctx, nexts in d.items():
+                 h = _hash_ngram_py(ctx)
+                 total = sum(nexts.values())
+                 n_unique = len(nexts)
+                 ctx_entries.append((h, total, n_unique))
+
+             ct = GPUHashTable()
+             if ctx_entries:
+                 ctx_entries.sort(key=lambda x: x[0])
+                 ct.hashes = torch.tensor([e[0] for e in ctx_entries], dtype=torch.int64, device=device)
+                 ct.counts = torch.tensor([e[1] for e in ctx_entries], dtype=torch.int64, device=device)
+                 ct.continuation_counts = torch.tensor([e[2] for e in ctx_entries], dtype=torch.int64, device=device)
+                 ct.total = sum(e[1] for e in ctx_entries)
+                 ct.size = len(ctx_entries)
+             self.gpu_context_tables[order] = ct
+
+             n_ent = gt.size
+             mem = (gt.memory_bytes() + ct.memory_bytes()) / 1e6
+             print(f" {order+1}-gram: {n_ent:,} entries | {mem:.1f} MB")
+
+         # Estimate KN discounts
+         self._estimate_discounts()
+
+         # Unigram probs
+         if self.cpu_counts[0]:
+             uni = self.cpu_counts[0].get((), {})
+             probs = torch.zeros(VOCAB, dtype=torch.float32, device=device)
+             total = sum(uni.values())
+             if total > 0:
+                 for tok_id, cnt in uni.items():
+                     if 0 <= tok_id < VOCAB:
+                         probs[tok_id] = cnt / total
+             self.gpu_unigram_probs = probs
+
+         # Free CPU dicts
+         self.cpu_counts = []
+         gc.collect()
+
+         self.frozen = True
+         elapsed = time.time() - t0
+         gpu_mem = sum(
+             (gt.memory_bytes() if gt else 0) + (ct.memory_bytes() if ct else 0)
+             for gt, ct in zip(self.gpu_ngram_tables, self.gpu_context_tables)
+         )
+         print(f"[freeze] Done in {elapsed:.1f}s | {gpu_mem/1e6:.1f} MB total")
+
+     def _estimate_discounts(self):
+         for order in range(self.max_order):
+             gt = self.gpu_ngram_tables[order]
+             if gt is None or gt.size == 0:
+                 continue
+             counts = gt.counts
+             n1 = (counts == 1).sum().item()
+             n2 = (counts == 2).sum().item()
+             n3 = (counts == 3).sum().item()
+             n4 = (counts == 4).sum().item()
+             if n1 == 0 or n2 == 0:
+                 continue
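+             # Chen-Goodman estimates from the count-of-counts n_k (the number of
+             # n-grams seen exactly k times): Y = n1 / (n1 + 2*n2) and
+             # D_k ≈ k - (k+1) * Y * n_(k+1) / n_k, clamped to safe ranges below.
+             # (Hypothetical example: n1=100, n2=40 gives Y ≈ 0.56 and D1 ≈ 0.56.)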
+             Y = n1 / (n1 + 2 * n2)
+             D1 = max(0.01, min(1 - 2 * Y * (n2 / max(n1, 1)), 0.99))
+             D2 = max(D1, min(2 - 3 * Y * (n3 / max(n2, 1)), 1.99))
+             D3 = max(D2, min(3 - 4 * Y * (n4 / max(n3, 1)), 2.99))
+             self.discounts[order] = (D1, D2, D3)
+
+     @torch.no_grad()
+     def batch_log_probs(self, contexts: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
+         B = targets.shape[0]
+         device = targets.device
+         log_probs = torch.full((B,), math.log(1.0 / VOCAB), dtype=torch.float32, device=device)
+
+         for order in range(self.max_order):
+             gt = self.gpu_ngram_tables[order]
+             ct = self.gpu_context_tables[order]
+             if gt is None or ct is None or gt.size == 0:
+                 continue
+
+             ctx_len = order
+             if ctx_len == 0:
+                 if self.gpu_unigram_probs is not None:
+                     safe_t = targets.clamp(0, VOCAB - 1)
+                     uni_p = self.gpu_unigram_probs[safe_t]
+                     valid = uni_p > 0
+                     log_probs = torch.where(valid, torch.log(uni_p + 1e-30), log_probs)
+                 continue
+
+             if ctx_len > contexts.shape[1]:
+                 continue
+
+             ctx = contexts[:, -ctx_len:]
+             has_ctx = (ctx >= 0).all(dim=1)
+             if not has_ctx.any():
+                 continue
+
+             full_ngram = torch.cat([ctx, targets.unsqueeze(1)], dim=1)
+             ngram_counts = gt.batch_lookup(_hash_ngram_batch_gpu(full_ngram)).float()
+             ctx_hashes = _hash_ngram_batch_gpu(ctx)
+             ctx_totals = ct.batch_lookup(ctx_hashes).float()
+             ctx_uniques = ct.batch_lookup_continuations(ctx_hashes).float()
+
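+             # Interpolated Kneser-Ney step:
+             #   p(w | ctx) = max(c(ctx, w) - D, 0) / c(ctx) + gamma(ctx) * p_shorter(w)
+             # where p_shorter is the estimate accumulated from the lower orders.
+             # Note: gamma uses a single discount (D3 * unique continuations), a
+             # simplification of full modified KN, which would weight the N1/N2/N3+
+             # continuation counts by D1/D2/D3 separately.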
+             D1, D2, D3 = self.discounts[order]
+             discount = torch.where(ngram_counts >= 3, D3,
+                                    torch.where(ngram_counts >= 2, D2,
+                                                torch.where(ngram_counts >= 1, D1, 0.0)))
+
+             numerator = (ngram_counts - discount).clamp(min=0)
+             denominator = ctx_totals.clamp(min=1)
+             gamma = (D3 * ctx_uniques) / denominator
+             gamma = gamma.clamp(0, 1)
+             p_lower = log_probs.exp()
+             p_combined = numerator / denominator + gamma * p_lower
+             valid = has_ctx & (ctx_totals > 0)
+             log_probs = torch.where(valid, torch.log(p_combined.clamp(min=1e-30)), log_probs)
+
+         return log_probs
+
+     @torch.no_grad()
+     def generate(self, prompt: str, max_new: int = 200, temperature: float = 0.8,
+                  top_k: int = 50, top_p: float = 0.9):
+         assert self.frozen, "Call freeze() first"
+         ctx_len = self.max_order - 1
+         ids = tok.encode(prompt)
+         n_cands = min(top_k * 10, VOCAB)
+
+         if self.gpu_unigram_probs is not None:
+             _, candidates = self.gpu_unigram_probs.topk(n_cands)
+         else:
+             candidates = torch.arange(n_cands, device=DEV)
+
+         for _ in range(max_new):
+             if len(ids) >= ctx_len:
+                 ctx = ids[-ctx_len:]
+             else:
+                 ctx = [-1] * (ctx_len - len(ids)) + ids
+
+             ctx_t = torch.tensor([ctx], dtype=torch.int64, device=DEV).expand(n_cands, ctx_len)
+             log_probs = self.batch_log_probs(ctx_t, candidates)
+
+             probs = (log_probs / max(temperature, 1e-8)).softmax(0)
+
+             if top_k > 0 and top_k < n_cands:
+                 vals, idx = probs.topk(top_k)
+                 mask = torch.zeros_like(probs)
+                 mask.scatter_(0, idx, vals)
+                 probs = mask
+
+             if top_p < 1.0:
+                 sp, si = probs.sort(descending=True)
+                 cum = sp.cumsum(0)
+                 cutoff = cum > top_p
+                 cutoff[0] = False
+                 sp[cutoff] = 0
+                 probs = torch.zeros_like(probs).scatter_(0, si, sp)
+
+             if probs.sum() == 0:
+                 next_tok = candidates[0].item()
+             else:
+                 probs = probs / probs.sum()
+                 next_tok = candidates[probs.multinomial(1).item()].item()
+
+             ids.append(next_tok)
+             if next_tok == EOS:
+                 break
+
+         return tok.decode(ids, skip_special_tokens=True)
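+
+     # Minimal usage sketch (illustrative; the Space wires this up via
+     # generate_text below):
+     #   model = MarkovLM.load("markov_5gram")  # load() also freezes the tables
+     #   print(model.generate("The meaning of life is", max_new=50))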
+
+     @classmethod
+     def load(cls, path: str) -> 'MarkovLM':
+         p = Path(path)
+         for suffix in ['.cpu.pkl', '.pkl']:
+             candidate = p.with_suffix(suffix)
+             if candidate.exists():
+                 p = candidate
+                 break
+
+         print(f"[load] {p} ({p.stat().st_size / 1e6:.1f} MB)...")
+         with open(p, 'rb') as f:
+             data = pickle.load(f)
+
+         model = cls(max_order=data['max_order'])
+         model.total_tokens = data['total_tokens']
+         model.tokens_trained = data['tokens_trained']
+         model.discounts = data.get('discounts', model.discounts)
+
+         for order in range(model.max_order):
+             raw = data['cpu_counts'][order]
+             dd = defaultdict(lambda: defaultdict(int))
+             for ctx, nexts in raw.items():
+                 dd[ctx] = defaultdict(int, nexts)
+             model.cpu_counts.append(dd)
+
+         total_entries = sum(
+             sum(len(v) for v in model.cpu_counts[o].values())
+             for o in range(model.max_order)
+         )
+         print(f"[load] {model.max_order}-gram | {model.tokens_trained:,} tokens | {total_entries:,} entries")
+         print("[load] Freezing to CPU...")
+         model.freeze(device=DEV)
+         return model
+
+
+ # ─── Load model at startup ───
+ print("Downloading model from HuggingFace...")
+ model_path = hf_hub_download(
+     repo_id="OpenTransformer/markov-5gram-500m",
+     filename="markov_5gram.cpu.pkl",
+     cache_dir="/tmp/markov_cache"
+ )
+ print(f"Loading model from {model_path}...")
+ MODEL = MarkovLM.load(model_path)
+ print("Model ready!")
+
+
+ # ─── Gradio Interface ───
+ def generate_text(prompt, max_tokens, temperature, top_k, top_p):
+     if not prompt.strip():
+         return "Please enter a prompt."
+     t0 = time.time()
+     result = MODEL.generate(
+         prompt=prompt,
+         max_new=int(max_tokens),
+         temperature=float(temperature),
+         top_k=int(top_k),
+         top_p=float(top_p),
+     )
+     elapsed = time.time() - t0
+     gen_tokens = len(tok.encode(result)) - len(tok.encode(prompt))
+     stats = f"\n\n---\n*Generated {gen_tokens} tokens in {elapsed:.2f}s ({gen_tokens/max(elapsed,0.01):.0f} tok/s)*"
+     return result + stats
+
+
+ def get_model_info():
+     total_entries = sum(
+         gt.size for gt in MODEL.gpu_ngram_tables if gt
+     )
+     mem = sum(
+         (gt.memory_bytes() if gt else 0) + (ct.memory_bytes() if ct else 0)
+         for gt, ct in zip(MODEL.gpu_ngram_tables, MODEL.gpu_context_tables)
+     ) / 1e6
+
+     info = f"""## Model Statistics
+ - **Architecture**: {MODEL.max_order}-gram with Modified Kneser-Ney smoothing
+ - **Tokens trained**: {MODEL.tokens_trained:,}
+ - **Total n-gram entries**: {total_entries:,}
+ - **Memory usage**: {mem:.1f} MB
+ - **Tokenizer**: GPT-2 ({VOCAB:,} vocab)
+ - **Inference**: CPU (searchsorted batch lookup)
+
+ ### How it works
+ This is a classical n-gram language model — no neural network, no parameters to learn via gradient descent.
+ Instead, it counts how often sequences of tokens appear in the training data and uses those counts
+ to predict the next token. Kneser-Ney smoothing interpolates between different context lengths
+ (unigram through {MODEL.max_order}-gram) so that even unseen contexts get reasonable predictions.
+
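+ Concretely, each order contributes a discounted estimate interpolated with the
+ shorter-context one: `p(w|ctx) = max(count(ctx,w) - D, 0)/count(ctx) + gamma(ctx) * p(w|shorter ctx)`,
+ where D is a discount estimated from count-of-counts statistics and gamma
+ reserves the discounted probability mass for backing off to shorter contexts.
+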
+ The n-gram counts are stored in sorted hash tables and looked up via binary search (`torch.searchsorted`),
+ making inference parallel and efficient even on CPU.
+
+ ### Per-order breakdown"""
+
+     for order in range(MODEL.max_order):
+         gt = MODEL.gpu_ngram_tables[order]
+         if gt and gt.size > 0:
+             D1, D2, D3 = MODEL.discounts[order]
+             info += f"\n- **{order+1}-gram**: {gt.size:,} entries (D1={D1:.3f}, D2={D2:.3f}, D3+={D3:.3f})"
+
+     info += "\n\n*Trained by [OpenTransformers Ltd](https://huggingface.co/OpenTransformer). Part of AGILLM research.*"
+     return info
+
+
+ with gr.Blocks(
+     title="Markov Chain LM — OpenTransformers",
+     theme=gr.themes.Base(primary_hue="blue", neutral_hue="slate"),
+ ) as demo:
+     gr.Markdown("""# Markov Chain Language Model
+ ### Classical N-gram LM with Modified Kneser-Ney Smoothing
+ *No neural network — pure statistical language modelling. [OpenTransformers Ltd](https://huggingface.co/OpenTransformer)*
+ """)
+
+     with gr.Row():
+         with gr.Column(scale=3):
+             prompt = gr.Textbox(
+                 label="Prompt",
+                 placeholder="Enter text to continue...",
+                 lines=3,
+                 value="The meaning of life is"
+             )
+             output = gr.Markdown(label="Generated Text")
+             generate_btn = gr.Button("Generate", variant="primary", size="lg")
+
+         with gr.Column(scale=1):
+             max_tokens = gr.Slider(10, 500, value=200, step=10, label="Max tokens")
+             temperature = gr.Slider(0.1, 2.0, value=0.8, step=0.1, label="Temperature")
+             top_k = gr.Slider(1, 200, value=50, step=1, label="Top-K")
+             top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-P")
+
+     generate_btn.click(
+         fn=generate_text,
+         inputs=[prompt, max_tokens, temperature, top_k, top_p],
+         outputs=output,
+     )
+
+     with gr.Accordion("Model Information", open=False):
+         gr.Markdown(get_model_info())
+
+     gr.Examples(
+         examples=[
+             ["The meaning of life is"],
+             ["In the beginning, there was"],
+             ["The president of the United States"],
+             ["Machine learning is a field of"],
+             ["Once upon a time in a land far away"],
+             ["The quick brown fox"],
+         ],
+         inputs=prompt,
+     )
+
+ if __name__ == "__main__":
+     demo.launch()