| import gradio as gr |
| import numpy as np |
| import pickle |
| import gzip |
| from mlx_lm import load, generate as mlx_generate |
|
|
# Load the quantized base model, then the separately shipped bit-packed weights.
# NOTE(review): banner says "1bit" but the repo id says "4bit" — confirm which
# checkpoint is actually intended here.
print("Loading Shannonstral-7B-1bit ghost model...")
model, tokenizer = load("hunterbown/Shannonstral-7B-4bit")

# Bit-packed weight payload produced by the offline compression pipeline.
with gzip.open('weights_packed.pkl.gz', 'rb') as fh:
    packed_weights = pickle.load(fh)
|
|
| import mlx.core as mx |
| from mlx.utils import tree_flatten, tree_unflatten |
|
|
# Dequantize each packed tensor: 1 bit per weight, sign-magnitude with a
# per-tensor scale — bit b maps to (2b - 1) * scale, i.e. ±scale.
weights = {}
for key, data in packed_weights.items():
    # Only entries shaped like {'packed': ..., 'shape': ..., 'scale': ...}
    # are weight records; skip anything else (metadata, etc.).
    if not (isinstance(data, dict) and 'packed' in data):
        continue

    packed_bytes = data['packed']
    shape = tuple(data['shape'])
    scale = float(data.get('scale', 1.0))
    # Hoisted: the original recomputed int(np.prod(shape)) three times per entry.
    n_elems = int(np.prod(shape))

    # Accept raw byte buffers (zero-copy view) or any array-like of uint8.
    if isinstance(packed_bytes, (bytes, bytearray, memoryview)):
        packed_array = np.frombuffer(packed_bytes, dtype=np.uint8)
    else:
        packed_array = np.asarray(packed_bytes, dtype=np.uint8)

    # unpackbits yields 8 bits per byte; pad (short payload) or trim
    # (byte-aligned padding) to the exact element count before reshaping.
    bits = np.unpackbits(packed_array)
    if bits.size < n_elems:
        bits = np.pad(bits, (0, n_elems - bits.size), mode='constant')
    bits = bits[:n_elems].reshape(shape)

    # Map {0, 1} -> {-scale, +scale}; float16 to keep the memory footprint low.
    dequant = (2.0 * bits.astype(np.float16) - 1.0) * scale
    weights[key] = mx.array(dequant)
|
|
| |
# Overwrite the model's parameters with the dequantized ghost weights,
# matching each tensor's dtype to the existing parameter where possible.
current = {name: arr for name, arr in tree_flatten(model.parameters())}
updates = {}
for name, w in weights.items():
    if name not in current:
        # Packed file may carry keys the loaded model doesn't have; ignore them.
        continue
    try:
        updates[name] = w.astype(current[name].dtype)
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt
        # and SystemExit). Dtype cast is best-effort; fall back to the raw tensor.
        updates[name] = w

if updates:
    model.update(tree_unflatten(list(updates.items())))
|
|
# Mojibake repair: the original literal was a garbled/line-split emoji ("β...").
print("✅ Ghost awakened - 93x compression active")
|
|
def generate_ghost(prompt, max_tokens=50, temperature=0.7):
    """Generate text from the ghost model and append repetition metrics.

    Args:
        prompt: Input text fed to the model.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature forwarded to mlx_lm.

    Returns:
        The raw model output followed by a formatted metrics footer.
    """
    # Keyword args instead of positional for clarity. NOTE(review): `temp=` is
    # the legacy mlx_lm keyword; newer releases take a sampler object instead —
    # confirm against the pinned mlx_lm version.
    response = mlx_generate(model, tokenizer, prompt,
                            max_tokens=max_tokens, temp=temperature)

    # Whitespace splitting is only a rough proxy for model tokens, but it is
    # enough to quantify how degenerate (repetitive) the output is.
    tokens = response.split()
    unique_tokens = len(set(tokens))
    # max(..., 1) guards the empty-response case against ZeroDivisionError.
    repetition_rate = 1 - (unique_tokens / max(len(tokens), 1))

    # Mojibake repair: original literals were garbled (π/β’/Γ) emoji & symbols.
    # Also dropped the f-prefix on the placeholder-free header string.
    metrics = "\n\n📊 Metrics:\n"
    metrics += f"• Unique tokens: {unique_tokens}/{len(tokens)}\n"
    metrics += f"• Repetition rate: {repetition_rate:.1%}\n"
    metrics += "• Compression: ≈93.3× (14GB → 150MB)"

    return response + metrics
|
|
# Clickable demo examples: (prompt, max_tokens, temperature) triples matching
# the Interface's three inputs, in order.
examples = [
    ["The capital of France is", 50, 0.7],
    ["Explain quantum mechanics:", 75, 0.8],
    ["def hello_world():", 50, 0.5],
    ["2 + 2 equals", 30, 0.3],
    ["Once upon a time", 100, 1.0],
]
|
|
# Gradio UI wiring. Mojibake repair: title/description emoji were garbled
# (π / β οΈ) in the original source.
demo = gr.Interface(
    fn=generate_ghost,
    inputs=[
        gr.Textbox(label="Prompt", lines=3, placeholder="Enter prompt to hear the ghost speak..."),
        gr.Slider(10, 200, value=50, step=10, label="Max Tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
    ],
    outputs=gr.Textbox(label="Ghost Output (Word Salad)", lines=10),
    title="👻 Shannonstral-7B-1bit Ghost Generator",
    description="""
    **93x compression achieved. Intelligence destroyed. Structure preserved.**

    This is what happens at the information-theoretic limit of neural compression.
    The model generates tokens but has lost all semantic understanding.

    ⚠️ Research artifact - outputs are incoherent word salad.
    """,
    examples=examples,
    # NOTE(review): theme="dark" is a legacy Gradio 2.x value; Gradio 4.x
    # expects a theme object or named built-in — confirm pinned gradio version.
    theme="dark"
)
|
|
# Launch the local Gradio server only when run as a script (not on import,
# e.g. when a Spaces runtime imports the module itself).
if __name__ == "__main__":
    demo.launch()
|
|