| import gradio as gr |
| import numpy as np |
| import pickle |
| import gzip |
| from mlx_lm import load, generate as mlx_generate |
|
|
# Load the quantized base model, then the separately shipped bit-packed weights.
# NOTE(review): banner says "1bit" but the repo id says "4bit" — confirm which
# checkpoint is actually intended here.
print("Loading Shannonstral-7B-1bit ghost model...")
model, tokenizer = load("hunterbown/Shannonstral-7B-4bit")

# Bit-packed weight payload produced by the offline compression pipeline.
with gzip.open('weights_packed.pkl.gz', 'rb') as fh:
    packed_weights = pickle.load(fh)
|
|
| import mlx.core as mx |
| from mlx.utils import tree_flatten, tree_unflatten |
|
|
# Dequantize each packed tensor: 1 bit per weight, sign-magnitude with a
# per-tensor scale — bit b maps to (2b - 1) * scale, i.e. ±scale.
weights = {}
for key, data in packed_weights.items():
    # Only entries shaped like {'packed': ..., 'shape': ..., 'scale': ...}
    # are weight records; skip anything else (metadata, etc.).
    if not (isinstance(data, dict) and 'packed' in data):
        continue

    packed_bytes = data['packed']
    shape = tuple(data['shape'])
    scale = float(data.get('scale', 1.0))
    # Hoisted: the original recomputed int(np.prod(shape)) three times per entry.
    n_elems = int(np.prod(shape))

    # Accept raw byte buffers (zero-copy view) or any array-like of uint8.
    if isinstance(packed_bytes, (bytes, bytearray, memoryview)):
        packed_array = np.frombuffer(packed_bytes, dtype=np.uint8)
    else:
        packed_array = np.asarray(packed_bytes, dtype=np.uint8)

    # unpackbits yields 8 bits per byte; pad (short payload) or trim
    # (byte-aligned padding) to the exact element count before reshaping.
    bits = np.unpackbits(packed_array)
    if bits.size < n_elems:
        bits = np.pad(bits, (0, n_elems - bits.size), mode='constant')
    bits = bits[:n_elems].reshape(shape)

    # Map {0, 1} -> {-scale, +scale}; float16 to keep the memory footprint low.
    dequant = (2.0 * bits.astype(np.float16) - 1.0) * scale
    weights[key] = mx.array(dequant)
|
|
| |
# Overwrite the model's parameters with the dequantized ghost weights,
# matching each tensor's dtype to the existing parameter where possible.
current = {name: arr for name, arr in tree_flatten(model.parameters())}
updates = {}
for name, w in weights.items():
    if name not in current:
        # Packed file may carry keys the loaded model doesn't have; ignore them.
        continue
    try:
        updates[name] = w.astype(current[name].dtype)
    except Exception:
        # Narrowed from a bare `except:` (which also swallowed KeyboardInterrupt
        # and SystemExit). Dtype cast is best-effort; fall back to the raw tensor.
        updates[name] = w

if updates:
    model.update(tree_unflatten(list(updates.items())))
|
|
# Mojibake repair: the original literal was a garbled/line-split emoji ("β...").
print("✅ Ghost awakened - 93x compression active")
|
|
def generate_ghost(prompt, max_tokens=50, temperature=0.7):
    """Generate text from the ghost model and append repetition metrics.

    Args:
        prompt: Input text fed to the model.
        max_tokens: Maximum number of tokens to generate.
        temperature: Sampling temperature forwarded to mlx_lm.

    Returns:
        The raw model output followed by a formatted metrics footer.
    """
    # Keyword args instead of positional for clarity. NOTE(review): `temp=` is
    # the legacy mlx_lm keyword; newer releases take a sampler object instead —
    # confirm against the pinned mlx_lm version.
    response = mlx_generate(model, tokenizer, prompt,
                            max_tokens=max_tokens, temp=temperature)

    # Whitespace splitting is only a rough proxy for model tokens, but it is
    # enough to quantify how degenerate (repetitive) the output is.
    tokens = response.split()
    unique_tokens = len(set(tokens))
    # max(..., 1) guards the empty-response case against ZeroDivisionError.
    repetition_rate = 1 - (unique_tokens / max(len(tokens), 1))

    # Mojibake repair: original literals were garbled (π/β’/Γ) emoji & symbols.
    # Also dropped the f-prefix on the placeholder-free header string.
    metrics = "\n\n📊 Metrics:\n"
    metrics += f"• Unique tokens: {unique_tokens}/{len(tokens)}\n"
    metrics += f"• Repetition rate: {repetition_rate:.1%}\n"
    metrics += "• Compression: ≈93.3× (14GB → 150MB)"

    return response + metrics
|
|
# Clickable demo examples: (prompt, max_tokens, temperature) triples matching
# the Interface's three inputs, in order.
examples = [
    ["The capital of France is", 50, 0.7],
    ["Explain quantum mechanics:", 75, 0.8],
    ["def hello_world():", 50, 0.5],
    ["2 + 2 equals", 30, 0.3],
    ["Once upon a time", 100, 1.0],
]
|
|
# Gradio UI wiring. Mojibake repair: title/description emoji were garbled
# (π / β οΈ) in the original source.
demo = gr.Interface(
    fn=generate_ghost,
    inputs=[
        gr.Textbox(label="Prompt", lines=3, placeholder="Enter prompt to hear the ghost speak..."),
        gr.Slider(10, 200, value=50, step=10, label="Max Tokens"),
        gr.Slider(0.1, 1.5, value=0.7, step=0.1, label="Temperature")
    ],
    outputs=gr.Textbox(label="Ghost Output (Word Salad)", lines=10),
    title="👻 Shannonstral-7B-1bit Ghost Generator",
    description="""
    **93x compression achieved. Intelligence destroyed. Structure preserved.**

    This is what happens at the information-theoretic limit of neural compression.
    The model generates tokens but has lost all semantic understanding.

    ⚠️ Research artifact - outputs are incoherent word salad.
    """,
    examples=examples,
    # NOTE(review): theme="dark" is a legacy Gradio 2.x value; Gradio 4.x
    # expects a theme object or named built-in — confirm pinned gradio version.
    theme="dark"
)
|
|
# Launch the local Gradio server only when run as a script (not on import,
# e.g. when a Spaces runtime imports the module itself).
if __name__ == "__main__":
    demo.launch()
|
|