Spaces:

Vishwas1
/

single-layer-pruner

Paused

File size: 9,646 Bytes

17f949e
 
 
71db69e
17f949e
71db69e
 
 
 
 
 
 
1f5c9f6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5574df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f5c9f6
71db69e
 
17f949e
80b56c0
 
 
17f949e
 
71db69e
80b56c0
17f949e
71db69e
 
 
b5574df
80b56c0
1f5c9f6
 
b5574df
1f5c9f6
71db69e
1f5c9f6
2a6178a
b5574df
 
 
1f5c9f6
 
 
b5574df
 
71db69e
80b56c0
71db69e
b5574df
 
80b56c0
b5574df
71db69e
1f5c9f6
 
 
71db69e
 
 
 
8449b1c
2a6178a
b5574df
71db69e
 
b5574df
71db69e
b5574df
80b56c0
b5574df
71db69e
80b56c0
71db69e
b5574df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
80b56c0
b5574df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
71db69e
b5574df
 
80b56c0
71db69e
 
 
 
b5574df
17f949e
b5574df
71db69e
1f5c9f6
b5574df
71db69e
1f5c9f6
 
b5574df
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1f5c9f6
 
 
 
 
 
 
 
 
 
 
 
b5574df
 
1f5c9f6
 
 
 
 
 
 
b5574df
 
 
 
 
 
 
1f5c9f6
b5574df
1f5c9f6
 
b5574df
1f5c9f6
 
80b56c0
b5574df
1f5c9f6
 
71db69e
 
b5574df
 
 
 
 
 
17f949e
80b56c0
b5574df

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import gc

def get_model_size_mb(model):
    """Rough estimate of model size in MB (parameters only)"""
    param_size = 0
    for param in model.parameters():
        param_size += param.nelement() * param.element_size()
    return round(param_size / (1024 ** 2), 1)

def parse_layer_string(layer_str, max_layers):
    """Parses strings like '0, 5, 20-21' into a sorted list of valid integers."""
    if not layer_str or not str(layer_str).strip():
        return list(range(max_layers))
        
    layers = []
    try:
        for part in str(layer_str).split(","):
            part = part.strip()
            if "-" in part:
                start, end = map(int, part.split("-"))
                layers.extend(range(start, end + 1))
            else:
                layers.append(int(part))
                
        # Deduplicate, sort, and filter out-of-bounds layers
        valid_layers = sorted(list(set([l for l in layers if 0 <= l < max_layers])))
        return valid_layers
    except Exception:
        return None

def apply_pruning(model, layers_to_keep_indices):
    """Helper function to perform the actual lobotomy"""
    new_layers_list = []
    for new_idx, old_idx in enumerate(layers_to_keep_indices):
        layer = model.model.layers[old_idx]
        
        # Reset internal layer indices so KV caching doesn't crash
        if hasattr(layer, "layer_idx"):
            layer.layer_idx = new_idx
        if hasattr(layer, "self_attn") and hasattr(layer.self_attn, "layer_idx"):
            layer.self_attn.layer_idx = new_idx
            
        new_layers_list.append(layer)
        
    # Overwrite the model's layers
    model.model.layers = torch.nn.ModuleList(new_layers_list)
    model.config.num_hidden_layers = len(new_layers_list)
    return model

def prune_and_test(model_id: str, layer_input: str, test_prompt: str):
    status_lines = []
    status_lines.append(f"Loading base model: {model_id}")
    
    model = None
    tokenizer = None
        
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.bfloat16, 
            device_map="cpu",
            low_cpu_mem_usage=True,
            trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
                
        orig_layers = len(model.model.layers) if hasattr(model.model, "layers") else 0
        if orig_layers == 0:
            return "\n".join(status_lines) + "\n\n❌ Model architecture not supported."
            
        orig_size_mb = get_model_size_mb(model)
        status_lines.append(f"→ Original layers: {orig_layers} (Indices 0 to {orig_layers - 1})")
        
        layers_to_keep_indices = parse_layer_string(layer_input, orig_layers)
        if not layers_to_keep_indices:
            return "\n".join(status_lines) + "\n\n❌ Invalid layer selection."
            
        status_lines.append(f"\nTargeting layers: {layers_to_keep_indices}")
                
        # PRUNE
        model = apply_pruning(model, layers_to_keep_indices)
        gc.collect()
                
        new_size_mb = get_model_size_mb(model)
        status_lines.append(f"→ After pruning: {len(layers_to_keep_indices)} layers")
        status_lines.append(f"→ Size reduced from {orig_size_mb} MB to {new_size_mb} MB")
                
        # TEST
        try:
            prompt_to_use = test_prompt if test_prompt.strip() else "Hello, the future of AI is"
            inputs = tokenizer(prompt_to_use, return_tensors="pt")
            
            with torch.no_grad():
                outputs = model.generate(
                    **inputs.to(model.device),
                    max_new_tokens=40,
                    do_sample=False, 
                    pad_token_id=tokenizer.eos_token_id,
                    use_cache=False
                )
            text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            status_lines.append(f"\nQuick generation test:\n→ {text.strip()}")
        except Exception as gen_e:
            status_lines.append(f"\nGeneration test failed: {str(gen_e)}")
                
        status_lines.append("\nPruning test successful ✓")
        return "\n".join(status_lines)
        
    except Exception as e:
        return "\n".join(status_lines) + f"\n\n❌ Failed: {str(e)}"
    finally:
        if model is not None: del model
        if tokenizer is not None: del tokenizer
        gc.collect()

def push_pruned_model(model_id: str, layer_input: str, hf_token: str, repo_id: str):
    if not hf_token or not repo_id:
        return "❌ Please provide both a Hugging Face Write Token and a Repo Name."
        
    status_lines = [f"Preparing to push pruned {model_id} to {repo_id}..."]
    model = None
    tokenizer = None
    
    try:
        model = AutoModelForCausalLM.from_pretrained(
            model_id, torch_dtype=torch.bfloat16, device_map="cpu", low_cpu_mem_usage=True, trust_remote_code=True
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
        
        orig_layers = len(model.model.layers)
        layers_to_keep_indices = parse_layer_string(layer_input, orig_layers)
        
        status_lines.append(f"Pruning to layers: {layers_to_keep_indices}...")
        model = apply_pruning(model, layers_to_keep_indices)
        
        status_lines.append("Pushing model and tokenizer to Hub (this may take a minute)...")
        
        # PUSH COMMANDS
        model.push_to_hub(repo_id, token=hf_token)
        tokenizer.push_to_hub(repo_id, token=hf_token)
        
        status_lines.append(f"\n✅ SUCCESS! Model pushed to: https://huggingface.co/{repo_id}")
        status_lines.append("You can now load this in any script using AutoModelForCausalLM.from_pretrained()")
        return "\n".join(status_lines)
        
    except Exception as e:
        return "\n".join(status_lines) + f"\n\n❌ Push Failed: {str(e)}"
    finally:
        if model is not None: del model
        if tokenizer is not None: del tokenizer
        gc.collect()

# ────────────────────────────────────────────────
#                 Gradio Interface
# ────────────────────────────────────────────────
CSS = """.gradio-container { max-width: 950px !important; }"""

with gr.Blocks(title="The Frankenstein Pruner", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🧠 The Frankenstein Pruner
    Lobotomize language models by ripping out their middle layers. Test the output, and if it produces beautiful, semi-coherent gibberish, save your abomination to the Hugging Face Hub!
    """)
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("""
            ### ✂️ Mad Scientist Cheat Sheet
            *Models have between 16 and 28 layers. If a model has 22 layers, valid indices are `0` to `21`.*
            
            **The Bookend (The Best Starter)**
            Keep the first 2 and last 2 layers. It usually remembers grammar but forgets facts.
            *Try:* `0, 1, 20, 21`
            
            **The Swiss Cheese**
            Keep every other layer. The model's logic gets incredibly confused.
            *Try:* `0, 2, 4, 6, 8, 10, 12, 14`
            
            **The Brainless**
            Keep only the first layer and the last layer. Absolute chaos.
            *Try:* `0, 21`
            """)
            
        with gr.Column(scale=1):
            model_choice = gr.Dropdown(
                choices=[
                    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                    "Qwen/Qwen2.5-0.5B-Instruct",
                    "Qwen/Qwen2.5-1.5B-Instruct",
                    "google/gemma-2-2b-it",
                ],
                label="Base Model",
                value="TinyLlama/TinyLlama-1.1B-Chat-v1.0"
            )
            
            layer_choice = gr.Textbox(
                label="Layers to Keep (e.g. '0, 1, 20, 21')",
                value="0, 1, 20, 21"
            )
            
            prompt_choice = gr.Textbox(
                label="Test Prompt",
                value="The capital of France is Paris. The capital of Japan is",
            )
            
            btn_test = gr.Button("🧪 Prune & Test Generate", variant="primary")
            
            with gr.Accordion("🚀 Push to Hugging Face Hub", open=False):
                gr.Markdown("Love your broken model? Save it! You'll need a **Write Token** from your HF account settings.")
                hf_token_input = gr.Textbox(label="HF Write Token", type="password", placeholder="hf_...")
                repo_name_input = gr.Textbox(label="Target Repo Name", placeholder="your-username/my-weird-llama")
                btn_push = gr.Button("Push to Hub", variant="secondary")
            
        with gr.Column(scale=1):
            status = gr.Textbox(
                label="Surgery Log & Output",
                lines=25,
                interactive=False
            )
        
    btn_test.click(
        prune_and_test,
        inputs=[model_choice, layer_choice, prompt_choice],
        outputs=status
    )
    
    btn_push.click(
        push_pruned_model,
        inputs=[model_choice, layer_choice, hf_token_input, repo_name_input],
        outputs=status
    )

if __name__ == "__main__":
    demo.launch(css=CSS)