Vishwas1 committed on
Commit
71db69e
·
verified ·
1 Parent(s): f32ff80

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +129 -33
app.py CHANGED
@@ -1,48 +1,144 @@
1
- import os
2
  import gradio as gr
3
  import torch
4
  from transformers import AutoModelForCausalLM, AutoTokenizer
5
- from huggingface_hub import HfApi
 
6
 
7
- def prune_to_single_layer_and_push(base_model_id: str, new_repo_id: str, make_private: bool):
8
- hf_token = os.getenv("HF_TOKEN")
9
- if not hf_token:
10
- return "❌ Set HF_TOKEN secret in Space Settings first!"
11
-
12
- if not base_model_id or not new_repo_id:
13
- return "❌ Fill model and repo name"
 
 
 
14
 
15
  try:
16
- # Load safely on CPU (free Space friendly)
17
  model = AutoModelForCausalLM.from_pretrained(
18
- base_model_id,
19
- torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
20
  device_map="cpu",
21
- trust_remote_code=True,
22
- low_cpu_mem_usage=True
 
 
 
 
23
  )
24
- tokenizer = AutoTokenizer.from_pretrained(base_model_id, trust_remote_code=True)
25
 
26
- # === SINGLE LAYER MAGIC ===
27
- if hasattr(model, "model") and hasattr(model.model, "layers"):
28
- model.model.layers = torch.nn.ModuleList([model.model.layers[-1]]) # keep LAST layer only
29
- model.config.num_hidden_layers = 1
30
- else:
31
- return "❌ Model type not supported (needs standard .model.layers)"
 
 
 
 
 
 
 
32
 
33
- # Push
34
- model.push_to_hub(new_repo_id, private=make_private, token=hf_token, safe_serialization=True)
35
- tokenizer.push_to_hub(new_repo_id, private=make_private, token=hf_token)
 
36
 
37
- link = f"https://huggingface.co/{new_repo_id}"
38
- return f"""βœ… SUCCESS!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
- Single-layer model is live at:
41
- **{link}**
 
42
 
43
- Size: ~0.3–0.8 GB β†’ runs at 40–100+ tokens/s on any CPU!
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- Load it anywhere:
46
- ```python
47
- from transformers import AutoModelForCausalLM
48
- model = AutoModelForCausalLM.from_pretrained("{new_repo_id}")
 
 
1
  import gradio as gr
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
+ import gc
5
+ import sys
6
 
7
def get_model_size_mb(model: "torch.nn.Module") -> float:
    """Return a rough estimate of the model's size in MB (parameters only).

    The size is the sum of ``nelement() * element_size()`` over everything
    yielded by ``model.parameters()``, converted from bytes to MB
    (1024 ** 2 bytes) and rounded to one decimal place. Only parameters are
    iterated, so anything not returned by ``parameters()`` is not counted.

    Args:
        model: Object exposing a ``parameters()`` iterable whose items
            provide ``nelement()`` and ``element_size()``.

    Returns:
        The estimated size in MB, rounded to one decimal place.
    """
    total_bytes = sum(
        param.nelement() * param.element_size() for param in model.parameters()
    )
    return round(total_bytes / (1024 ** 2), 1)
13
+
14
def _reset_layer_index(layer) -> None:
    """Point the kept decoder layer's attention module at cache slot 0."""
    # NOTE(review): Hugging Face decoder layers appear to index the KV cache
    # through ``self_attn.layer_idx``. After pruning, the surviving layer still
    # carries the index of its old position, which presumably confuses cache
    # bookkeeping during generate() -- confirm against the installed
    # transformers version.
    attention = getattr(layer, "self_attn", None)
    if attention is not None and hasattr(attention, "layer_idx"):
        attention.layer_idx = 0


def prune_to_single_layer(model_id: str) -> str:
    """Load a causal LM on CPU, keep only its last decoder layer, and report.

    The model and tokenizer are loaded with ``trust_remote_code=True``; the
    layer list at ``model.model.layers`` is replaced by a one-element
    ``ModuleList`` holding the last layer, ``config.num_hidden_layers`` is set
    to 1, and a short greedy generation is run as a smoke test. Nothing is
    saved or uploaded.

    Args:
        model_id: Model identifier passed to ``from_pretrained``.

    Returns:
        A multi-line, human-readable log. Failures are reported inside the
        log (prefixed with a cross mark) rather than raised.
    """
    status_lines = [f"Loading base model: {model_id}"]
    # Bind the names up front so the cleanup in ``finally`` is always valid,
    # even when loading itself fails.
    model = None
    tokenizer = None

    try:
        # Load on CPU with low memory usage flags
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float32,  # float32 = most compatible on CPU
            device_map="cpu",
            low_cpu_mem_usage=True,
            trust_remote_code=True,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_id,
            trust_remote_code=True,
        )

        # Check for ``.model`` before touching ``.model.layers`` so that an
        # unsupported architecture yields the friendly message below instead
        # of an AttributeError.
        supported = hasattr(model, "model") and hasattr(model.model, "layers")

        orig_layers = len(model.model.layers) if supported else "unknown"
        orig_size_mb = get_model_size_mb(model)
        status_lines.append(f"→ Original layers: {orig_layers}")
        status_lines.append(f"→ Original size (approx): {orig_size_mb} MB")

        if not supported:
            return (
                "\n".join(status_lines)
                + "\n\n❌ Model architecture not supported (no .model.layers found)"
            )

        # Core pruning step: keep only the LAST layer
        model.model.layers = torch.nn.ModuleList([model.model.layers[-1]])
        model.config.num_hidden_layers = 1
        _reset_layer_index(model.model.layers[0])

        # Release the dropped layers before measuring again
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        new_layers = len(model.model.layers)
        new_size_mb = get_model_size_mb(model)
        status_lines.append(f"→ After pruning: {new_layers} layer")
        status_lines.append(f"→ New size (approx): {new_size_mb} MB")
        if orig_size_mb:
            reduction = round((orig_size_mb - new_size_mb) / orig_size_mb * 100)
            status_lines.append(f"→ Size reduction: ~{reduction}%")
        else:
            # A reported size of zero would otherwise divide by zero.
            status_lines.append("→ Size reduction: n/a")

        # Quick generation smoke test; a failure here is reported but does
        # not turn the whole run into an error.
        try:
            inputs = tokenizer(
                "Hello, the future of single-layer models is",
                return_tensors="pt",
            )
            with torch.no_grad():
                # Greedy decoding: no ``temperature`` is passed because it
                # only applies when sampling is enabled.
                outputs = model.generate(
                    **inputs.to(model.device),
                    max_new_tokens=40,
                    do_sample=False,
                )
            text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            status_lines.append(
                "\nQuick generation test (should be at least semi-coherent):"
            )
            status_lines.append("→ " + text.strip())
        except Exception as gen_e:
            status_lines.append(
                f"\nGeneration test failed: {str(gen_e)} (still might be usable)"
            )

        status_lines.append("\nPruning appears successful ✓")
        status_lines.append("You can now safely close this tab or try another model.")

        return "\n".join(status_lines)

    except Exception as e:
        # Top-level boundary for the UI callback: convert any failure into
        # a readable message appended to the log collected so far.
        err_msg = str(e)
        if "out of memory" in err_msg.lower():
            return (
                "\n".join(status_lines)
                + "\n\n❌ Out of memory — try an even smaller model (0.5B class)"
            )
        return "\n".join(status_lines) + f"\n\n❌ Failed: {err_msg}"

    finally:
        # Drop the references and collect, whether or not loading succeeded.
        model = None
        tokenizer = None
        gc.collect()
92
+
93
+ # ────────────────────────────────────────────────
94
+ # Gradio Interface
95
+ # ────────────────────────────────────────────────
96
 
97
# Narrow the page so the log textbox stays readable.
CSS = """
.gradio-container { max-width: 780px !important; }
"""

# UI: a model dropdown, a read-only log textbox, and one button that runs
# prune_to_single_layer on the selected model and writes its log to the textbox.
with gr.Blocks(title="Minimal Single-Layer Pruner", css=CSS, theme=gr.themes.Default()) as demo:
    gr.Markdown("""
    # Single-Layer Pruner (test version)

    Loads a small model → keeps **only the last layer** → shows result + quick generation test.

    **No pushing to Hub yet** — just checking if pruning works reliably.
    """)

    model_choice = gr.Dropdown(
        choices=[
            "Qwen/Qwen2.5-0.5B-Instruct",
            "Qwen/Qwen2.5-1.5B-Instruct",
            "meta-llama/Llama-3.2-1B-Instruct",
            "google/gemma-2-2b-it",
            "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        ],
        label="Choose small model (0.5B–2B recommended for free CPU Space)",
        value="Qwen/Qwen2.5-0.5B-Instruct"
    )

    status = gr.Textbox(
        label="Pruning log",
        lines=18,
        interactive=False,
        show_copy_button=True
    )

    btn = gr.Button("Prune to 1 layer → Test", variant="primary", scale=0)

    # The selected model id is the only input; the returned log string is the
    # only output.
    btn.click(
        prune_to_single_layer,
        inputs=model_choice,
        outputs=status
    )

    gr.Markdown("""
    **Tips**
    • Start with 0.5B or 1.1B models — they almost always succeed on free Spaces
    • The generation test often produces short but semi-sensible text
    • Next steps (after this works): add push button, add chat tab, convert to GGUF
    """)

demo.launch()