Vishwas1 committed on
Commit
80b56c0
·
verified ·
1 Parent(s): 71db69e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -41
app.py CHANGED
@@ -2,7 +2,6 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
  import gc
5
- import sys
6
 
7
  def get_model_size_mb(model):
8
  """Rough estimate of model size in MB (parameters only)"""
@@ -15,11 +14,15 @@ def prune_to_single_layer(model_id: str):
15
  status_lines = []
16
  status_lines.append(f"Loading base model: {model_id}")
17
 
 
 
 
 
18
  try:
19
- # Load on CPU with low memory usage flags
20
  model = AutoModelForCausalLM.from_pretrained(
21
  model_id,
22
- torch_dtype=torch.float32, # float32 = most compatible on CPU
23
  device_map="cpu",
24
  low_cpu_mem_usage=True,
25
  trust_remote_code=True
@@ -28,32 +31,33 @@ def prune_to_single_layer(model_id: str):
28
  model_id,
29
  trust_remote_code=True
30
  )
31
-
32
  orig_layers = len(model.model.layers) if hasattr(model.model, "layers") else "unknown"
33
  orig_size_mb = get_model_size_mb(model)
34
  status_lines.append(f"β†’ Original layers: {orig_layers}")
35
  status_lines.append(f"β†’ Original size (approx): {orig_size_mb} MB")
36
-
37
  # ────────────────────────────────────────────────
38
  # Core pruning step
39
  if not hasattr(model, "model") or not hasattr(model.model, "layers"):
40
  return "\n".join(status_lines) + "\n\n❌ Model architecture not supported (no .model.layers found)"
41
-
42
  # Keep only the LAST layer
43
  model.model.layers = torch.nn.ModuleList([model.model.layers[-1]])
44
  model.config.num_hidden_layers = 1
45
-
46
- # Optional: clear intermediate tensors if possible
47
  gc.collect()
48
- if torch.cuda.is_available():
49
- torch.cuda.empty_cache()
50
-
51
  new_layers = len(model.model.layers)
52
  new_size_mb = get_model_size_mb(model)
53
  status_lines.append(f"β†’ After pruning: {new_layers} layer")
54
  status_lines.append(f"β†’ New size (approx): {new_size_mb} MB")
55
- status_lines.append(f"β†’ Size reduction: ~{round((orig_size_mb - new_size_mb)/orig_size_mb*100)}%")
56
 
 
 
 
 
57
  # Quick generation smoke test
58
  try:
59
  inputs = tokenizer("Hello, the future of single-layer models is", return_tensors="pt")
@@ -61,52 +65,46 @@ def prune_to_single_layer(model_id: str):
61
  outputs = model.generate(
62
  **inputs.to(model.device),
63
  max_new_tokens=40,
64
- do_sample=False,
65
- temperature=0.0
66
  )
67
  text = tokenizer.decode(outputs[0], skip_special_tokens=True)
68
  status_lines.append("\nQuick generation test (should be at least semi-coherent):")
69
  status_lines.append("β†’ " + text.strip())
70
  except Exception as gen_e:
71
  status_lines.append(f"\nGeneration test failed: {str(gen_e)} (still might be usable)")
72
-
73
  status_lines.append("\nPruning appears successful βœ“")
74
  status_lines.append("You can now safely close this tab or try another model.")
75
-
76
  return "\n".join(status_lines)
77
-
78
  except Exception as e:
79
  err_msg = str(e)
80
- if "out of memory" in err_msg.lower():
81
  return "\n".join(status_lines) + "\n\n❌ Out of memory β€” try an even smaller model (0.5B class)"
82
  return "\n".join(status_lines) + f"\n\n❌ Failed: {err_msg}"
83
-
84
  finally:
85
- # Try to free memory even on failure
86
- try:
87
  del model
 
88
  del tokenizer
89
- gc.collect()
90
- except:
91
- pass
92
 
93
  # ────────────────────────────────────────────────
94
  # Gradio Interface
95
  # ────────────────────────────────────────────────
96
-
97
- CSS = """
98
- .gradio-container { max-width: 780px !important; }
99
- """
100
 
101
  with gr.Blocks(title="Minimal Single-Layer Pruner", css=CSS, theme=gr.themes.Default()) as demo:
102
  gr.Markdown("""
103
  # Single-Layer Pruner (test version)
104
-
105
  Loads a small model β†’ keeps **only the last layer** β†’ shows result + quick generation test.
106
-
107
  **No pushing to Hub yet** β€” just checking if pruning works reliably.
108
  """)
109
-
110
  model_choice = gr.Dropdown(
111
  choices=[
112
  "Qwen/Qwen2.5-0.5B-Instruct",
@@ -118,27 +116,28 @@ with gr.Blocks(title="Minimal Single-Layer Pruner", css=CSS, theme=gr.themes.Def
118
  label="Choose small model (0.5B–2B recommended for free CPU Space)",
119
  value="Qwen/Qwen2.5-0.5B-Instruct"
120
  )
121
-
122
  status = gr.Textbox(
123
  label="Pruning log",
124
  lines=18,
125
  interactive=False,
126
  show_copy_button=True
127
  )
128
-
129
- btn = gr.Button("Prune to 1 layer β†’ Test", variant="primary", scale=0)
130
-
131
  btn.click(
132
  prune_to_single_layer,
133
  inputs=model_choice,
134
  outputs=status
135
  )
136
-
137
  gr.Markdown("""
138
- **Tips**
139
- β€’ Start with 0.5B or 1.1B models β€” they almost always succeed on free Spaces
140
- β€’ The generation test often produces short but semi-sensible text
141
- β€’ Next steps (after this works): add push button, add chat tab, convert to GGUF
142
  """)
143
 
144
- demo.launch()
 
 
2
  import torch
3
  from transformers import AutoModelForCausalLM, AutoTokenizer
4
  import gc
 
5
 
6
  def get_model_size_mb(model):
7
  """Rough estimate of model size in MB (parameters only)"""
 
14
  status_lines = []
15
  status_lines.append(f"Loading base model: {model_id}")
16
 
17
+ # Initialize as None to prevent UnboundLocalError in the finally block
18
+ model = None
19
+ tokenizer = None
20
+
21
  try:
22
+ # Load on CPU with bfloat16 to prevent Out of Memory (OOM) on free spaces
23
  model = AutoModelForCausalLM.from_pretrained(
24
  model_id,
25
+ torch_dtype=torch.bfloat16,
26
  device_map="cpu",
27
  low_cpu_mem_usage=True,
28
  trust_remote_code=True
 
31
  model_id,
32
  trust_remote_code=True
33
  )
34
+
35
  orig_layers = len(model.model.layers) if hasattr(model.model, "layers") else "unknown"
36
  orig_size_mb = get_model_size_mb(model)
37
  status_lines.append(f"β†’ Original layers: {orig_layers}")
38
  status_lines.append(f"β†’ Original size (approx): {orig_size_mb} MB")
39
+
40
  # ────────────────────────────────────────────────
41
  # Core pruning step
42
  if not hasattr(model, "model") or not hasattr(model.model, "layers"):
43
  return "\n".join(status_lines) + "\n\n❌ Model architecture not supported (no .model.layers found)"
44
+
45
  # Keep only the LAST layer
46
  model.model.layers = torch.nn.ModuleList([model.model.layers[-1]])
47
  model.config.num_hidden_layers = 1
48
+
49
+ # Clear intermediate tensors
50
  gc.collect()
51
+
 
 
52
  new_layers = len(model.model.layers)
53
  new_size_mb = get_model_size_mb(model)
54
  status_lines.append(f"β†’ After pruning: {new_layers} layer")
55
  status_lines.append(f"β†’ New size (approx): {new_size_mb} MB")
 
56
 
57
+ if orig_size_mb > 0:
58
+ reduction = round((orig_size_mb - new_size_mb) / orig_size_mb * 100)
59
+ status_lines.append(f"β†’ Size reduction: ~{reduction}%")
60
+
61
  # Quick generation smoke test
62
  try:
63
  inputs = tokenizer("Hello, the future of single-layer models is", return_tensors="pt")
 
65
  outputs = model.generate(
66
  **inputs.to(model.device),
67
  max_new_tokens=40,
68
+ do_sample=False, # Temperature removed to prevent conflict
69
+ pad_token_id=tokenizer.eos_token_id # Prevents warnings/crashes on Llama
70
  )
71
  text = tokenizer.decode(outputs[0], skip_special_tokens=True)
72
  status_lines.append("\nQuick generation test (should be at least semi-coherent):")
73
  status_lines.append("β†’ " + text.strip())
74
  except Exception as gen_e:
75
  status_lines.append(f"\nGeneration test failed: {str(gen_e)} (still might be usable)")
76
+
77
  status_lines.append("\nPruning appears successful βœ“")
78
  status_lines.append("You can now safely close this tab or try another model.")
79
+
80
  return "\n".join(status_lines)
81
+
82
  except Exception as e:
83
  err_msg = str(e)
84
+ if "out of memory" in err_msg.lower() or "killed" in err_msg.lower():
85
  return "\n".join(status_lines) + "\n\n❌ Out of memory β€” try an even smaller model (0.5B class)"
86
  return "\n".join(status_lines) + f"\n\n❌ Failed: {err_msg}"
87
+
88
  finally:
89
+ # Safely try to free memory even on failure
90
+ if model is not None:
91
  del model
92
+ if tokenizer is not None:
93
  del tokenizer
94
+ gc.collect()
 
 
95
 
96
  # ────────────────────────────────────────────────
97
  # Gradio Interface
98
  # ────────────────────────────────────────────────
99
+ CSS = """.gradio-container { max-width: 780px !important; }"""
 
 
 
100
 
101
  with gr.Blocks(title="Minimal Single-Layer Pruner", css=CSS, theme=gr.themes.Default()) as demo:
102
  gr.Markdown("""
103
  # Single-Layer Pruner (test version)
 
104
  Loads a small model β†’ keeps **only the last layer** β†’ shows result + quick generation test.
 
105
  **No pushing to Hub yet** β€” just checking if pruning works reliably.
106
  """)
107
+
108
  model_choice = gr.Dropdown(
109
  choices=[
110
  "Qwen/Qwen2.5-0.5B-Instruct",
 
116
  label="Choose small model (0.5B–2B recommended for free CPU Space)",
117
  value="Qwen/Qwen2.5-0.5B-Instruct"
118
  )
119
+
120
  status = gr.Textbox(
121
  label="Pruning log",
122
  lines=18,
123
  interactive=False,
124
  show_copy_button=True
125
  )
126
+
127
+ btn = gr.Button("Prune to 1 layer β†’ Test", variant="primary")
128
+
129
  btn.click(
130
  prune_to_single_layer,
131
  inputs=model_choice,
132
  outputs=status
133
  )
134
+
135
  gr.Markdown("""
136
+ **Tips**
137
+ β€’ Start with 0.5B or 1.1B models β€” they almost always succeed on free Spaces
138
+ β€’ The generation test often produces short but semi-sensible text
139
+ β€’ Next steps (after this works): add push button, add chat tab, convert to GGUF
140
  """)
141
 
142
+ if __name__ == "__main__":
143
+ demo.launch()