tensiondriven committed on
Commit
c97f0b2
·
verified ·
1 Parent(s): 36277af

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +0 -195
app.py CHANGED
@@ -1,195 +0,0 @@
"""
Ultravox Quantizer - Quantizes fixie-ai/ultravox-v0_4_1-mistral-nemo to INT4
and pushes to tensiondriven/ultravox-v0_4_1-mistral-nemo-int8

Uses INT4 (4-bit) quantization because:
- T4 has 16GB VRAM
- 12B model at INT8 needs ~12GB, too tight with overhead
- INT4 needs ~6GB, leaves room for processing
"""

import gc
import os

import gradio as gr
import torch
from huggingface_hub import HfApi, login

# Config: source checkpoint, destination repo, and the auth token.
# HF_TOKEN must be provided via the Space's settings/secrets; it is None otherwise.
SOURCE_MODEL = "fixie-ai/ultravox-v0_4_1-mistral-nemo"
TARGET_REPO = "tensiondriven/ultravox-v0_4_1-mistral-nemo-int8"
HF_TOKEN = os.environ.get("HF_TOKEN")
def clear_memory():
    """Aggressively release Python and (if present) CUDA memory.

    Runs a full garbage-collection pass; when a GPU is available it also
    returns cached CUDA blocks to the driver and waits for in-flight
    kernels to finish so freed memory is really back.
    """
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()
    torch.cuda.synchronize()
def quantize_and_push(use_int4: bool = True) -> str:
    """
    Quantize SOURCE_MODEL with bitsandbytes and push the result to TARGET_REPO.

    Args:
        use_int4: If True, use INT4 (4-bit NF4). If False, try INT8 (8-bit),
            which may OOM on a 16GB T4 for a 12B model.

    Returns:
        A multi-line, human-readable status report; error paths return the
        progress so far plus an ERROR line instead of raising.
    """
    if not HF_TOKEN:
        return "ERROR: HF_TOKEN environment variable not set. Add it in Space settings."

    try:
        login(token=HF_TOKEN)
        api = HfApi(token=HF_TOKEN)
    except Exception as e:
        return f"ERROR: Failed to authenticate with HuggingFace: {e}"

    # Import here to catch import errors
    try:
        from transformers import AutoModelForCausalLM, AutoProcessor, BitsAndBytesConfig
    except ImportError as e:
        return f"ERROR: Missing dependency: {e}"

    quant_type = "INT4 (4-bit)" if use_int4 else "INT8 (8-bit)"
    output_lines = [f"Starting {quant_type} quantization of {SOURCE_MODEL}..."]

    # Check GPU — quantized loading via bitsandbytes requires CUDA.
    if torch.cuda.is_available():
        gpu_name = torch.cuda.get_device_name(0)
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
        output_lines.append(f"GPU: {gpu_name} with {gpu_mem:.1f}GB VRAM")
    else:
        return "ERROR: No CUDA GPU available. This Space requires GPU hardware."

    clear_memory()

    # Configure quantization
    if use_int4:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
        )
    else:
        bnb_config = BitsAndBytesConfig(
            load_in_8bit=True,
        )

    output_lines.append(f"Loading model with {quant_type} quantization...")

    # Pre-bind so the cleanup paths can unconditionally `del` both names.
    model = None
    processor = None
    try:
        # Load quantized model
        model = AutoModelForCausalLM.from_pretrained(
            SOURCE_MODEL,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            torch_dtype=torch.float16,
            low_cpu_mem_usage=True,
        )
        output_lines.append("Model loaded successfully!")

        # Load processor
        processor = AutoProcessor.from_pretrained(
            SOURCE_MODEL,
            trust_remote_code=True,
        )
        output_lines.append("Processor loaded successfully!")

    except torch.cuda.OutOfMemoryError:
        # BUGFIX: drop the local references first — otherwise clear_memory()
        # cannot actually free the partially-loaded model's VRAM.
        del model, processor
        clear_memory()
        return "\n".join(output_lines) + "\n\nERROR: Out of GPU memory. Try INT4 quantization instead."
    except Exception as e:
        del model, processor
        clear_memory()
        return "\n".join(output_lines) + f"\n\nERROR loading model: {e}"

    # Create target repo if needed
    try:
        api.create_repo(repo_id=TARGET_REPO, exist_ok=True, private=False)
        output_lines.append(f"Target repo ready: {TARGET_REPO}")
    except Exception as e:
        output_lines.append(f"Warning: Could not create/verify repo: {e}")

    # Push to hub
    output_lines.append(f"Pushing quantized model to {TARGET_REPO}...")

    try:
        model.push_to_hub(
            TARGET_REPO,
            token=HF_TOKEN,
            safe_serialization=True,
        )
        output_lines.append("Model pushed successfully!")

        processor.push_to_hub(
            TARGET_REPO,
            token=HF_TOKEN,
        )
        output_lines.append("Processor pushed successfully!")

    except Exception as e:
        # Same as above: release references so the GPU is usable for a retry.
        del model, processor
        clear_memory()
        return "\n".join(output_lines) + f"\n\nERROR pushing to hub: {e}"

    # Release the model before reporting success so VRAM is free for reruns.
    del model, processor
    clear_memory()

    output_lines.append("")
    output_lines.append("SUCCESS! Quantized model available at:")
    output_lines.append(f"https://huggingface.co/{TARGET_REPO}")

    return "\n".join(output_lines)
def run_int4():
    """Kick off the INT4 (4-bit) quantization run — the recommended path on a T4."""
    return quantize_and_push(use_int4=True)
def run_int8():
    """Kick off the INT8 (8-bit) quantization run — may OOM on a 16GB T4."""
    return quantize_and_push(use_int4=False)
def get_status():
    """Report the configured repos, token presence, and GPU/VRAM availability.

    Returns:
        A multi-line status string suitable for the UI textbox.
    """
    lines = ["=== Ultravox Quantizer Status ==="]
    lines.append(f"Source: {SOURCE_MODEL}")
    lines.append(f"Target: {TARGET_REPO}")
    lines.append(f"HF_TOKEN set: {'Yes' if HF_TOKEN else 'NO - add in Space settings!'}")

    if torch.cuda.is_available():
        lines.append(f"GPU: {torch.cuda.get_device_name(0)}")
        # mem_get_info() returns (free, total) in bytes — call it once and
        # unpack, instead of querying the device twice.
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        free_mem = free_bytes / 1e9
        total_mem = total_bytes / 1e9
        lines.append(f"VRAM: {free_mem:.1f}GB free / {total_mem:.1f}GB total")
    else:
        lines.append("GPU: Not available (CUDA required)")

    return "\n".join(lines)
# Gradio UI
with gr.Blocks(title="Ultravox Quantizer") as demo:
    gr.Markdown("""
    # Ultravox Model Quantizer

    Quantizes `fixie-ai/ultravox-v0_4_1-mistral-nemo` and pushes to `tensiondriven/ultravox-v0_4_1-mistral-nemo-int8`

    **Recommended: INT4** - T4 has 16GB VRAM, INT4 uses ~6GB leaving room for processing.
    """)

    # One row of actions: a cheap status probe plus the two quantization runs.
    with gr.Row():
        btn_status = gr.Button("Check Status", variant="secondary")
        btn_int4 = gr.Button("Run INT4 Quantization (Recommended)", variant="primary")
        btn_int8 = gr.Button("Run INT8 Quantization (May OOM)", variant="secondary")

    # All three handlers write their report into the same textbox.
    log_box = gr.Textbox(label="Output", lines=20, max_lines=50)

    btn_status.click(fn=get_status, outputs=log_box)
    btn_int4.click(fn=run_int4, outputs=log_box)
    btn_int8.click(fn=run_int8, outputs=log_box)

if __name__ == "__main__":
    demo.launch()