hammh0a committed ebd50fa (verified) · 1 Parent(s): bb84861

Update README.md

Files changed (1): README.md +61 -0
README.md CHANGED
@@ -4,3 +4,64 @@ base_model:
 ---

 FP8 Quantized version of: [CohereLabs/command-a-translate-08-2025](https://huggingface.co/CohereLabs/command-a-translate-08-2025)
+
+Code used to perform the quantization with `llmcompressor`:
+
+```python
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from llmcompressor import oneshot
+from llmcompressor.modifiers.quantization import QuantizationModifier
+import torch
+import time
+
+MODEL_ID = "CohereLabs/command-a-translate-08-2025"
+
+# Check your GPUs
+print(f"Found {torch.cuda.device_count()} GPUs")
+for i in range(torch.cuda.device_count()):
+    print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
+    print(f"  Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
+
+start_time = time.time()
+
+# Load model across all 4 GPUs
+print("Loading model across 4x A100 GPUs...")
+model = AutoModelForCausalLM.from_pretrained(
+    MODEL_ID,
+    torch_dtype=torch.bfloat16,
+    device_map="auto",  # Automatically distributes across all GPUs
+    low_cpu_mem_usage=True,
+    trust_remote_code=True,
+    max_memory={
+        0: "70GB",  # Leave some headroom on each GPU
+        1: "70GB",
+        2: "70GB",
+        3: "70GB",
+        "cpu": "800GB",  # Use CPU for overflow if needed
+    },
+)
+
+tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
+
+print("Model distributed across GPUs!")
+print(model.hf_device_map)  # Shows which layers are on which device
+
+# Apply FP8 quantization to all Linear layers except the lm_head
+recipe = QuantizationModifier(
+    targets="Linear",
+    scheme="FP8_DYNAMIC",
+    ignore=["lm_head"],
+)
+
+print("Starting FP8 quantization on multi-GPU setup...")
+oneshot(model=model, recipe=recipe)
+
+# Save quantized model
+SAVE_DIR = "command-a-translate-FP8-Dynamic"
+print(f"Saving to {SAVE_DIR}...")
+model.save_pretrained(SAVE_DIR, safe_serialization=True)
+tokenizer.save_pretrained(SAVE_DIR)
+
+elapsed = time.time() - start_time
+print(f"✓ Quantization completed in {elapsed/60:.2f} minutes!")
+```
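
For reference, the `FP8_DYNAMIC` scheme stores the Linear weights in FP8 with static scales and quantizes activations dynamically per token at runtime, which is why the `oneshot` call above needs no calibration dataset. Below is a minimal smoke-test sketch (not part of the original commit) for loading the saved checkpoint with vLLM, which reads compressed-tensors FP8 checkpoints natively; the prompt, `tensor_parallel_size`, and sampling settings are illustrative assumptions, not values from the original workflow.

```python
# Hypothetical smoke test: assumes vLLM is installed and the local
# SAVE_DIR produced by the script above is available. Adjust
# tensor_parallel_size to match your hardware.
from vllm import LLM, SamplingParams

llm = LLM(
    model="command-a-translate-FP8-Dynamic",  # SAVE_DIR from the script above
    tensor_parallel_size=4,                   # assumption: same 4-GPU setup
)
params = SamplingParams(temperature=0.0, max_tokens=128)
outputs = llm.generate(["Translate to German: The weather is nice today."], params)
print(outputs[0].outputs[0].text)
```

Serving the same directory with `vllm serve command-a-translate-FP8-Dynamic --tensor-parallel-size 4` should work equivalently.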