hammh0a committed on
Commit
681a670
·
verified ·
1 Parent(s): ebd50fa

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +1 -62
README.md CHANGED
@@ -3,65 +3,4 @@ base_model:
3
  - CohereLabs/command-a-translate-08-2025
4
  ---
5
 
6
- FP8 Quantized version of: [CohereLabs/command-a-translate-08-2025](https://huggingface.co/CohereLabs/command-a-translate-08-2025)
7
-
8
- Code used to perform the quantization with `llmcompressor`:
9
-
10
- ```
11
- from transformers import AutoTokenizer, AutoModelForCausalLM
12
- from llmcompressor import oneshot
13
- from llmcompressor.modifiers.quantization import QuantizationModifier
14
- import torch
15
- import time
16
-
17
- MODEL_ID = "CohereLabs/command-a-translate-08-2025"
18
-
19
- # Check your GPUs
20
- print(f"Found {torch.cuda.device_count()} GPUs")
21
- for i in range(torch.cuda.device_count()):
22
- print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
23
- print(f" Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB")
24
-
25
- start_time = time.time()
26
-
27
- # Load model across all 4 GPUs
28
- print("Loading model across 4x A100 GPUs...")
29
- model = AutoModelForCausalLM.from_pretrained(
30
- MODEL_ID,
31
- torch_dtype=torch.bfloat16,
32
- device_map="auto", # Automatically distributes across all GPUs
33
- low_cpu_mem_usage=True,
34
- trust_remote_code=True,
35
- max_memory={
36
- 0: "70GB", # Leave some headroom on each GPU
37
- 1: "70GB",
38
- 2: "70GB",
39
- 3: "70GB",
40
- "cpu": "800GB" # Use CPU for overflow if needed
41
- }
42
- )
43
-
44
- tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
45
-
46
- print("Model distributed across GPUs!")
47
- print(model.hf_device_map) # Shows which layers are on which device
48
-
49
- # Apply FP8 quantization
50
- recipe = QuantizationModifier(
51
- targets="Linear",
52
- scheme="FP8_DYNAMIC",
53
- ignore=["lm_head"]
54
- )
55
-
56
- print("Starting FP8 quantization on multi-GPU setup...")
57
- oneshot(model=model, recipe=recipe)
58
-
59
- # Save quantized model
60
- SAVE_DIR = "command-a-translate-FP8-Dynamic"
61
- print(f"Saving to {SAVE_DIR}...")
62
- model.save_pretrained(SAVE_DIR, safe_serialization=True)
63
- tokenizer.save_pretrained(SAVE_DIR)
64
-
65
- elapsed = time.time() - start_time
66
- print(f"✓ Quantization completed in {elapsed/60:.2f} minutes!")
67
- ```
 
3
  - CohereLabs/command-a-translate-08-2025
4
  ---
5
 
6
+ FP8 Quantized version of: [CohereLabs/command-a-translate-08-2025](https://huggingface.co/CohereLabs/command-a-translate-08-2025)