---
license: other
license_name: lfm1.0
license_link: LICENSE
language:
- en
pipeline_tag: text-generation
tags:
- liquid
- edge
- lfm2
- transcript
- meeting
- summarization
- onnx
- onnxruntime
- webgpu
base_model:
- LiquidAI/LFM2-2.6B-Transcript
---

<div align="center">
<img
src="https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/2b08LKpev0DNEk6DlnWkY.png"
alt="Liquid AI"
style="width: 100%; max-width: 100%; height: auto; display: inline-block; margin-bottom: 0.5em; margin-top: 0.5em;"
/>
<div style="display: flex; justify-content: center; gap: 0.5em; margin-bottom: 1em;">
<a href="https://playground.liquid.ai/"><strong>Try LFM</strong></a> •
<a href="https://docs.liquid.ai/lfm"><strong>Documentation</strong></a> •
<a href="https://leap.liquid.ai/"><strong>LEAP</strong></a>
</div>
</div>

# LFM2-2.6B-Transcript-ONNX

ONNX export of [LFM2-2.6B-Transcript](https://huggingface.co/LiquidAI/LFM2-2.6B-Transcript) for cross-platform inference.

LFM2-2.6B-Transcript is optimized for processing and summarizing meeting transcripts, extracting key points, action items, and decisions from conversational text.

## Recommended Variants

| Precision | Size | Platform | Use Case |
|-----------|------|----------|----------|
| Q4 | ~2.0 GB | WebGPU, Server | Recommended for most uses |
| FP16 | ~4.8 GB | WebGPU, Server | Higher quality |
| Q8 | ~3.0 GB | Server only | Balance of quality and size |

- **WebGPU**: Use Q4 or FP16 (Q8 not supported)
- **Server**: All variants supported (see the download sketch below)
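
For server-side use, each row of the table maps directly onto a file in the `onnx/` folder (listed in the next section). A minimal download sketch; `VARIANT_FILES` is just an illustrative helper, with paths taken from the Model Files section:

```python
from huggingface_hub import hf_hub_download

# Illustrative mapping from the table's precision names to repo file paths
VARIANT_FILES = {
    "q4": "onnx/model_q4.onnx",      # recommended for most uses
    "fp16": "onnx/model_fp16.onnx",  # higher quality, larger download
    "q8": "onnx/model_q8.onnx",      # server only
}

model_path = hf_hub_download("LiquidAI/LFM2-2.6B-Transcript-ONNX", VARIANT_FILES["q4"])
```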

## Model Files

```
onnx/
├── model.onnx        # FP32
├── model_fp16.onnx   # FP16
├── model_q4.onnx     # Q4 (recommended)
└── model_q8.onnx     # Q8
```
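
The Q4 graph stores its weights in an external-data sidecar (`model_q4.onnx_data`, downloaded alongside the graph in the Python example below); whether the other variants ship sidecars is not documented here, so it can help to list the repository first. A small sketch using `huggingface_hub`:

```python
from huggingface_hub import list_repo_files

# Enumerate the ONNX files actually present, including any *_data sidecars
files = [f for f in list_repo_files("LiquidAI/LFM2-2.6B-Transcript-ONNX") if f.startswith("onnx/")]
print("\n".join(files))
```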

## Python

### Installation

```bash
pip install onnxruntime transformers numpy huggingface_hub
# or with GPU support:
pip install onnxruntime-gpu transformers numpy huggingface_hub
```
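
If you installed `onnxruntime-gpu`, you can request the CUDA execution provider when creating the session and let ONNX Runtime fall back to CPU automatically; the inference example below uses the default (CPU) provider. A minimal sketch:

```python
import onnxruntime as ort

# Prefer CUDA when onnxruntime-gpu is installed; ORT falls back to CPU otherwise
session = ort.InferenceSession(
    "onnx/model_q4.onnx",  # local path, e.g. from hf_hub_download as shown below
    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
)
print(session.get_providers())  # shows which providers are actually active
```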

### Inference

```python
import numpy as np
import onnxruntime as ort
from huggingface_hub import hf_hub_download
from transformers import AutoTokenizer

# Download model (Q4 recommended); the .onnx_data sidecar must sit next to the graph
model_id = "LiquidAI/LFM2-2.6B-Transcript-ONNX"
model_path = hf_hub_download(model_id, "onnx/model_q4.onnx")
data_path = hf_hub_download(model_id, "onnx/model_q4.onnx_data")

# Load model and tokenizer
session = ort.InferenceSession(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

# Prepare chat input
messages = [{"role": "user", "content": "Summarize this meeting transcript: ..."}]
prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
input_ids = np.array([tokenizer.encode(prompt, add_special_tokens=False)], dtype=np.int64)

# Initialize an empty KV cache: fixed dims keep their size, dynamic batch dims
# become 1, and dynamic sequence dims start at length 0
ONNX_DTYPE = {"tensor(float)": np.float32, "tensor(float16)": np.float16, "tensor(int64)": np.int64}
cache = {}
for inp in session.get_inputs():
    if inp.name in {"input_ids", "attention_mask", "position_ids"}:
        continue
    shape = [d if isinstance(d, int) else 1 for d in inp.shape]
    for i, d in enumerate(inp.shape):
        if isinstance(d, str) and "sequence" in d.lower():
            shape[i] = 0
    cache[inp.name] = np.zeros(shape, dtype=ONNX_DTYPE.get(inp.type, np.float32))

# Check whether this export takes position_ids
input_names = {inp.name for inp in session.get_inputs()}
use_position_ids = "position_ids" in input_names

# Generate: prefill the whole prompt on step 0, then decode one token at a time
seq_len = input_ids.shape[1]
generated_tokens = []

for step in range(100):  # max new tokens
    if step == 0:
        ids = input_ids
        pos = np.arange(seq_len, dtype=np.int64).reshape(1, -1)
    else:
        ids = np.array([[generated_tokens[-1]]], dtype=np.int64)
        pos = np.array([[seq_len + len(generated_tokens) - 1]], dtype=np.int64)

    attn_mask = np.ones((1, seq_len + len(generated_tokens)), dtype=np.int64)
    feed = {"input_ids": ids, "attention_mask": attn_mask, **cache}
    if use_position_ids:
        feed["position_ids"] = pos

    outputs = session.run(None, feed)
    next_token = int(np.argmax(outputs[0][0, -1]))
    generated_tokens.append(next_token)

    # Feed each present_* output back in as the matching past_* input
    for i, out in enumerate(session.get_outputs()[1:], 1):
        name = out.name.replace("present_conv", "past_conv").replace("present.", "past_key_values.")
        if name in cache:
            cache[name] = outputs[i]

    if next_token == tokenizer.eos_token_id:
        break

print(tokenizer.decode(generated_tokens, skip_special_tokens=True))
```
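
The loop above decodes greedily (`argmax` over the final logits), a sensible default for summarization. If you want more varied output, you could swap the `argmax` line for temperature sampling; this is a sketch, not part of the original example:

```python
import numpy as np

def sample_token(logits: np.ndarray, temperature: float = 0.7) -> int:
    # Softmax over temperature-scaled logits, then sample one token id
    scaled = logits.astype(np.float64) / max(temperature, 1e-6)
    probs = np.exp(scaled - scaled.max())
    probs /= probs.sum()
    return int(np.random.choice(len(probs), p=probs))

# Drop-in replacement for the greedy line inside the loop:
# next_token = sample_token(outputs[0][0, -1])
```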

## WebGPU (Browser)

### Installation

```bash
npm install @huggingface/transformers
```

### Inference

```javascript
import { AutoModelForCausalLM, AutoTokenizer, TextStreamer } from "@huggingface/transformers";

const modelId = "LiquidAI/LFM2-2.6B-Transcript-ONNX";

// Load model and tokenizer
const tokenizer = await AutoTokenizer.from_pretrained(modelId);
const model = await AutoModelForCausalLM.from_pretrained(modelId, {
  device: "webgpu",
  dtype: "q4", // or "fp16"
});

// Prepare input
const messages = [{ role: "user", content: "Summarize this meeting transcript: ..." }];
const input = tokenizer.apply_chat_template(messages, {
  add_generation_prompt: true,
  return_dict: true,
});

// Generate with streaming
const streamer = new TextStreamer(tokenizer, { skip_prompt: true });
const output = await model.generate({
  ...input,
  max_new_tokens: 256,
  do_sample: false,
  streamer,
});

console.log(tokenizer.decode(output[0], { skip_special_tokens: true }));
```

### WebGPU Notes

- Enable WebGPU if needed: `chrome://flags/#enable-unsafe-webgpu`
- Supported: Q4, FP16 (Q8 not supported on WebGPU)

## License

This model is released under the [LFM 1.0 License](LICENSE).