ping98k commited on
Commit
f08e6a1
·
1 Parent(s): 7376f34

move to main

Browse files
Files changed (1) hide show
  1. main.js +176 -0
main.js ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
// Browser-side text-clustering demo: embeds lines with Qwen3-Embedding,
// clusters/plots them, and names clusters with a small Qwen3 chat model.
import { pipeline, TextStreamer, AutoTokenizer, AutoModelForCausalLM } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.6.0';
import { UMAP } from "https://cdn.jsdelivr.net/npm/umap-js@1.4.0/+esm";

// Feature-extraction pipeline (sentence embeddings), 4-bit quantized on WebGPU.
// NOTE(review): top-level await — the module blocks here until the model downloads.
const embed = await pipeline(
  "feature-extraction",
  "onnx-community/Qwen3-Embedding-0.6B-ONNX",
  { device: "webgpu", dtype: "q4f16" },
);
// Chat model + tokenizer used later to generate human-readable cluster names.
const tokenizer = await AutoTokenizer.from_pretrained("onnx-community/Qwen3-0.6B-ONNX");
const model = await AutoModelForCausalLM.from_pretrained("onnx-community/Qwen3-0.6B-ONNX", { device: "webgpu", dtype: "q4f16" });

// Instruction prefix used for retrieval-style prompts to the embedding model.
const task = "Given a textual input sentence, retrieve relevant categories that best describe it.";
14
// Build a group-vs-group cosine-similarity heatmap from the textarea.
// Groups are separated by runs of 3+ newlines; each group's embedding is the
// mean of its per-line embeddings.
document.getElementById("run").onclick = async () => {
  const text = document.getElementById("input").value;
  // Drop whitespace-only groups so we never hand `embed` an empty prompt list.
  const groups = text.split(/\n{3,}/).filter((g) => g.trim() !== "");
  const groupEmbeddings = [];
  for (const g of groups) {
    const lines = g.split(/\n/).filter((x) => x.trim() !== "");
    const prompts = lines.map((s) => `Instruct: ${task}\nQuery:${s}`);
    const out = await embed(prompts, { pooling: "mean", normalize: true });
    // `out` is a Tensor: prefer tolist() (nested arrays). The raw `data`
    // buffer is FLAT, so the fallback must reshape it via `dims` — indexing
    // the flat buffer as embeddings[0].length would be undefined.
    let embeddings;
    if (typeof out.tolist === "function") {
      embeddings = out.tolist();
    } else {
      const [rows, width] = out.dims;
      embeddings = Array.from({ length: rows }, (_, r) => out.data.slice(r * width, (r + 1) * width));
    }
    // Mean-pool the line embeddings into one vector per group.
    const dim = embeddings[0].length;
    const avg = new Float32Array(dim);
    for (const e of embeddings) {
      for (let i = 0; i < dim; i++) avg[i] += e[i];
    }
    for (let i = 0; i < dim; i++) avg[i] /= embeddings.length;
    groupEmbeddings.push(avg);
  }
  // Pairwise cosine similarity. The line embeddings are unit vectors, but the
  // averaged group vectors are not, so normalize explicitly in the formula.
  const n = groupEmbeddings.length;
  const sim = [];
  for (let i = 0; i < n; i++) {
    const row = [];
    for (let j = 0; j < n; j++) {
      let dot = 0, na = 0, nb = 0;
      for (let k = 0; k < groupEmbeddings[i].length; k++) {
        dot += groupEmbeddings[i][k] * groupEmbeddings[j][k];
        na += groupEmbeddings[i][k] ** 2;
        nb += groupEmbeddings[j][k] ** 2;
      }
      row.push(dot / Math.sqrt(na * nb));
    }
    sim.push(row);
  }
  const data = [{ z: sim, type: "heatmap", colorscale: "Viridis", zmin: 0, zmax: 1 }];
  Plotly.newPlot("plot-heatmap", data, {
    xaxis: { title: "Group", scaleanchor: "y", scaleratio: 1 },
    yaxis: { title: "Group", scaleanchor: "x", scaleratio: 1 },
    width: 500,
    height: 500,
    margin: { t: 40, l: 40, r: 10, b: 40 },
    title: "Group Similarity Heatmap",
  });
};
54
+
55
// --- K-Means Clustering ---
// Embeds every non-empty line, runs k-means, projects to 2D with UMAP, asks
// the chat model for a short name per cluster, then rewrites the textarea
// grouped by cluster and re-runs the heatmap handler.
document.getElementById("kmeans-btn").onclick = async () => {
  const progressBar = document.getElementById("progress-bar");
  const progressBarInner = document.getElementById("progress-bar-inner");
  progressBar.style.display = "block";
  progressBarInner.style.width = "0%";

  const text = document.getElementById("input").value;
  const lines = text.split(/\n/).map((x) => x.trim()).filter((x) => x);
  const prompts = lines.map((s) => `Instruct: ${task}\nQuery:${s}`);
  const out = await embed(prompts, { pooling: "mean", normalize: true });
  // Tensor -> nested arrays; the raw `data` buffer is flat, so reshape via dims.
  let embeddings;
  if (typeof out.tolist === "function") {
    embeddings = out.tolist();
  } else {
    const [rows, width] = out.dims;
    embeddings = Array.from({ length: rows }, (_, r) => out.data.slice(r * width, (r + 1) * width));
  }

  // --- K-means (20 fixed iterations, squared-Euclidean assignment) ---
  const n = embeddings.length, dim = embeddings[0].length;
  // Clamp k to [2, 20] and never request more clusters than points.
  const requested = Number.parseInt(document.getElementById("kmeans-k").value, 10) || 3;
  const k = Math.max(2, Math.min(20, Math.min(n, requested)));
  let centroids = Array.from({ length: k }, () => embeddings[Math.floor(Math.random() * n)].slice());
  let labels = new Array(n).fill(0);
  for (let iter = 0; iter < 20; ++iter) {
    // Assignment step: nearest centroid by squared distance.
    for (let i = 0; i < n; ++i) {
      let best = 0, bestDist = Infinity;
      for (let c = 0; c < k; ++c) {
        let dist = 0;
        for (let d = 0; d < dim; ++d) dist += (embeddings[i][d] - centroids[c][d]) ** 2;
        if (dist < bestDist) { bestDist = dist; best = c; }
      }
      labels[i] = best;
    }
    // Update step: centroid = mean of assigned points.
    centroids = Array.from({ length: k }, () => new Array(dim).fill(0));
    const counts = new Array(k).fill(0);
    for (let i = 0; i < n; ++i) {
      counts[labels[i]]++;
      for (let d = 0; d < dim; ++d) centroids[labels[i]][d] += embeddings[i][d];
    }
    for (let c = 0; c < k; ++c) {
      if (counts[c]) {
        for (let d = 0; d < dim; ++d) centroids[c][d] /= counts[c];
      } else {
        // Empty cluster: reseed from a random point instead of collapsing to
        // the zero vector (the original left a degenerate all-zero centroid).
        centroids[c] = embeddings[Math.floor(Math.random() * n)].slice();
      }
    }
  }

  // UMAP 2D projection for plotting.
  const umap = new UMAP({ nComponents: 2 });
  const proj = umap.fit(embeddings);
  // Group the original lines by assigned cluster.
  const clustered = Array.from({ length: k }, () => []);
  for (let i = 0; i < n; ++i) clustered[labels[i]].push(lines[i]);

  // --- Name each cluster with the chat model (with progress updates) ---
  const clusterNames = [];
  for (let c = 0; c < k; ++c) {
    progressBarInner.style.width = `${Math.round((c / k) * 100)}%`;
    if (clustered[c].length === 0) {
      // Nothing to describe — skip the expensive generation call.
      clusterNames.push(`Cluster ${c + 1}`);
      continue;
    }
    const joined = clustered[c].join("\n");
    const messages = [
      { role: "system", content: "You are a helpful assistant." },
      { role: "user", content: `Given the following texts, provide a short, descriptive name for this group:\n\n${joined}` },
    ];
    const reasonEnabled = false;
    const inputs = tokenizer.apply_chat_template(messages, {
      add_generation_prompt: true,
      return_dict: true,
      enable_thinking: reasonEnabled,
    });
    const [START_THINKING_TOKEN_ID, END_THINKING_TOKEN_ID] = tokenizer.encode("<think></think>", { add_special_tokens: false });
    let state = "answering";
    let startTime;
    let numTokens = 0;
    let tps;
    // Per-token hook: tracks thinking/answering state and tokens/sec for logs.
    const token_callback_function = (tokens) => {
      startTime ??= performance.now();
      if (numTokens++ > 0) {
        tps = (numTokens / (performance.now() - startTime)) * 1000;
      }
      switch (Number(tokens[0])) {
        case START_THINKING_TOKEN_ID:
          state = "thinking";
          break;
        case END_THINKING_TOKEN_ID:
          state = "answering";
          break;
      }
      console.log(state, tokens, tokenizer.decode(tokens));
    };
    const callback_function = (output) => {
      // You can update UI here if desired
      console.log({ output, tps, numTokens, state });
    };
    const streamer = new TextStreamer(tokenizer, {
      skip_prompt: true,
      skip_special_tokens: true,
      callback_function,
      token_callback_function,
    });
    const outputTokens = await model.generate({
      ...inputs,
      max_new_tokens: 32,
      do_sample: false,
      streamer,
    });
    // generate() returns prompt + completion; the original decoded the whole
    // sequence with special tokens kept, so the "name" contained the entire
    // chat prompt. Decode only the newly generated tokens, drop special
    // tokens, and strip any (empty) <think> block emitted by the template.
    const promptLength = inputs.input_ids.dims.at(-1);
    const generated = outputTokens.slice(null, [promptLength, null]);
    const name = tokenizer
      .batch_decode(generated, { skip_special_tokens: true })[0]
      .replace(/<think>[\s\S]*?<\/think>/g, "")
      .trim();
    clusterNames.push(name.length > 0 ? name : `Cluster ${c + 1}`);
  }
  progressBarInner.style.width = "100%";
  setTimeout(() => { progressBar.style.display = "none"; }, 400);

  // --- Scatter plot of the 2D projection, one trace per cluster ---
  const colors = ["red", "blue", "green", "orange", "purple", "cyan", "magenta", "yellow", "brown", "black", "lime", "navy", "teal", "olive", "maroon", "pink", "gray", "gold", "aqua", "indigo"];
  const traces = Array.from({ length: k }, (_, c) => ({
    x: [], y: [], text: [], mode: "markers", type: "scatter", name: clusterNames[c],
    marker: { color: colors[c % colors.length], size: 12, line: { width: 1, color: '#333' } },
  }));
  for (let i = 0; i < n; ++i) {
    traces[labels[i]].x.push(proj[i][0]);
    traces[labels[i]].y.push(proj[i][1]);
    traces[labels[i]].text.push(lines[i]);
  }
  Plotly.newPlot("plot-scatter", traces, {
    xaxis: { title: "UMAP-1", scaleanchor: "y", scaleratio: 1 },
    yaxis: { title: "UMAP-2", scaleanchor: "x", scaleratio: 1 },
    width: 1000,
    height: 500,
    margin: { t: 40, l: 40, r: 10, b: 40 },
    title: `K-Means Clustering (k=${k})`,
  });
  // Update textarea: group by cluster, separated by triple newlines
  document.getElementById("input").value = clustered.map((g) => g.join("\n")).join("\n\n\n");
  // Re-run heatmap after updating textarea
  document.getElementById("run").onclick();
};