Spaces:
Running
Running
ping98k
Enhance cluster naming and K-Means functionality; implement random selection for cluster names, improve centroid initialization, and adjust UMAP parameters for better projection accuracy.
935873d | import { AutoTokenizer, AutoModelForCausalLM } from 'https://cdn.jsdelivr.net/npm/@huggingface/transformers@3.6.0'; | |
| import { prompt_cluster } from "./prompt_cluster.js"; | |
// Hugging Face Hub id shared by the tokenizer and the model weights.
const MODEL_ID = "onnx-community/Qwen3-0.6B-ONNX";

// The tokenizer files and the model weights are independent downloads, so
// they are fetched concurrently instead of one after the other. Both bindings
// are resolved once at module load (top-level await) and reused by every call.
const [tokenizer, model] = await Promise.all([
  AutoTokenizer.from_pretrained(MODEL_ID),
  AutoModelForCausalLM.from_pretrained(MODEL_ID, { device: "webgpu", dtype: "q4f16" }),
]);
/**
 * Returns at most `count` elements of `items`, chosen uniformly at random
 * without replacement. The input array is never mutated.
 *
 * @param {Array} items - Candidate elements.
 * @param {number} count - Maximum number of elements to return.
 * @returns {Array} `items` itself when it already fits, otherwise a new array
 *   holding `count` randomly chosen elements.
 */
function pickRandomSubset(items, count) {
  if (items.length <= count) {
    return items;
  }
  const pool = [...items];
  // Partial Fisher–Yates shuffle: only the first `count` slots need to be
  // randomized, and the loop is guaranteed to finish in `count` steps.
  for (let i = 0; i < count; i += 1) {
    const j = i + Math.floor(Math.random() * (pool.length - i));
    [pool[i], pool[j]] = [pool[j], pool[i]];
  }
  return pool.slice(0, count);
}

/**
 * Strips chat-template markup from decoded model output, leaving the answer.
 *
 * @param {string} decoded - Decoded text with special tokens kept.
 * @returns {string} Text after the last `</think>` (if any) and before the
 *   first following `<|im_end|>` (if any), trimmed.
 */
function extractClusterName(decoded) {
  const THINK_CLOSE_TAG = "</think>";
  const TURN_END_TAG = "<|im_end|>";

  let name = decoded;
  const thinkEnd = name.lastIndexOf(THINK_CLOSE_TAG);
  if (thinkEnd !== -1) {
    name = name.slice(thinkEnd + THINK_CLOSE_TAG.length);
  }
  const turnEnd = name.indexOf(TURN_END_TAG);
  if (turnEnd !== -1) {
    name = name.slice(0, turnEnd);
  }
  return name.trim();
}

/**
 * Asks the language model for a name describing a cluster of text lines.
 *
 * When the cluster has more than five lines, five of them are sampled at
 * random so the prompt stays short. Generation uses sampling
 * (`do_sample: true`), so repeated calls may return different names.
 *
 * @param {string[]} lines - Text lines belonging to one cluster (joined with
 *   newlines into the user message; presumably plain strings).
 * @returns {Promise<string>} The generated cluster name with template markup
 *   removed.
 */
export async function nameCluster(lines) {
  const sampled = pickRandomSubset(lines, 5);
  const messages = [
    { role: "system", content: prompt_cluster },
    { role: "user", content: `Input:\n${sampled.join("\n")}\nOutput:` },
  ];
  const inputs = tokenizer.apply_chat_template(messages, {
    add_generation_prompt: true,
    return_dict: true,
    enable_thinking: false,
  });
  const outputTokens = await model.generate({
    ...inputs,
    max_new_tokens: 1024,
    do_sample: true,
    temperature: 0.6,
  });

  // The generated sequence starts with the prompt tokens. Drop them before
  // decoding so parsing never depends on markers that happen to be in the
  // prompt (system text, user lines, or the template's own tags).
  const promptLength = inputs.input_ids.dims.at(-1);
  const completionTokens = outputTokens.slice(null, [promptLength, null]);

  const decoded = tokenizer.decode(completionTokens[0], { skip_special_tokens: false });
  return extractClusterName(decoded);
}