File size: 3,404 Bytes
/**
 * One message extracted from a prompt: who is speaking and what was said.
 */
export interface ParsedMessage {
  /** Speaker label for the message (for example "system", "user" or "assistant"). */
  role: string;
  /** Text of the message. */
  content: string;
}
/** Returns true for any non-null object (arrays included). */
function isRecord(value: unknown): value is Record<string, unknown> {
  return typeof value === "object" && value !== null;
}

/**
 * Converts a JSON field to display text. Strings pass through unchanged;
 * objects and arrays are JSON-serialised so structured content stays readable
 * instead of collapsing to "[object Object]".
 */
function stringifyField(value: unknown): string {
  if (typeof value === "string") return value;
  if (isRecord(value)) return JSON.stringify(value);
  return String(value);
}

/**
 * Format 1: a JSON array whose first element is an object with a `role` property.
 * Returns null when the text is not JSON or does not have that shape.
 */
function parseJsonMessages(text: string): ParsedMessage[] | null {
  let parsed: unknown;
  try {
    parsed = JSON.parse(text);
  } catch {
    // Not JSON; the caller moves on to the textual formats.
    return null;
  }
  if (!Array.isArray(parsed) || parsed.length === 0) return null;

  const first: unknown = parsed[0];
  if (!isRecord(first) || first.role === undefined) return null;

  return parsed.map((entry: unknown) => {
    // Entries that are not objects carry no fields, so they get the defaults.
    const fields = isRecord(entry) ? entry : {};
    return {
      role: stringifyField(fields.role || "unknown"),
      content: stringifyField(fields.content ?? ""),
    };
  });
}

/** Format 2: ChatML, i.e. `<|im_start|>role\ncontent<|im_end|>`. */
function parseChatMl(text: string): ParsedMessage[] {
  const stripEndTags = (s: string): string => s.replace(/<\|im_end\|>/g, "").trim();
  const [preamble = "", ...blocks] = text.split("<|im_start|>");
  const messages: ParsedMessage[] = [];

  // Text ahead of the first marker is not a ChatML message, so its first line
  // must not be read as a role. Keep it as one "unknown" message if non-blank.
  const lead = stripEndTags(preamble);
  if (lead) messages.push({ role: "unknown", content: lead });

  for (const block of blocks) {
    if (!block) continue;
    const newlineAt = block.indexOf("\n");
    // The role is the first line; a block without a (non-leading) newline has no role line.
    const hasRoleLine = newlineAt > 0;
    messages.push({
      role: hasRoleLine ? block.slice(0, newlineAt).trim() : "unknown",
      content: stripEndTags(hasRoleLine ? block.slice(newlineAt + 1) : block),
    });
  }
  return messages;
}

/** Format 3: generic chat-template tags `<|system|>`, `<|user|>`, `<|assistant|>`. */
function parseTaggedTemplate(text: string): ParsedMessage[] {
  const tag = /<\|(system|user|assistant)\|>/g;
  const marks: { role: string; start: number; tagEnd: number }[] = [];
  let match: RegExpExecArray | null;
  while ((match = tag.exec(text)) !== null) {
    // With the `g` flag, lastIndex sits just past the tag that was matched.
    marks.push({ role: match[1] ?? "", start: match.index, tagEnd: tag.lastIndex });
  }
  return marks.map((mark, i) => {
    const next = marks[i + 1];
    // A message runs from the end of its tag to the start of the next tag.
    const end = next ? next.start : text.length;
    return { role: mark.role, content: text.slice(mark.tagEnd, end).trim() };
  });
}

/** Format 4: Llama-style markers `<<SYS>>…<</SYS>>`, `[INST]`, `[/INST]`. */
function parseLlama(text: string): ParsedMessage[] {
  const messages: ParsedMessage[] = [];

  const sys = /<<SYS>>([\s\S]*?)<<\/SYS>>/.exec(text);
  if (sys) messages.push({ role: "system", content: (sys[1] ?? "").trim() });

  const withoutSys = text.replace(/<<SYS>>[\s\S]*?<<\/SYS>>/g, "");

  // Splitting on a capturing group keeps the markers in the output, so each
  // segment's role comes from the marker before it rather than from blind
  // alternation, which drifts once an empty segment is dropped.
  let role = "user"; // text ahead of any marker is treated as user text
  for (const token of withoutSys.split(/(\[INST\]|\[\/INST\])/)) {
    if (token === "[INST]") {
      role = "user";
    } else if (token === "[/INST]") {
      role = "assistant";
    } else {
      const content = token.trim();
      if (content) messages.push({ role, content });
    }
  }

  return messages.length > 0 ? messages : [{ role: "prompt", content: text }];
}

/** Format 5: line-leading labels `System:`, `User:`, `Assistant:`, `Human:`. */
function parseLabeled(text: string): ParsedMessage[] {
  const label = /^(System|User|Assistant|Human):\s*/gm;
  const marks: { role: string; labelStart: number; contentStart: number }[] = [];
  let match: RegExpExecArray | null;
  while ((match = label.exec(text)) !== null) {
    const name = (match[1] ?? "").toLowerCase();
    marks.push({
      role: name === "human" ? "user" : name,
      labelStart: match.index,
      // lastIndex is just past the label and its trailing whitespace.
      contentStart: label.lastIndex,
    });
  }
  return marks.map((mark, i) => {
    const next = marks[i + 1];
    // End exactly where the next label begins, so an empty message stays empty
    // instead of swallowing the messages after it.
    const end = next ? next.labelStart : text.length;
    return { role: mark.role, content: text.slice(mark.contentStart, end).trim() };
  });
}

/**
 * Parses a raw prompt string into an ordered list of role/content messages.
 *
 * Formats are tried in this order and the first one that matches wins:
 *  1. JSON array whose first element is an object with a `role` property.
 *  2. ChatML (`<|im_start|>role\ncontent<|im_end|>`).
 *  3. Generic chat-template tags (`<|system|>`, `<|user|>`, `<|assistant|>`).
 *  4. Llama-style markers (`<<SYS>>`, `[INST]`, `[/INST]`).
 *  5. Line-leading labels (`System:`, `User:`, `Assistant:`, `Human:`), where
 *     `Human` is reported as role "user".
 *
 * @param text - Raw prompt text.
 * @returns The parsed messages. Empty or whitespace-only input yields `[]`.
 *   When no format matches, a single message with role "prompt" holding the
 *   unmodified text is returned.
 */
export function parsePrompt(text: string): ParsedMessage[] {
  if (!text || !text.trim()) return [];

  const fromJson = parseJsonMessages(text);
  if (fromJson) return fromJson;

  if (text.includes("<|im_start|>")) return parseChatMl(text);
  if (/<\|(system|user|assistant)\|>/.test(text)) return parseTaggedTemplate(text);
  if (text.includes("[INST]") || text.includes("<<SYS>>")) return parseLlama(text);
  if (/^(System|User|Assistant|Human):\s/m.test(text)) return parseLabeled(text);

  // Fallback: single prompt block
  return [{ role: "prompt", content: text }];
}
|