import type { EmbedDataFileMeta } from "../editor/embeds/embed-data-store";
/**
* Light-weight client-side parsing of uploaded data files. Extracts
* column names and row counts so the sidebar and agent system prompt
* can preview dataset shape without shipping the full content every
* time.
*
* Parsing is deliberately forgiving: it never throws on malformed
* input, and returns `undefined` metadata fields when shape cannot be
* inferred.
*/
/**
 * Lower-case file extensions (without the dot) that `isAcceptedExt`
 * recognises. `as const` keeps every entry a string-literal type so the
 * union below can be derived from the array itself.
 * Note: "txt" is listed here, but `inferDataShape` has no parser for it
 * and returns an empty shape for that extension.
 */
export const ACCEPTED_DATA_EXTS = ["csv", "tsv", "json", "txt", "ndjson"] as const;
/** Union of the literal extension strings listed in `ACCEPTED_DATA_EXTS`. */
export type AcceptedDataExt = (typeof ACCEPTED_DATA_EXTS)[number];
/**
 * Size ceiling, 3 * 1024 * 1024 = 3,145,728.
 * NOTE(review): presumably a byte count applied to uploads; the check that
 * enforces it is not in this part of the file — confirm against the caller.
 */
export const MAX_DATA_FILE_SIZE = 3 * 1024 * 1024;
/**
 * Extract the lower-cased extension from a file name.
 *
 * Only a trailing run of ASCII letters/digits after the final dot counts
 * as an extension, so `"archive.tar.gz"` yields `"gz"` and `"notes.c-v"`
 * yields `""`.
 *
 * @param name File name (a full path works too; only the tail is inspected).
 * @returns The extension without its dot, or `""` when there is none.
 */
export function extFromName(name: string): string {
  const found = /\.([a-z0-9]+)$/.exec(name.toLowerCase());
  if (found === null) {
    return "";
  }
  return found[1];
}
/**
 * Type guard: reports whether `ext` is one of `ACCEPTED_DATA_EXTS`.
 *
 * The comparison is exact (case-sensitive, no leading dot), so callers
 * should pass a normalised extension such as the result of `extFromName`.
 *
 * @param ext Candidate extension.
 * @returns `true` (narrowing `ext` to `AcceptedDataExt`) when it is listed.
 */
export function isAcceptedExt(ext: string): ext is AcceptedDataExt {
  const accepted: readonly string[] = ACCEPTED_DATA_EXTS;
  return accepted.some((candidate) => candidate === ext);
}
/**
 * Split one line of delimited text into its cells.
 *
 * Double quotes toggle a quoted section in which the delimiter is taken
 * literally; inside such a section a doubled quote (`""`) stands for one
 * literal quote character. The quote characters themselves are never
 * copied into the output. An unterminated quote simply runs to the end
 * of the line - nothing is thrown.
 *
 * @param line  A single line, without its line terminator.
 * @param delim Delimiter, compared against one character at a time.
 * @returns The cells in order; always at least one element (`[""]` for an empty line).
 */
function splitCsvLine(line: string, delim: string): string[] {
  const fields: string[] = [];
  let field = "";
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line[pos];
    pos += 1; // `pos` now points at the character after `ch`

    if (ch === '"') {
      if (!quoted) {
        quoted = true;
      } else if (line[pos] === '"') {
        // Escaped quote inside a quoted section: emit one and skip its twin.
        field += '"';
        pos += 1;
      } else {
        quoted = false;
      }
      continue;
    }

    if (!quoted && ch === delim) {
      fields.push(field);
      field = "";
      continue;
    }

    field += ch;
  }

  // The last cell has no trailing delimiter, so flush it explicitly.
  fields.push(field);
  return fields;
}
/**
 * Shape summary returned by the parsers in this file. Both fields are
 * optional: a parser leaves out (or sets to `undefined`) whatever it
 * could not infer, and an empty object means nothing was inferred.
 */
interface ParsedShape {
/**
 * Record count. Delimited files: non-empty lines minus the header line.
 * JSON: length of the top-level array. NDJSON: number of non-blank lines.
 */
rowCount?: number;
/**
 * Column names: trimmed header cells for delimited files, object keys
 * for JSON / NDJSON input.
 */
columns?: string[];
}
/**
 * Infer the shape of delimiter-separated text (CSV, TSV, ...).
 *
 * The first non-empty line is treated as the header; every further
 * non-empty line counts as one row. Lines are split on CRLF, LF or CR
 * before any quote handling, so a quoted cell that contains a line break
 * is counted as additional rows.
 *
 * @param content Full file text.
 * @param delim   Cell delimiter handed to `splitCsvLine`.
 * @returns `{}` when there is no non-empty line, otherwise the trimmed
 *          header cells and the number of lines after the header.
 */
function parseDelimited(content: string, delim: string): ParsedShape {
  const rows: string[] = [];
  for (const line of content.split(/\r\n|\n|\r/)) {
    if (line.length > 0) {
      rows.push(line);
    }
  }

  if (rows.length === 0) {
    return {};
  }

  const columns = splitCsvLine(rows[0], delim).map((cell) => cell.trim());
  // At least the header is present here, so the subtraction cannot go negative.
  return { columns, rowCount: rows.length - 1 };
}
/**
 * Infer the shape of a JSON document.
 *
 * - Top-level array: `rowCount` is the array length and `columns` are the
 *   keys of the first element that is a plain (non-array) object. Nested
 *   arrays are skipped so an array-of-arrays does not report its indices
 *   ("0", "1", ...) as column names; this matches `parseNdjson`.
 * - Top-level object: `columns` are its keys; no `rowCount`.
 * - Anything else, or text that fails to parse: `{}`.
 *
 * Never throws.
 *
 * @param content Full file text.
 * @returns The inferred shape, possibly empty.
 */
function parseJson(content: string): ParsedShape {
  const isRecord = (value: unknown): value is Record<string, unknown> =>
    typeof value === "object" && value !== null && !Array.isArray(value);

  let parsed: unknown;
  try {
    parsed = JSON.parse(content);
  } catch {
    // Malformed JSON is expected for user uploads; report "no shape".
    return {};
  }

  if (Array.isArray(parsed)) {
    const sample = parsed.find(isRecord);
    return {
      rowCount: parsed.length,
      columns: sample !== undefined ? Object.keys(sample) : undefined,
    };
  }
  if (isRecord(parsed)) {
    return { columns: Object.keys(parsed) };
  }
  return {};
}
/**
 * Infer the shape of newline-delimited JSON.
 *
 * Every non-blank line counts as one row, whether or not it is valid
 * JSON. Column names come from the first non-blank line only, and only
 * when that line parses to a non-array object; otherwise `columns` is
 * `undefined`.
 *
 * @param content Full file text.
 * @returns `{}` when every line is blank, otherwise the row count and
 *          (possibly undefined) columns.
 */
function parseNdjson(content: string): ParsedShape {
  const records: string[] = [];
  for (const line of content.split(/\r\n|\n|\r/)) {
    if (line.trim() !== "") {
      records.push(line);
    }
  }
  if (records.length === 0) {
    return {};
  }

  // Best effort: a first line that is not a JSON object leaves columns unset.
  const readColumns = (raw: string): string[] | undefined => {
    try {
      const value: unknown = JSON.parse(raw);
      if (typeof value === "object" && value !== null && !Array.isArray(value)) {
        return Object.keys(value);
      }
    } catch {
      // fall through to undefined
    }
    return undefined;
  };

  return { rowCount: records.length, columns: readColumns(records[0]) };
}
/**
 * Pick the parser that matches a file extension and run it on `content`.
 *
 * The extension is compared exactly (lower-case, no dot). "csv" and "tsv"
 * use the delimited parser with `,` and a tab respectively; "json" and
 * "ndjson" use their own parsers. Any other value - including "txt" -
 * yields an empty shape.
 *
 * @param ext     Normalised file extension.
 * @param content Full file text.
 * @returns The inferred shape, or `{}` when no parser applies.
 */
export function inferDataShape(ext: string, content: string): ParsedShape {
  if (ext === "csv") return parseDelimited(content, ",");
  if (ext === "tsv") return parseDelimited(content, "\t");
  if (ext === "json") return parseJson(content);
  if (ext === "ndjson") return parseNdjson(content);
  return {};
}
/**
 * Build a lightweight manifest line per file suitable for inclusion in
 * the agent system prompt. Keeps it to one line per file so many
 * datasets can coexist without exploding prompt size.
 *
 * File names and column names originate from uploaded data, so the
 * one-line / bounded-size promise is enforced here rather than assumed:
 * at most 12 columns are listed, each column label is cut to 64
 * characters (followed by "..."), and any line break in the finished
 * entry is replaced by a space.
 *
 * @param meta Metadata of one uploaded file.
 * @returns A single-line entry of the form
 *          `- name (EXT, size - N rows) - columns: a, b, ...`.
 */
export function formatManifestLine(meta: EmbedDataFileMeta): string {
  const maxColumns = 12;
  const maxColumnChars = 64;

  const clip = (label: string): string => {
    if (label.length <= maxColumnChars) return label;
    let head = label.slice(0, maxColumnChars);
    // Do not leave half of a surrogate pair at the cut point.
    if (/[\uD800-\uDBFF]$/.test(head)) head = head.slice(0, -1);
    return `${head}...`;
  };

  const size = formatBytes(meta.size);
  const shape = meta.rowCount !== undefined ? ` - ${meta.rowCount} rows` : "";
  const cols =
    meta.columns && meta.columns.length > 0
      ? ` - columns: ${meta.columns
          .slice(0, maxColumns)
          .map((label) => clip(label))
          .join(", ")}${meta.columns.length > maxColumns ? ", ..." : ""}`
      : "";

  const line = `- ${meta.name} (${meta.ext.toUpperCase()}, ${size}${shape})${cols}`;
  // A line break in the name or in a column label would split the entry.
  return line.replace(/[\r\n\u2028\u2029]+/g, " ");
}
/**
 * Render a byte count as a short human-readable string using 1024-based
 * units: whole bytes below 1 KB, one decimal for KB, two decimals for MB.
 *
 * Values just under 1 MB that would round up to "1024.0 KB" are reported
 * as "1.00 MB" instead, so the KB figure never reaches 1024.
 *
 * @param bytes Size in bytes.
 * @returns For example `"512 B"`, `"1.5 KB"` or `"3.00 MB"`.
 */
export function formatBytes(bytes: number): string {
  const kib = 1024;
  const mib = kib * kib;

  if (bytes < kib) return `${bytes} B`;
  if (bytes < mib) {
    const kb = (bytes / kib).toFixed(1);
    // toFixed rounds 1023.95 and above up to "1024.0"; promote those to MB.
    if (kb !== "1024.0") return `${kb} KB`;
  }
  return `${(bytes / mib).toFixed(2)} MB`;
}
|