File size: 4,201 Bytes
8fc8501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import type { EmbedDataFileMeta } from "../editor/embeds/embed-data-store";

/**
 * Light-weight client-side parsing of uploaded data files. Extracts
 * column names and row counts so the sidebar and agent system prompt
 * can preview dataset shape without shipping the full content every
 * time.
 *
 * Parsing is deliberately forgiving: it never throws on malformed
 * input, and returns `undefined` metadata fields when shape cannot be
 * inferred.
 */

/** Lower-case file extensions (no leading dot) recognised as data files. */
export const ACCEPTED_DATA_EXTS = [
  "csv",
  "tsv",
  "json",
  "txt",
  "ndjson",
] as const;

/** Union of the literal extension strings listed in `ACCEPTED_DATA_EXTS`. */
export type AcceptedDataExt = (typeof ACCEPTED_DATA_EXTS)[number];

/** Size limit for a data file: 3 * 1024 * 1024 (presumably a byte count). */
export const MAX_DATA_FILE_SIZE = 3 * 1024 ** 2;

/**
 * Extract the lower-cased extension from a file name.
 *
 * Only a trailing run of ASCII letters/digits after the last dot counts
 * as an extension; anything else yields the empty string.
 *
 * @param name - File name (may include several dots).
 * @returns The extension without the dot, or `""` when none is found.
 */
export function extFromName(name: string): string {
  const found = /\.([a-z0-9]+)$/.exec(name.toLowerCase());
  return found?.[1] ?? "";
}

/**
 * Type guard: reports whether `ext` is one of `ACCEPTED_DATA_EXTS`.
 *
 * The comparison is exact, so callers must pass the extension in the
 * same form as the list entries (lower-case, no dot).
 *
 * @param ext - Candidate extension.
 * @returns `true` when `ext` is an accepted extension.
 */
export function isAcceptedExt(ext: string): ext is AcceptedDataExt {
  // Widen the literal tuple so `includes` accepts an arbitrary string.
  const accepted: readonly string[] = ACCEPTED_DATA_EXTS;
  return accepted.includes(ext);
}

/**
 * Split one line of delimited text into its fields.
 *
 * A double quote opens a quoted section wherever it appears; inside a
 * quoted section the delimiter is literal text and a doubled quote
 * (`""`) stands for one literal quote. An unterminated quoted section
 * simply runs to the end of the line.
 *
 * @param line - A single record, without its line terminator.
 * @param delim - The single-character field delimiter.
 * @returns The field values with surrounding quotes removed; always at
 *   least one element (an empty line yields `[""]`).
 */
function splitCsvLine(line: string, delim: string): string[] {
  const fields: string[] = [];
  let field = "";
  let quoted = false;
  let pos = 0;

  while (pos < line.length) {
    const ch = line[pos];
    pos += 1;

    if (ch === '"') {
      if (!quoted) {
        // Opening quote.
        quoted = true;
      } else if (line[pos] === '"') {
        // Escaped quote inside a quoted section: emit one and skip its twin.
        field += '"';
        pos += 1;
      } else {
        // Closing quote.
        quoted = false;
      }
      continue;
    }

    if (!quoted && ch === delim) {
      fields.push(field);
      field = "";
      continue;
    }

    field += ch;
  }

  // The last field has no trailing delimiter.
  fields.push(field);
  return fields;
}

/**
 * Result of shape inference. Either field is left `undefined` when it
 * cannot be determined from the content.
 */
interface ParsedShape {
  /** Column / key names, in the order they were encountered. */
  columns?: string[];
  /** Number of rows or records found. */
  rowCount?: number;
}

/**
 * Split delimited text into records at line breaks that lie outside
 * double-quoted sections.
 *
 * Quote state is tracked by toggling on every `"`; an escaped `""`
 * toggles twice and therefore leaves the state unchanged. When the text
 * ends while still inside a quoted section (unbalanced quotes), the
 * input is treated as unquoted and split on every line break instead,
 * so a single stray quote cannot fold the rest of the file into one
 * record.
 */
function splitDelimitedRecords(content: string): string[] {
  const records: string[] = [];
  let start = 0;
  let inQuotes = false;
  for (let i = 0; i < content.length; i++) {
    const c = content[i];
    if (c === '"') {
      inQuotes = !inQuotes;
    } else if (!inQuotes && (c === "\n" || c === "\r")) {
      records.push(content.slice(start, i));
      // Treat CRLF as one terminator.
      if (c === "\r" && content[i + 1] === "\n") i++;
      start = i + 1;
    }
  }
  records.push(content.slice(start));
  return inQuotes ? content.split(/\r\n|\n|\r/) : records;
}

/**
 * Infer column names and the data-row count of delimiter-separated text.
 *
 * The first non-empty record is taken as the header; every further
 * non-empty record counts as one row. Line breaks inside double-quoted
 * fields do not start a new record, so multi-line cell values are
 * counted once.
 *
 * @param content - Full text of the file.
 * @param delim - Single-character field delimiter (e.g. `","` or `"\t"`).
 * @returns `{}` when there is no non-empty record; otherwise the trimmed
 *   header names and the number of records after the header.
 */
function parseDelimited(content: string, delim: string): ParsedShape {
  const records = splitDelimitedRecords(content).filter(
    (record) => record.length > 0,
  );
  const headerLine = records[0];
  if (headerLine === undefined) return {};
  return {
    columns: splitCsvLine(headerLine, delim).map((c) => c.trim()),
    rowCount: records.length - 1,
  };
}

/**
 * Infer the shape of a JSON document.
 *
 * - Top-level array: `rowCount` is the array length and `columns` are
 *   the keys of the first element that is a plain (non-array) object.
 *   Array elements are skipped so that index keys such as "0", "1" are
 *   never reported as column names. `columns` is `undefined` when no
 *   such element exists.
 * - Top-level object: `columns` are its keys; `rowCount` is not set.
 * - Anything else, or text that is not valid JSON: `{}`.
 *
 * Never throws.
 *
 * @param content - Full text of the file.
 */
function parseJson(content: string): ParsedShape {
  let parsed: unknown;
  try {
    // trim() also removes a leading byte-order mark, which JSON.parse rejects.
    parsed = JSON.parse(content.trim());
  } catch {
    // Deliberately forgiving: malformed input just yields an empty shape.
    return {};
  }

  if (Array.isArray(parsed)) {
    const rows: unknown[] = parsed;
    const firstRecord = rows.find(
      (row): row is Record<string, unknown> =>
        typeof row === "object" && row !== null && !Array.isArray(row),
    );
    return {
      rowCount: rows.length,
      columns: firstRecord ? Object.keys(firstRecord) : undefined,
    };
  }

  if (typeof parsed === "object" && parsed !== null) {
    return { columns: Object.keys(parsed) };
  }

  return {};
}

/**
 * Infer the shape of newline-delimited JSON.
 *
 * Every non-blank line counts as one row. Column names are the keys of
 * the first line that parses to a plain (non-array) object; lines that
 * are not valid JSON, or that hold arrays or primitives, are skipped
 * when looking for columns but still counted as rows.
 *
 * Never throws.
 *
 * @param content - Full text of the file.
 * @returns `{}` when there are no non-blank lines; otherwise the row
 *   count and, when found, the column names.
 */
function parseNdjson(content: string): ParsedShape {
  // Trim before parsing: trim() strips a byte-order mark and other
  // whitespace that JSON.parse does not accept.
  const lines = content
    .split(/\r\n|\n|\r/)
    .map((l) => l.trim())
    .filter((l) => l.length > 0);
  if (lines.length === 0) return {};

  let columns: string[] | undefined;
  for (const line of lines) {
    try {
      const value: unknown = JSON.parse(line);
      if (typeof value === "object" && value !== null && !Array.isArray(value)) {
        columns = Object.keys(value);
        break;
      }
    } catch {
      // Not valid JSON on this line; keep looking for an object row.
    }
  }
  return { rowCount: lines.length, columns };
}

/**
 * Dispatch to the parser matching a file extension.
 *
 * The match is exact and case-sensitive. Extensions without a parser
 * (any value other than "csv", "tsv", "json" or "ndjson") yield an
 * empty shape.
 *
 * @param ext - File extension without the dot.
 * @param content - Full text of the file.
 * @returns The inferred shape, or `{}` when nothing can be inferred.
 */
export function inferDataShape(ext: string, content: string): ParsedShape {
  if (ext === "csv") return parseDelimited(content, ",");
  if (ext === "tsv") return parseDelimited(content, "\t");
  if (ext === "json") return parseJson(content);
  if (ext === "ndjson") return parseNdjson(content);
  return {};
}

/**
 * Render one manifest entry for a data file, intended for the agent
 * system prompt. Each file contributes a single entry so that many
 * datasets can be listed without bloating the prompt.
 *
 * Layout: `- name (EXT, size[ - N rows])[ - columns: a, b, ...]`.
 * The row count is shown only when defined; at most the first 12 column
 * names are listed, followed by `, ...` when more exist.
 *
 * @param meta - Metadata of the file to describe.
 * @returns The formatted entry.
 */
export function formatManifestLine(meta: EmbedDataFileMeta): string {
  let line = `- ${meta.name} (${meta.ext.toUpperCase()}, ${formatBytes(meta.size)}`;
  if (meta.rowCount !== undefined) {
    line += ` - ${meta.rowCount} rows`;
  }
  line += ")";

  const columns = meta.columns;
  if (columns && columns.length > 0) {
    const listed = columns.slice(0, 12).join(", ");
    const more = columns.length > 12 ? ", ..." : "";
    line += ` - columns: ${listed}${more}`;
  }
  return line;
}

/**
 * Format a byte count as a short human-readable string.
 *
 * - below 1024: `"<n> B"`
 * - below 1024 * 1024: kilobytes with one decimal, `"<n.n> KB"`
 * - otherwise: megabytes with two decimals, `"<n.nn> MB"`
 *
 * Values just under one megabyte whose kilobyte figure would round up
 * to "1024.0" are shown in megabytes instead (e.g. "1.00 MB").
 *
 * @param bytes - Size in bytes.
 * @returns The formatted size.
 */
export function formatBytes(bytes: number): string {
  const KB = 1024;
  const MB = KB * KB;
  if (bytes < KB) return `${bytes} B`;
  if (bytes < MB) {
    const kb = (bytes / KB).toFixed(1);
    // toFixed rounds, so sizes within ~50 bytes of 1 MB become "1024.0";
    // fall through to the MB tier for those.
    if (kb !== "1024.0") return `${kb} KB`;
  }
  return `${(bytes / MB).toFixed(2)} MB`;
}