#!/usr/bin/env node
/*
 * scripts/merge_csv.js
 *
 * Merge several CSV files (explicit paths and/or every .csv file in a
 * directory, non-recursive) into one CSV, unifying differing headers.
 *
 * Usage examples:
 *   node scripts/merge_csv.js --inputDir ./data --out merged.csv
 *   node scripts/merge_csv.js file1.csv file2.csv --out merged.csv
 *
 * The CSV parser supports double-quoted fields, "" escapes, and quoted
 * fields that contain embedded newlines (LF or CRLF).
 *
 * Special filtering rule (presumably project-specific — keyed on the last
 * character of the file's base name): rows of "file0" (name ends in '0')
 * whose first-column value does not appear in the first column of "file3"
 * (name ends in '3') are dropped and logged to removed_rows.log.
 */
import fs from 'fs/promises';
import path from 'path';

/** Print CLI usage help to stdout. */
function usage() {
  console.log(`Usage:\n node scripts/merge_csv.js --inputDir --out \n node scripts/merge_csv.js --out \n\nOptions:\n --inputDir 读取指定目录下的所有 .csv 文件(非递归)\n --out 输出文件路径(默认: merged.csv)\n --help 显示帮助\n`);
}

/**
 * Parse process.argv into options.
 *
 * @param {string[]} argv - full argv (node, script, ...args)
 * @returns {{files: string[], inputDir: string|null, out: string, help?: boolean}}
 */
function parseArgs(argv) {
  const args = argv.slice(2);
  const opts = { files: [], inputDir: null, out: 'merged.csv' };
  for (let i = 0; i < args.length; i++) {
    const a = args[i];
    if (a === '--help' || a === '-h') {
      opts.help = true;
      break;
    }
    if (a === '--inputDir' || a === '--out') {
      const value = args[++i];
      // A trailing option with no value would otherwise propagate
      // `undefined` and crash later (e.g. on opts.out.endsWith).
      if (value == null) {
        console.error(`Missing value for ${a}`);
        process.exit(1);
      }
      if (a === '--inputDir') opts.inputDir = value;
      else opts.out = value;
      continue;
    }
    if (a.startsWith('--')) {
      console.warn('Unknown option', a);
      continue;
    }
    opts.files.push(a);
  }
  return opts;
}

/**
 * Parse a whole CSV document into an array of records (arrays of string
 * fields). Handles double-quoted fields, "" escapes, and — unlike a naive
 * line-split — newlines (LF or CRLF) embedded inside quoted fields.
 *
 * @param {string} text - raw CSV file contents
 * @returns {string[][]} records in document order
 */
function parseCSVText(text) {
  const records = [];
  let record = [];
  let field = '';
  let inQuotes = false;
  for (let i = 0; i < text.length; i++) {
    const ch = text[i];
    if (inQuotes) {
      if (ch === '"') {
        if (text[i + 1] === '"') {
          // Escaped quote ("") inside a quoted field.
          field += '"';
          i++;
        } else {
          inQuotes = false;
        }
      } else {
        field += ch; // includes embedded newlines verbatim
      }
    } else if (ch === '"') {
      inQuotes = true;
    } else if (ch === ',') {
      record.push(field);
      field = '';
    } else if (ch === '\n' || ch === '\r') {
      if (ch === '\r' && text[i + 1] === '\n') i++; // swallow CRLF as one break
      record.push(field);
      field = '';
      records.push(record);
      record = [];
    } else {
      field += ch;
    }
  }
  // Flush a final record that has no trailing newline.
  if (field.length > 0 || record.length > 0) {
    record.push(field);
    records.push(record);
  }
  return records;
}

/**
 * Escape a single value for CSV output: quote (and double internal quotes)
 * whenever the value contains a quote, comma, or newline.
 *
 * @param {*} value - any value; null/undefined become ''
 * @returns {string}
 */
function csvEscape(value) {
  if (value == null) return '';
  const s = String(value);
  if (/[",\r\n]/.test(s)) return '"' + s.replace(/"/g, '""') + '"';
  return s;
}

/**
 * Read one CSV file. The first non-blank record is the header; each
 * following non-blank record becomes an object keyed by the headers
 * (missing trailing fields default to '').
 *
 * @param {string} filePath - path to the CSV file
 * @returns {Promise<{headers: string[], rows: Object<string,string>[]}>}
 */
async function readCSVFile(filePath) {
  const txt = await fs.readFile(filePath, 'utf8');
  const records = parseCSVText(txt);
  const isBlank = (rec) => rec.length === 1 && rec[0].trim() === '';

  // First non-blank record is the header row.
  let idx = 0;
  while (idx < records.length && isBlank(records[idx])) idx++;
  if (idx >= records.length) return { headers: [], rows: [] };

  const headers = records[idx];
  const rows = [];
  for (let i = idx + 1; i < records.length; i++) {
    const rec = records[i];
    if (isBlank(rec)) continue;
    const obj = {};
    for (let j = 0; j < headers.length; j++) {
      obj[headers[j]] = rec[j] ?? '';
    }
    rows.push(obj);
  }
  return { headers, rows };
}

/**
 * Entry point: resolve input files, read them, apply the file3/file0
 * filtering rule, merge headers in first-seen order, and write the
 * combined CSV (plus removed_rows.log when rows were filtered out).
 */
async function main() {
  const opts = parseArgs(process.argv);
  if (opts.help) {
    usage();
    return;
  }
  const cwd = process.cwd();

  // --- Resolve the list of input files -------------------------------
  let files = [];
  if (opts.inputDir) {
    const dir = path.isAbsolute(opts.inputDir) ? opts.inputDir : path.join(cwd, opts.inputDir);
    try {
      const names = await fs.readdir(dir);
      files = names
        .filter((n) => n.toLowerCase().endsWith('.csv'))
        .map((n) => path.join(dir, n));
    } catch (e) {
      console.error('Failed to read inputDir', e.message);
      process.exit(2);
    }
  }
  if (opts.files && opts.files.length) {
    const explicit = opts.files.map((f) => (path.isAbsolute(f) ? f : path.join(cwd, f)));
    files = files.concat(explicit);
  }
  // De-duplicate while keeping first-seen order.
  files = [...new Set(files)];
  if (files.length === 0) {
    console.error('No CSV files specified. Use --inputDir or pass file paths.');
    usage();
    process.exit(1);
  }

  // --- Read every file up front --------------------------------------
  // lastChar (final character of the base name without extension) drives
  // the "file3"/"file0" filtering below.
  const fileDatas = [];
  for (const f of files) {
    try {
      const stat = await fs.stat(f);
      if (!stat.isFile()) {
        console.warn('Skipping (not a file):', f);
        continue;
      }
    } catch (e) {
      console.warn('Skipping (not found):', f);
      continue;
    }
    const { headers, rows } = await readCSVFile(f);
    const base = path.basename(f);
    const nameNoExt = base.replace(/\.[^/.]+$/, '');
    const lastChar = nameNoExt.slice(-1);
    fileDatas.push({ path: f, headers, rows, nameNoExt, lastChar });
  }

  // --- Build the first-column value set of "file3" (name ends in '3') --
  let file3Set = null;
  const file3 = fileDatas.find((d) => d.lastChar === '3');
  if (file3) {
    file3Set = new Set();
    const firstHdr = file3.headers.length > 0 ? file3.headers[0] : null;
    if (firstHdr) {
      for (const r of file3.rows) {
        const v = r[firstHdr];
        if (v != null) file3Set.add(String(v));
      }
    }
  }

  // --- Merge headers (first-seen order) and collect rows ---------------
  // Rows of "file0" (name ends in '0') are kept only when their
  // first-column value appears in file3Set; dropped rows are logged.
  const allRows = [];
  const headerOrder = [];
  const headerSet = new Set();
  const removedRows = [];
  for (const d of fileDatas) {
    const { headers, rows, lastChar } = d;
    for (const h of headers) {
      if (!headerSet.has(h)) {
        headerSet.add(h);
        headerOrder.push(h);
      }
    }
    if (lastChar === '0' && file3Set) {
      // Use this file's own first column as the join key.
      const firstHdr = headers.length > 0 ? headers[0] : null;
      if (!firstHdr) continue;
      for (const r of rows) {
        const val = r[firstHdr];
        if (val != null && file3Set.has(String(val))) {
          allRows.push(r);
        } else {
          removedRows.push({ source: d.path, key: firstHdr, value: val, row: r });
        }
      }
    } else {
      for (const r of rows) allRows.push(r);
    }
  }

  // --- Normalize every row to the full header set ----------------------
  // Every row key originates from some file's headers, all of which are
  // already in headerOrder — missing fields simply become ''.
  const outRows = allRows.map((r) => {
    const o = {};
    for (const h of headerOrder) o[h] = Object.hasOwn(r, h) ? r[h] : '';
    return o;
  });

  // --- Resolve the output path -----------------------------------------
  let outPath = path.isAbsolute(opts.out) ? opts.out : path.join(cwd, opts.out);
  // An --out ending in a path separator, or naming an existing directory,
  // means "write merged.csv inside that directory".
  const looksLikeDir = opts.out.endsWith('/') || opts.out.endsWith('\\');
  if (looksLikeDir) outPath = path.join(outPath, 'merged.csv');
  try {
    const st = await fs.stat(outPath);
    if (st.isDirectory()) outPath = path.join(outPath, 'merged.csv');
  } catch (e) {
    // Path does not exist yet — parent directory is created below.
  }

  // --- Write the merged CSV (with a POSIX trailing newline) ------------
  const lines = [headerOrder.map(csvEscape).join(',')];
  for (const r of outRows) {
    lines.push(headerOrder.map((h) => csvEscape(r[h])).join(','));
  }
  await fs.mkdir(path.dirname(outPath), { recursive: true });
  await fs.writeFile(outPath, lines.join('\n') + '\n', 'utf8');
  console.log(`Wrote merged CSV to ${outPath} (${outRows.length} rows, ${headerOrder.length} columns)`);

  // --- Report filtered-out rows ----------------------------------------
  if (removedRows.length > 0) {
    console.log(`Removed ${removedRows.length} rows from files (not present in file3). Logging to removed_rows.log`);
    const logLines = [`Removed ${removedRows.length} rows - details:`];
    for (const it of removedRows) {
      logLines.push(`source=${it.source} ${it.key}=${it.value} row=${JSON.stringify(it.row)}`);
    }
    // Log lives next to the merged output.
    const logPath = path.join(path.dirname(outPath), 'removed_rows.log');
    await fs.writeFile(logPath, logLines.join('\n'), 'utf8');
    // Echo at most the first 50 log lines to the console.
    for (let i = 0; i < Math.min(50, logLines.length); i++) console.log(logLines[i]);
    if (logLines.length > 50) console.log(`... see ${logPath} for full log`);
  } else {
    console.log('No rows were removed by file3 filtering.');
  }
}

main().catch((err) => {
  console.error('Error:', err && err.stack ? err.stack : err);
  process.exit(3);
});