#!/usr/bin/env node
/*
scripts/merge_csv.js
简单的 Node 脚本:合并指定目录或指定多个 CSV 文件为一个 CSV 文件(基本合并,处理不同表头)
用法示例:
node scripts/merge_csv.js --inputDir ./data --out merged.csv
node scripts/merge_csv.js file1.csv file2.csv --out merged.csv
限制说明:这是一个轻量实现,处理常见 CSV(含双引号字段)。不保证对包含任意换行的复杂嵌套引用字段 100% 兼容。
*/
import fs from 'fs/promises'
import path from 'path'
/**
 * Print CLI usage/help text to stdout.
 * Fix: the original template string had lost the `<dir>` / `<file>`
 * placeholders after --inputDir and --out (likely stripped as HTML),
 * leaving the help text unreadable; they are restored here.
 */
function usage() {
  console.log(`Usage:\n  node scripts/merge_csv.js --inputDir <dir> --out <file>\n  node scripts/merge_csv.js <file1.csv> <file2.csv> ... --out <file>\n\nOptions:\n  --inputDir <dir>  读取指定目录下的所有 .csv 文件(非递归)\n  --out <file>      输出文件路径(默认: merged.csv)\n  --help            显示帮助\n`)
}
/**
 * Parse CLI arguments into an options object.
 *
 * @param {string[]} argv - Raw process.argv (the first two entries — node
 *   binary and script path — are skipped).
 * @returns {{files: string[], inputDir: ?string, out: string, help?: boolean}}
 *   Positional arguments accumulate in `files`; `out` defaults to 'merged.csv'.
 *
 * Fix: the original read `args[++i]` unconditionally, so a trailing
 * `--inputDir` or `--out` with no value stored `undefined`, which later
 * crashed `path.join()` / `opts.out.endsWith()`. Missing values are now
 * warned about and the defaults kept.
 */
function parseArgs(argv) {
  const args = argv.slice(2)
  const opts = { files: [], inputDir: null, out: 'merged.csv' }
  for (let i = 0; i < args.length; i++) {
    const a = args[i]
    // --help short-circuits: remaining arguments are ignored.
    if (a === '--help' || a === '-h') { opts.help = true; break }
    if (a === '--inputDir') {
      const value = args[++i]
      if (value == null) { console.warn('Missing value for --inputDir; option ignored'); continue }
      opts.inputDir = value
      continue
    }
    if (a === '--out') {
      const value = args[++i]
      if (value == null) { console.warn('Missing value for --out; using default merged.csv'); continue }
      opts.out = value
      continue
    }
    if (a.startsWith('--')) {
      console.warn('Unknown option', a)
      continue
    }
    // Anything else is treated as an explicit input file path.
    opts.files.push(a)
  }
  return opts
}
/**
 * Parse one CSV line into an array of field strings.
 * Supports double-quoted fields and the "" escape for a literal quote.
 * Does NOT handle quoted fields that span multiple lines (the caller
 * splits on line breaks before invoking this).
 *
 * @param {string} line - A single line of CSV text (no trailing newline).
 * @returns {string[]} The field values, always at least one element.
 */
function parseCSVLine(line) {
  const fields = []
  let buf = ''
  let quoted = false
  let idx = 0
  while (idx < line.length) {
    const c = line[idx]
    if (quoted) {
      if (c === '"') {
        // A doubled quote inside a quoted field is a literal quote.
        if (line[idx + 1] === '"') {
          buf += '"'
          idx++
        } else {
          quoted = false
        }
      } else {
        buf += c
      }
    } else if (c === ',') {
      // Unquoted comma terminates the current field.
      fields.push(buf)
      buf = ''
    } else if (c === '"') {
      quoted = true
    } else {
      buf += c
    }
    idx++
  }
  // The final field has no trailing comma; flush it.
  fields.push(buf)
  return fields
}
/**
 * Escape a single value for CSV output.
 * null/undefined become the empty string. A value containing a double
 * quote is wrapped in quotes with internal quotes doubled; a value
 * containing a comma or a line break is merely wrapped in quotes.
 *
 * @param {*} value - Any value; coerced with String() when not nullish.
 * @returns {string} A CSV-safe representation of the value.
 */
function csvEscape(value) {
  if (value == null) return ''
  const text = String(value)
  if (text.includes('"')) {
    const doubled = text.replace(/"/g, '""')
    return `"${doubled}"`
  }
  const needsWrapping = text.includes(',') || text.includes('\n') || text.includes('\r')
  return needsWrapping ? `"${text}"` : text
}
/**
 * Read a CSV file and return its headers plus rows as plain objects.
 * The first non-blank line is treated as the header row; blank lines are
 * skipped. Missing trailing values are filled with '' and extra values
 * beyond the header count are dropped.
 * Limitation (inherited): quoted fields containing raw line breaks are not
 * supported because the text is split on newlines before parsing.
 *
 * @param {string} filePath - Path of the CSV file to read.
 * @returns {Promise<{headers: string[], rows: Object[]}>}
 */
async function readCSVFile(filePath) {
  const txt = await fs.readFile(filePath, 'utf8')
  // Tolerate both LF and CRLF line endings.
  const lines = txt.split(/\r?\n/)
  // The header is the first line with any non-whitespace content.
  const headerIdx = lines.findIndex((l) => l.trim().length > 0)
  if (headerIdx === -1) return { headers: [], rows: [] }
  const headers = parseCSVLine(lines[headerIdx])
  const rows = []
  for (const line of lines.slice(headerIdx + 1)) {
    if (line == null || line.trim() === '') continue
    const values = parseCSVLine(line)
    const record = {}
    headers.forEach((h, colIdx) => {
      record[h] = values[colIdx] ?? ''
    })
    rows.push(record)
  }
  return { headers, rows }
}
/**
 * Entry point: resolve the list of input CSV files (from --inputDir and/or
 * explicit paths), merge them into one CSV whose header is the union of all
 * input headers (first-seen order), and write the result to the --out path.
 *
 * Special-case filtering: if an input file's base name (extension stripped)
 * ends in '3', its first column becomes an allow-list; rows of any file whose
 * base name ends in '0' are dropped unless their first-column value appears
 * in that allow-list. Dropped rows are logged to removed_rows.log next to
 * the output file.
 * NOTE(review): this '0'/'3' filename convention is not mentioned in the file
 * header — presumably specific to one dataset; confirm before reusing.
 */
async function main() {
const opts = parseArgs(process.argv)
if (opts.help) { usage(); return }
const cwd = process.cwd()
let files = []
if (opts.inputDir) {
// Resolve the directory and collect every .csv file inside it (non-recursive).
const dir = path.isAbsolute(opts.inputDir) ? opts.inputDir : path.join(cwd, opts.inputDir)
try {
const names = await fs.readdir(dir)
files = names.filter(n => n.toLowerCase().endsWith('.csv')).map(n => path.join(dir, n))
} catch (e) {
console.error('Failed to read inputDir', e.message)
process.exit(2)
}
}
if (opts.files && opts.files.length) {
// Explicit file arguments are appended after the directory listing.
const explicit = opts.files.map(f => path.isAbsolute(f) ? f : path.join(cwd, f))
files = files.concat(explicit)
}
// Deduplicate while preserving first-seen order.
files = [...new Set(files)]
if (files.length === 0) {
console.error('No CSV files specified. Use --inputDir or pass file paths.')
usage();
process.exit(1)
}
const allRows = []
const headerOrder = []
const headerSet = new Set()
// Read every file into memory first, recording the last character of the
// base name (extension stripped) to drive the '0'/'3' filtering below.
const fileDatas = []
for (const f of files) {
try {
const stat = await fs.stat(f)
if (!stat.isFile()) { console.warn('Skipping (not a file):', f); continue }
} catch (e) { console.warn('Skipping (not found):', f); continue }
const { headers, rows } = await readCSVFile(f)
const base = path.basename(f)
const nameNoExt = base.replace(/\.[^/.]+$/, '')
const lastChar = nameNoExt.slice(-1)
fileDatas.push({ path: f, headers, rows, nameNoExt, lastChar })
}
// Find the first file whose name ends in '3' and build a Set of the values
// in its first column (the allow-list for files whose name ends in '0').
let file3Set = null
const file3 = fileDatas.find(d => d.lastChar === '3')
if (file3) {
const firstHdr = file3.headers && file3.headers.length > 0 ? file3.headers[0] : null
file3Set = new Set()
if (firstHdr) {
for (const r of file3.rows) {
const v = r[firstHdr]
if (v != null) file3Set.add(String(v))
}
}
}
// Merge headers in first-seen order and collect rows. Rows from a file whose
// name ends in '0' are filtered against file3Set when it exists:
const removedRows = []
for (const d of fileDatas) {
const { headers, rows, lastChar } = d
for (const h of headers) {
if (!headerSet.has(h)) {
headerSet.add(h)
headerOrder.push(h)
}
}
if (lastChar === '0' && file3Set) {
// Use this file's own first column as the key (original note: the
// model_name field) to match against the allow-list.
const firstHdr = headers && headers.length > 0 ? headers[0] : null
if (!firstHdr) continue
for (const r of rows) {
const val = r[firstHdr]
if (val != null && file3Set.has(String(val))) {
allRows.push(r)
} else {
// Record dropped rows so they can be reported and logged below.
removedRows.push({ source: d.path, key: firstHdr, value: val, row: r })
}
}
} else {
for (const r of rows) allRows.push(r)
}
}
// Second pass: make sure every output row carries every field in
// headerOrder (missing fields are filled with the empty string).
const outRows = allRows.map(r => {
const o = {}
for (const h of headerOrder) o[h] = (h in r) ? r[h] : ''
// Also append any field first seen here (in theory every file header was
// already collected into headerOrder above, so this is a safety net).
for (const k of Object.keys(r)) if (!headerSet.has(k)) { headerSet.add(k); headerOrder.push(k); o[k] = r[k] }
return o
})
// Write out the merged CSV.
let outPath = path.isAbsolute(opts.out) ? opts.out : path.join(cwd, opts.out)
// If the user-supplied out path ends with a path separator (or is an
// existing directory, checked just below), write merged.csv inside it.
const looksLikeDir = opts.out.endsWith('/') || opts.out.endsWith('\\')
if (looksLikeDir) {
outPath = path.join(outPath, 'merged.csv')
}
try {
const st = await fs.stat(outPath)
if (st.isDirectory()) {
outPath = path.join(outPath, 'merged.csv')
}
} catch (e) {
// not exists -> will create parent directory below
}
const headerLine = headerOrder.map(csvEscape).join(',')
const lines = [headerLine]
for (const r of outRows) {
const vals = headerOrder.map(h => csvEscape(r[h]))
lines.push(vals.join(','))
}
await fs.mkdir(path.dirname(outPath), { recursive: true })
await fs.writeFile(outPath, lines.join('\n'), 'utf8')
console.log(`Wrote merged CSV to ${outPath} (${outRows.length} rows, ${headerOrder.length} columns)`)
// If rows were dropped by the allow-list filter, report them and write a
// detailed log next to the output file.
if (removedRows.length > 0) {
console.log(`Removed ${removedRows.length} rows from files (not present in file3). Logging to removed_rows.log`)
const logLines = []
logLines.push(`Removed ${removedRows.length} rows - details:`)
for (const it of removedRows) {
logLines.push(`source=${it.source} ${it.key}=${it.value} row=${JSON.stringify(it.row)}`)
}
// Full details go to removed_rows.log in the output directory.
const logPath = path.join(path.dirname(outPath), 'removed_rows.log')
await fs.writeFile(logPath, logLines.join('\n'), 'utf8')
// Echo at most the first 50 log lines to the console.
for (let i = 0; i < Math.min(50, logLines.length); i++) console.log(logLines[i])
if (logLines.length > 50) console.log(`... see ${logPath} for full log`)
} else {
console.log('No rows were removed by file3 filtering.')
}
}
// Top-level entry: run main() and exit with code 3 on any unhandled error,
// preferring the stack trace when one is available.
main().catch((err) => {
  const detail = err && err.stack ? err.stack : err
  console.error('Error:', detail)
  process.exit(3)
})