#!/usr/bin/env node
/*
scripts/merge_csv.js
A simple Node script: merge all CSV files in a directory, or an explicit list of
CSV files, into a single CSV file (basic merge that reconciles differing headers).
Usage examples:
node scripts/merge_csv.js --inputDir ./data --out merged.csv
node scripts/merge_csv.js file1.csv file2.csv --out merged.csv
Limitations: this is a lightweight implementation that handles common CSV
(including double-quoted fields). Fields containing arbitrary embedded newlines
inside quotes are not guaranteed to be 100% supported.
*/
import fs from 'fs/promises'
import path from 'path'
/**
 * Print the CLI usage/help text to stdout.
 */
function usage() {
  const helpText = `Usage:\n node scripts/merge_csv.js --inputDir <dir> --out <out.csv>\n node scripts/merge_csv.js <file1.csv> <file2.csv> --out <out.csv>\n\nOptions:\n --inputDir 读取指定目录下的所有 .csv 文件(非递归)\n --out 输出文件路径(默认: merged.csv)\n --help 显示帮助\n`
  console.log(helpText)
}
/**
 * Parse process.argv into an options object.
 *
 * @param {string[]} argv - Full process argv (first two entries are skipped).
 * @returns {{files: string[], inputDir: ?string, out: string, help?: boolean}}
 *
 * Fix: previously `--inputDir`/`--out` as the final token consumed
 * `args[++i]` === undefined, so `opts.out` could silently become undefined
 * and later crash `opts.out.endsWith(...)` in main(). A missing value now
 * warns and keeps the default.
 */
function parseArgs(argv) {
  const args = argv.slice(2)
  const opts = { files: [], inputDir: null, out: 'merged.csv' }
  for (let i = 0; i < args.length; i++) {
    const a = args[i]
    if (a === '--help' || a === '-h') { opts.help = true; break }
    if (a === '--inputDir' || a === '--out') {
      const value = args[i + 1]
      if (value === undefined) {
        // Option given without a value: keep the default instead of undefined.
        console.warn('Missing value for option', a)
        continue
      }
      i++
      if (a === '--inputDir') opts.inputDir = value
      else opts.out = value
      continue
    }
    if (a.startsWith('--')) {
      console.warn('Unknown option', a)
      continue
    }
    // Bare arguments are treated as explicit input file paths.
    opts.files.push(a)
  }
  return opts
}
/**
 * Parse one CSV record line into an array of field strings.
 * Supports double-quoted fields and the "" escape for a literal quote.
 * Does not handle fields spanning multiple physical lines.
 *
 * @param {string} line - A single line of CSV text (no trailing newline).
 * @returns {string[]} The parsed field values.
 */
function parseCSVLine(line) {
  const fields = []
  let field = ''
  let quoted = false
  let i = 0
  while (i < line.length) {
    const ch = line[i]
    if (quoted) {
      if (ch !== '"') {
        field += ch
      } else if (line[i + 1] === '"') {
        // Doubled quote inside a quoted field -> literal quote character.
        field += '"'
        i++
      } else {
        // Closing quote: leave quoted mode without emitting anything.
        quoted = false
      }
    } else if (ch === ',') {
      fields.push(field)
      field = ''
    } else if (ch === '"') {
      quoted = true
    } else {
      field += ch
    }
    i++
  }
  // The final field has no trailing comma; flush it.
  fields.push(field)
  return fields
}
/**
 * Render a value as a CSV field. Null/undefined become the empty string;
 * values containing quotes, commas, or line breaks are wrapped in double
 * quotes with embedded quotes doubled. Everything else passes through as-is.
 *
 * @param {*} value - Any value; stringified before escaping.
 * @returns {string} A CSV-safe field.
 */
function csvEscape(value) {
  if (value == null) return ''
  const s = String(value)
  // Quoting is only required when a delimiter, quote, or newline appears.
  if (!/[",\n\r]/.test(s)) return s
  return `"${s.replace(/"/g, '""')}"`
}
/**
 * Read one CSV file and return its header list plus its rows as objects
 * keyed by header name. Accepts both LF and CRLF line endings; the first
 * non-blank line is the header row, blank lines are skipped, missing
 * values become '' and extra values beyond the header count are dropped.
 *
 * @param {string} filePath - Absolute or relative path to the CSV file.
 * @returns {Promise<{headers: string[], rows: Object<string,string>[]}>}
 */
async function readCSVFile(filePath) {
  const text = await fs.readFile(filePath, 'utf8')
  // Split on LF or CRLF so Windows-produced files also parse.
  const lines = text.split(/\r?\n/)
  // The header is the first line with any non-whitespace content.
  const headerIdx = lines.findIndex((l) => l.trim().length > 0)
  if (headerIdx === -1) return { headers: [], rows: [] }
  const headers = parseCSVLine(lines[headerIdx])
  const rows = []
  for (const line of lines.slice(headerIdx + 1)) {
    if (line == null || line.trim() === '') continue
    const values = parseCSVLine(line)
    const record = {}
    headers.forEach((h, j) => { record[h] = values[j] ?? '' })
    rows.push(record)
  }
  return { headers, rows }
}
/**
 * CLI entry point: collect input CSV paths (from --inputDir and/or explicit
 * arguments), merge their headers and rows, filter the rows of the file whose
 * base name ends in '0' against the first-column values of the file whose base
 * name ends in '3', then write the merged CSV and (if any rows were dropped) a
 * removed_rows.log next to it.
 *
 * Exit codes: 1 = no input files given, 2 = --inputDir unreadable,
 * 3 = any other unhandled error (set by the top-level catch).
 */
async function main() {
  const opts = parseArgs(process.argv)
  if (opts.help) { usage(); return }
  const cwd = process.cwd()
  let files = []
  if (opts.inputDir) {
    const dir = path.isAbsolute(opts.inputDir) ? opts.inputDir : path.join(cwd, opts.inputDir)
    try {
      // Non-recursive directory scan; only .csv files (case-insensitive) count.
      const names = await fs.readdir(dir)
      files = names.filter(n => n.toLowerCase().endsWith('.csv')).map(n => path.join(dir, n))
    } catch (e) {
      console.error('Failed to read inputDir', e.message)
      process.exit(2)
    }
  }
  if (opts.files && opts.files.length) {
    // Explicit file arguments are appended after the directory listing.
    const explicit = opts.files.map(f => path.isAbsolute(f) ? f : path.join(cwd, f))
    files = files.concat(explicit)
  }
  // Deduplicate while preserving first-seen order
  files = [...new Set(files)]
  if (files.length === 0) {
    console.error('No CSV files specified. Use --inputDir or pass file paths.')
    usage();
    process.exit(1)
  }
  const allRows = []
  const headerOrder = []
  const headerSet = new Set()
  // First read every file into memory, recording the last character of each
  // base name (used below to identify "file 0" and "file 3")
  const fileDatas = []
  for (const f of files) {
    try {
      const stat = await fs.stat(f)
      if (!stat.isFile()) { console.warn('Skipping (not a file):', f); continue }
    } catch (e) { console.warn('Skipping (not found):', f); continue }
    const { headers, rows } = await readCSVFile(f)
    const base = path.basename(f)
    const nameNoExt = base.replace(/\.[^/.]+$/, '')
    const lastChar = nameNoExt.slice(-1)
    fileDatas.push({ path: f, headers, rows, nameNoExt, lastChar })
  }
  // Find "file 3" (base name ending in '3') and build a Set of its
  // first-column values to use as the filter key set
  let file3Set = null
  const file3 = fileDatas.find(d => d.lastChar === '3')
  if (file3) {
    const firstHdr = file3.headers && file3.headers.length > 0 ? file3.headers[0] : null
    file3Set = new Set()
    if (firstHdr) {
      for (const r of file3.rows) {
        const v = r[firstHdr]
        if (v != null) file3Set.add(String(v))
      }
    }
  }
  // Now merge headers in original order and collect rows. For "file 0" (base
  // name ending in '0'), when file3Set exists, filter its rows against it:
  const removedRows = []
  for (const d of fileDatas) {
    const { headers, rows, lastChar } = d
    for (const h of headers) {
      if (!headerSet.has(h)) {
        headerSet.add(h)
        headerOrder.push(h)
      }
    }
    if (lastChar === '0' && file3Set) {
      // Use this file's own first column as the join key (the model_name field)
      const firstHdr = headers && headers.length > 0 ? headers[0] : null
      if (!firstHdr) continue
      for (const r of rows) {
        const val = r[firstHdr]
        if (val != null && file3Set.has(String(val))) {
          allRows.push(r)
        } else {
          // Record dropped rows so they can be reported and logged below
          removedRows.push({ source: d.path, key: firstHdr, value: val, row: r })
        }
      }
    } else {
      for (const r of rows) allRows.push(r)
    }
  }
  // Second pass: ensure every row carries every headerOrder field (fill blanks)
  const outRows = allRows.map(r => {
    const o = {}
    for (const h of headerOrder) o[h] = (h in r) ? r[h] : ''
    // Also append any keys first seen after headerOrder was built (in theory
    // all file headers have already been collected into headerOrder)
    for (const k of Object.keys(r)) if (!headerSet.has(k)) { headerSet.add(k); headerOrder.push(k); o[k] = r[k] }
    return o
  })
  // Write out the merged CSV
  let outPath = path.isAbsolute(opts.out) ? opts.out : path.join(cwd, opts.out)
  // If the user-supplied out path ends with a path separator or is an existing
  // directory, write merged.csv inside that directory
  const looksLikeDir = opts.out.endsWith('/') || opts.out.endsWith('\\')
  if (looksLikeDir) {
    outPath = path.join(outPath, 'merged.csv')
  }
  try {
    const st = await fs.stat(outPath)
    if (st.isDirectory()) {
      outPath = path.join(outPath, 'merged.csv')
    }
  } catch (e) {
    // not exists -> will create parent directory below
  }
  const headerLine = headerOrder.map(csvEscape).join(',')
  const lines = [headerLine]
  for (const r of outRows) {
    const vals = headerOrder.map(h => csvEscape(r[h]))
    lines.push(vals.join(','))
  }
  await fs.mkdir(path.dirname(outPath), { recursive: true })
  await fs.writeFile(outPath, lines.join('\n'), 'utf8')
  console.log(`Wrote merged CSV to ${outPath} (${outRows.length} rows, ${headerOrder.length} columns)`)
  // If any rows were removed, echo them and write a log file
  if (removedRows.length > 0) {
    console.log(`Removed ${removedRows.length} rows from files (not present in file3). Logging to removed_rows.log`)
    const logLines = []
    logLines.push(`Removed ${removedRows.length} rows - details:`)
    for (const it of removedRows) {
      logLines.push(`source=${it.source} ${it.key}=${it.value} row=${JSON.stringify(it.row)}`)
    }
    // Write removed_rows.log into the same directory as the output file
    const logPath = path.join(path.dirname(outPath), 'removed_rows.log')
    await fs.writeFile(logPath, logLines.join('\n'), 'utf8')
    // Echo at most 50 log lines to the console to keep output readable
    for (let i = 0; i < Math.min(50, logLines.length); i++) console.log(logLines[i])
    if (logLines.length > 50) console.log(`... see ${logPath} for full log`)
  } else {
    console.log('No rows were removed by file3 filtering.')
  }
}
// Kick off the CLI; any unhandled failure is reported and mapped to exit code 3.
main().catch((err) => {
  const detail = err && err.stack ? err.stack : err
  console.error('Error:', detail)
  process.exit(3)
})
|