File size: 8,447 Bytes
309320b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
#!/usr/bin/env node
/*
  scripts/merge_csv.js
  简单的 Node 脚本:合并指定目录或指定多个 CSV 文件为一个 CSV 文件(基本合并,处理不同表头)

  用法示例:
    node scripts/merge_csv.js --inputDir ./data --out merged.csv
    node scripts/merge_csv.js file1.csv file2.csv --out merged.csv

  限制说明:这是一个轻量实现,处理常见 CSV(含双引号字段)。不保证对包含任意换行的复杂嵌套引用字段 100% 兼容。
*/

import fs from 'fs/promises'
import path from 'path'

// Print CLI usage/help text to stdout.
function usage() {
  const helpText = `Usage:\n  node scripts/merge_csv.js --inputDir <dir> --out <out.csv>\n  node scripts/merge_csv.js <file1.csv> <file2.csv> --out <out.csv>\n\nOptions:\n  --inputDir   读取指定目录下的所有 .csv 文件(非递归)\n  --out        输出文件路径(默认: merged.csv)\n  --help       显示帮助\n`
  console.log(helpText)
}

/**
 * Parse process.argv into options.
 *
 * @param {string[]} argv - Full argv array (node, script, ...args).
 * @returns {{files: string[], inputDir: string|null, out: string, help?: boolean}}
 *   Positional file paths, optional input directory, output path
 *   (default 'merged.csv'), and a help flag when --help/-h was seen.
 */
function parseArgs(argv) {
  const args = argv.slice(2)
  const opts = { files: [], inputDir: null, out: 'merged.csv' }
  for (let i = 0; i < args.length; i++) {
    const a = args[i]
    if (a === '--help' || a === '-h') { opts.help = true; break }
    if (a === '--inputDir' || a === '--out') {
      // Fix: a flag given as the last token used to store `undefined`
      // (via args[++i]), which later crashed path.join(). Warn and keep
      // the previous/default value instead.
      const value = args[i + 1]
      if (value === undefined) {
        console.warn('Missing value for option', a)
        continue
      }
      i++
      if (a === '--inputDir') opts.inputDir = value
      else opts.out = value
      continue
    }
    if (a.startsWith('--')) {
      console.warn('Unknown option', a)
      continue
    }
    opts.files.push(a)
  }
  return opts
}

// Split a single CSV line into fields. Supports double-quoted fields and
// the "" escape for a literal quote inside a quoted field. Does not handle
// embedded newlines (callers split on line boundaries first).
function parseCSVLine(line) {
  const fields = []
  let buffer = ''
  let quoted = false
  let idx = 0
  while (idx < line.length) {
    const c = line[idx]
    if (quoted) {
      if (c === '"') {
        if (line[idx + 1] === '"') {
          // Escaped quote ("") inside a quoted field -> one literal quote.
          buffer += '"'
          idx++
        } else {
          quoted = false
        }
      } else {
        buffer += c
      }
    } else if (c === ',') {
      fields.push(buffer)
      buffer = ''
    } else if (c === '"') {
      quoted = true
    } else {
      buffer += c
    }
    idx++
  }
  // Flush the trailing field (an empty line still yields one empty field).
  fields.push(buffer)
  return fields
}

// Serialize one value as a CSV field: null/undefined -> empty string;
// quotes are doubled and the field wrapped in quotes; fields containing
// commas or line breaks are wrapped in quotes; everything else is as-is.
function csvEscape(value) {
  if (value == null) return ''
  const text = String(value)
  const doubled = text.replace(/"/g, '""')
  if (doubled !== text) return `"${doubled}"`
  return /[,\r\n]/.test(text) ? `"${text}"` : text
}

/**
 * Read one CSV file and parse it into { headers, rows }.
 * The first non-blank line is treated as the header row; each subsequent
 * non-blank line becomes an object keyed by header name (missing trailing
 * values default to ''). Tolerates CRLF line endings.
 */
async function readCSVFile(filePath) {
  const text = await fs.readFile(filePath, 'utf8')
  const rawLines = text.split(/\r?\n/)
  // Locate the header: the first line with non-whitespace content.
  const headerIdx = rawLines.findIndex((l) => l.trim().length > 0)
  if (headerIdx === -1) return { headers: [], rows: [] }
  const headers = parseCSVLine(rawLines[headerIdx])
  const rows = []
  for (const rawLine of rawLines.slice(headerIdx + 1)) {
    if (rawLine == null || rawLine.trim() === '') continue
    const values = parseCSVLine(rawLine)
    const record = {}
    headers.forEach((h, col) => { record[h] = values[col] ?? '' })
    rows.push(record)
  }
  return { headers, rows }
}

/**
 * Entry point: resolve the list of input CSV files (from --inputDir and/or
 * positional paths), merge their headers and rows, optionally filter rows,
 * and write the merged CSV (plus a removed_rows.log when rows are dropped).
 *
 * Filtering rule implemented below: if some input file's basename (without
 * extension) ends in the character '3', its first column defines a keep-set;
 * rows of any file whose basename ends in '0' are dropped unless their own
 * first-column value appears in that set.
 *
 * Exit codes: 1 = no input files, 2 = inputDir unreadable.
 */
async function main() {
  const opts = parseArgs(process.argv)
  if (opts.help) { usage(); return }

  const cwd = process.cwd()
  let files = []
  if (opts.inputDir) {
    const dir = path.isAbsolute(opts.inputDir) ? opts.inputDir : path.join(cwd, opts.inputDir)
    try {
      const names = await fs.readdir(dir)
      files = names.filter(n => n.toLowerCase().endsWith('.csv')).map(n => path.join(dir, n))
    } catch (e) {
      console.error('Failed to read inputDir', e.message)
      process.exit(2)
    }
  }
  if (opts.files && opts.files.length) {
    const explicit = opts.files.map(f => path.isAbsolute(f) ? f : path.join(cwd, f))
    files = files.concat(explicit)
  }
  // De-duplicate paths while preserving first-seen order
  files = [...new Set(files)]
  if (files.length === 0) {
    console.error('No CSV files specified. Use --inputDir or pass file paths.')
    usage();
    process.exit(1)
  }

  const allRows = []
  const headerOrder = []
  const headerSet = new Set()

  // First pass: read every file into memory, recording the last character of
  // its basename (without extension) — that character drives the filter below
  const fileDatas = []
  for (const f of files) {
    try {
      const stat = await fs.stat(f)
      if (!stat.isFile()) { console.warn('Skipping (not a file):', f); continue }
    } catch (e) { console.warn('Skipping (not found):', f); continue }
    const { headers, rows } = await readCSVFile(f)
    const base = path.basename(f)
    const nameNoExt = base.replace(/\.[^/.]+$/, '')
    const lastChar = nameNoExt.slice(-1)
    fileDatas.push({ path: f, headers, rows, nameNoExt, lastChar })
  }

  // Find "file 3" (basename ends in '3') and build a set of its first-column values
  let file3Set = null
  const file3 = fileDatas.find(d => d.lastChar === '3')
  if (file3) {
    const firstHdr = file3.headers && file3.headers.length > 0 ? file3.headers[0] : null
    file3Set = new Set()
    if (firstHdr) {
      for (const r of file3.rows) {
        const v = r[firstHdr]
        if (v != null) file3Set.add(String(v))
      }
    }
  }

  // Merge headers in original order and collect rows. When file3Set exists,
  // rows from "file 0" (basename ends in '0') are filtered against it:
  const removedRows = []
  for (const d of fileDatas) {
    const { headers, rows, lastChar } = d
    for (const h of headers) {
      if (!headerSet.has(h)) {
        headerSet.add(h)
        headerOrder.push(h)
      }
    }
    if (lastChar === '0' && file3Set) {
      // Use this file's own first column as the model_name key to match against file3Set
      const firstHdr = headers && headers.length > 0 ? headers[0] : null
      if (!firstHdr) continue
      for (const r of rows) {
        const val = r[firstHdr]
        if (val != null && file3Set.has(String(val))) {
          allRows.push(r)
        } else {
          // Record details of the dropped row so it can be logged later
          removedRows.push({ source: d.path, key: firstHdr, value: val, row: r })
        }
      }
    } else {
      for (const r of rows) allRows.push(r)
    }
  }

  // Second pass over rows: ensure every row has every headerOrder field (fill blanks)
  const outRows = allRows.map(r => {
    const o = {}
    for (const h of headerOrder) o[h] = (h in r) ? r[h] : ''
    // Also append any fields that appear only after headerOrder was built
    // (in theory all file headers were already collected into headerOrder)
    for (const k of Object.keys(r)) if (!headerSet.has(k)) { headerSet.add(k); headerOrder.push(k); o[k] = r[k] }
    return o
  })

  // Write out the merged CSV
  let outPath = path.isAbsolute(opts.out) ? opts.out : path.join(cwd, opts.out)
  // If --out ends with a path separator (or is an existing directory, checked
  // below), write merged.csv inside that directory
  const looksLikeDir = opts.out.endsWith('/') || opts.out.endsWith('\\')
  if (looksLikeDir) {
    outPath = path.join(outPath, 'merged.csv')
  }

  try {
    const st = await fs.stat(outPath)
    if (st.isDirectory()) {
      outPath = path.join(outPath, 'merged.csv')
    }
  } catch (e) {
    // not exists -> will create parent directory below
  }

  const headerLine = headerOrder.map(csvEscape).join(',')
  const lines = [headerLine]
  for (const r of outRows) {
    const vals = headerOrder.map(h => csvEscape(r[h]))
    lines.push(vals.join(','))
  }
  await fs.mkdir(path.dirname(outPath), { recursive: true })
  await fs.writeFile(outPath, lines.join('\n'), 'utf8')
  console.log(`Wrote merged CSV to ${outPath}  (${outRows.length} rows, ${headerOrder.length} columns)`)

  // If any rows were removed by the filter, print a summary and write a log
  if (removedRows.length > 0) {
    console.log(`Removed ${removedRows.length} rows from files (not present in file3). Logging to removed_rows.log`)
    const logLines = []
    logLines.push(`Removed ${removedRows.length} rows - details:`)
    for (const it of removedRows) {
      logLines.push(`source=${it.source} ${it.key}=${it.value} row=${JSON.stringify(it.row)}`)
    }
    // Write removed_rows.log next to the output CSV
    const logPath = path.join(path.dirname(outPath), 'removed_rows.log')
    await fs.writeFile(logPath, logLines.join('\n'), 'utf8')
    for (let i = 0; i < Math.min(50, logLines.length); i++) console.log(logLines[i])
    if (logLines.length > 50) console.log(`... see ${logPath} for full log`)
  } else {
    console.log('No rows were removed by file3 filtering.')
  }
}

// Run the CLI; on any unhandled failure, log the stack (or the raw value
// when no stack is available) and exit non-zero.
main().catch((err) => {
  const detail = err && err.stack ? err.stack : err
  console.error('Error:', detail)
  process.exit(3)
})