#!/usr/bin/env node
/*
scripts/merge_csv.js
A simple Node script: merge all CSV files in a directory, or an explicit list of
CSV files, into a single CSV file (basic merge that reconciles differing headers).
Usage examples:
node scripts/merge_csv.js --inputDir ./data --out merged.csv
node scripts/merge_csv.js file1.csv file2.csv --out merged.csv
Limitations: this is a lightweight implementation that handles common CSV
(including double-quoted fields). Fields containing arbitrary embedded newlines
inside quotes are not guaranteed to be 100% supported.
*/
import fs from 'fs/promises'
import path from 'path'
/**
 * Print the CLI usage/help text to stdout.
 */
function usage() {
  const helpText = `Usage:\n node scripts/merge_csv.js --inputDir <dir> --out <out.csv>\n node scripts/merge_csv.js <file1.csv> <file2.csv> --out <out.csv>\n\nOptions:\n --inputDir 读取指定目录下的所有 .csv 文件(非递归)\n --out 输出文件路径(默认: merged.csv)\n --help 显示帮助\n`
  console.log(helpText)
}
/**
 * Parse process.argv into an options object.
 *
 * @param {string[]} argv - Full process argv (first two entries are skipped).
 * @returns {{files: string[], inputDir: ?string, out: string, help?: boolean}}
 *
 * Fix: previously `--inputDir`/`--out` as the final token consumed
 * `args[++i]` === undefined, so `opts.out` could silently become undefined
 * and later crash `opts.out.endsWith(...)` in main(). A missing value now
 * warns and keeps the default.
 */
function parseArgs(argv) {
  const args = argv.slice(2)
  const opts = { files: [], inputDir: null, out: 'merged.csv' }
  for (let i = 0; i < args.length; i++) {
    const a = args[i]
    if (a === '--help' || a === '-h') { opts.help = true; break }
    if (a === '--inputDir' || a === '--out') {
      const value = args[i + 1]
      if (value === undefined) {
        // Option given without a value: keep the default instead of undefined.
        console.warn('Missing value for option', a)
        continue
      }
      i++
      if (a === '--inputDir') opts.inputDir = value
      else opts.out = value
      continue
    }
    if (a.startsWith('--')) {
      console.warn('Unknown option', a)
      continue
    }
    // Bare arguments are treated as explicit input file paths.
    opts.files.push(a)
  }
  return opts
}
/**
 * Parse one CSV record line into an array of field strings.
 * Supports double-quoted fields and the "" escape for a literal quote.
 * Does not handle fields spanning multiple physical lines.
 *
 * @param {string} line - A single line of CSV text (no trailing newline).
 * @returns {string[]} The parsed field values.
 */
function parseCSVLine(line) {
  const fields = []
  let field = ''
  let quoted = false
  let i = 0
  while (i < line.length) {
    const ch = line[i]
    if (quoted) {
      if (ch !== '"') {
        field += ch
      } else if (line[i + 1] === '"') {
        // Doubled quote inside a quoted field -> literal quote character.
        field += '"'
        i++
      } else {
        // Closing quote: leave quoted mode without emitting anything.
        quoted = false
      }
    } else if (ch === ',') {
      fields.push(field)
      field = ''
    } else if (ch === '"') {
      quoted = true
    } else {
      field += ch
    }
    i++
  }
  // The final field has no trailing comma; flush it.
  fields.push(field)
  return fields
}
/**
 * Render a value as a CSV field. Null/undefined become the empty string;
 * values containing quotes, commas, or line breaks are wrapped in double
 * quotes with embedded quotes doubled. Everything else passes through as-is.
 *
 * @param {*} value - Any value; stringified before escaping.
 * @returns {string} A CSV-safe field.
 */
function csvEscape(value) {
  if (value == null) return ''
  const s = String(value)
  // Quoting is only required when a delimiter, quote, or newline appears.
  if (!/[",\n\r]/.test(s)) return s
  return `"${s.replace(/"/g, '""')}"`
}
/**
 * Read one CSV file and return its header list plus its rows as objects
 * keyed by header name. Accepts both LF and CRLF line endings; the first
 * non-blank line is the header row, blank lines are skipped, missing
 * values become '' and extra values beyond the header count are dropped.
 *
 * @param {string} filePath - Absolute or relative path to the CSV file.
 * @returns {Promise<{headers: string[], rows: Object<string,string>[]}>}
 */
async function readCSVFile(filePath) {
  const text = await fs.readFile(filePath, 'utf8')
  // Split on LF or CRLF so Windows-produced files also parse.
  const lines = text.split(/\r?\n/)
  // The header is the first line with any non-whitespace content.
  const headerIdx = lines.findIndex((l) => l.trim().length > 0)
  if (headerIdx === -1) return { headers: [], rows: [] }
  const headers = parseCSVLine(lines[headerIdx])
  const rows = []
  for (const line of lines.slice(headerIdx + 1)) {
    if (line == null || line.trim() === '') continue
    const values = parseCSVLine(line)
    const record = {}
    headers.forEach((h, j) => { record[h] = values[j] ?? '' })
    rows.push(record)
  }
  return { headers, rows }
}
/**
 * CLI entry point: collect input CSV paths (from --inputDir and/or explicit
 * arguments), merge their headers and rows, filter the rows of the file whose
 * base name ends in '0' against the first-column values of the file whose base
 * name ends in '3', then write the merged CSV and (if any rows were dropped) a
 * removed_rows.log next to it.
 *
 * Exit codes: 1 = no input files given, 2 = --inputDir unreadable,
 * 3 = any other unhandled error (set by the top-level catch).
 */
async function main() {
  const opts = parseArgs(process.argv)
  if (opts.help) { usage(); return }
  const cwd = process.cwd()
  let files = []
  if (opts.inputDir) {
    const dir = path.isAbsolute(opts.inputDir) ? opts.inputDir : path.join(cwd, opts.inputDir)
    try {
      // Non-recursive directory scan; only .csv files (case-insensitive) count.
      const names = await fs.readdir(dir)
      files = names.filter(n => n.toLowerCase().endsWith('.csv')).map(n => path.join(dir, n))
    } catch (e) {
      console.error('Failed to read inputDir', e.message)
      process.exit(2)
    }
  }
  if (opts.files && opts.files.length) {
    // Explicit file arguments are appended after the directory listing.
    const explicit = opts.files.map(f => path.isAbsolute(f) ? f : path.join(cwd, f))
    files = files.concat(explicit)
  }
  // Deduplicate while preserving first-seen order
  files = [...new Set(files)]
  if (files.length === 0) {
    console.error('No CSV files specified. Use --inputDir or pass file paths.')
    usage();
    process.exit(1)
  }
  const allRows = []
  const headerOrder = []
  const headerSet = new Set()
  // First read every file into memory, recording the last character of each
  // base name (used below to identify "file 0" and "file 3")
  const fileDatas = []
  for (const f of files) {
    try {
      const stat = await fs.stat(f)
      if (!stat.isFile()) { console.warn('Skipping (not a file):', f); continue }
    } catch (e) { console.warn('Skipping (not found):', f); continue }
    const { headers, rows } = await readCSVFile(f)
    const base = path.basename(f)
    const nameNoExt = base.replace(/\.[^/.]+$/, '')
    const lastChar = nameNoExt.slice(-1)
    fileDatas.push({ path: f, headers, rows, nameNoExt, lastChar })
  }
  // Find "file 3" (base name ending in '3') and build a Set of its
  // first-column values to use as the filter key set
  let file3Set = null
  const file3 = fileDatas.find(d => d.lastChar === '3')
  if (file3) {
    const firstHdr = file3.headers && file3.headers.length > 0 ? file3.headers[0] : null
    file3Set = new Set()
    if (firstHdr) {
      for (const r of file3.rows) {
        const v = r[firstHdr]
        if (v != null) file3Set.add(String(v))
      }
    }
  }
  // Now merge headers in original order and collect rows. For "file 0" (base
  // name ending in '0'), when file3Set exists, filter its rows against it:
  const removedRows = []
  for (const d of fileDatas) {
    const { headers, rows, lastChar } = d
    for (const h of headers) {
      if (!headerSet.has(h)) {
        headerSet.add(h)
        headerOrder.push(h)
      }
    }
    if (lastChar === '0' && file3Set) {
      // Use this file's own first column as the join key (the model_name field)
      const firstHdr = headers && headers.length > 0 ? headers[0] : null
      if (!firstHdr) continue
      for (const r of rows) {
        const val = r[firstHdr]
        if (val != null && file3Set.has(String(val))) {
          allRows.push(r)
        } else {
          // Record dropped rows so they can be reported and logged below
          removedRows.push({ source: d.path, key: firstHdr, value: val, row: r })
        }
      }
    } else {
      for (const r of rows) allRows.push(r)
    }
  }
  // Second pass: ensure every row carries every headerOrder field (fill blanks)
  const outRows = allRows.map(r => {
    const o = {}
    for (const h of headerOrder) o[h] = (h in r) ? r[h] : ''
    // Also append any keys first seen after headerOrder was built (in theory
    // all file headers have already been collected into headerOrder)
    for (const k of Object.keys(r)) if (!headerSet.has(k)) { headerSet.add(k); headerOrder.push(k); o[k] = r[k] }
    return o
  })
  // Write out the merged CSV
  let outPath = path.isAbsolute(opts.out) ? opts.out : path.join(cwd, opts.out)
  // If the user-supplied out path ends with a path separator or is an existing
  // directory, write merged.csv inside that directory
  const looksLikeDir = opts.out.endsWith('/') || opts.out.endsWith('\\')
  if (looksLikeDir) {
    outPath = path.join(outPath, 'merged.csv')
  }
  try {
    const st = await fs.stat(outPath)
    if (st.isDirectory()) {
      outPath = path.join(outPath, 'merged.csv')
    }
  } catch (e) {
    // not exists -> will create parent directory below
  }
  const headerLine = headerOrder.map(csvEscape).join(',')
  const lines = [headerLine]
  for (const r of outRows) {
    const vals = headerOrder.map(h => csvEscape(r[h]))
    lines.push(vals.join(','))
  }
  await fs.mkdir(path.dirname(outPath), { recursive: true })
  await fs.writeFile(outPath, lines.join('\n'), 'utf8')
  console.log(`Wrote merged CSV to ${outPath} (${outRows.length} rows, ${headerOrder.length} columns)`)
  // If any rows were removed, echo them and write a log file
  if (removedRows.length > 0) {
    console.log(`Removed ${removedRows.length} rows from files (not present in file3). Logging to removed_rows.log`)
    const logLines = []
    logLines.push(`Removed ${removedRows.length} rows - details:`)
    for (const it of removedRows) {
      logLines.push(`source=${it.source} ${it.key}=${it.value} row=${JSON.stringify(it.row)}`)
    }
    // Write removed_rows.log into the same directory as the output file
    const logPath = path.join(path.dirname(outPath), 'removed_rows.log')
    await fs.writeFile(logPath, logLines.join('\n'), 'utf8')
    // Echo at most 50 log lines to the console to keep output readable
    for (let i = 0; i < Math.min(50, logLines.length); i++) console.log(logLines[i])
    if (logLines.length > 50) console.log(`... see ${logPath} for full log`)
  } else {
    console.log('No rows were removed by file3 filtering.')
  }
}
// Kick off the CLI; any unhandled failure is reported and mapped to exit code 3.
main().catch((err) => {
  const detail = err && err.stack ? err.stack : err
  console.error('Error:', detail)
  process.exit(3)
})
|