File size: 4,846 Bytes
309320b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 |
#!/usr/bin/env node
import fs from 'fs/promises'
import path from 'path'
/**
 * Print the command-line help text to standard output.
 *
 * The text is emitted with a single console.log call and ends with an
 * empty line.
 */
function usage() {
  const helpLines = [
    'Usage:',
    'node scripts/count_column.js --file <file.csv> --column <columnName|index> [--out <out.txt>]',
    'Options:',
    '--file CSV 文件路径(必需)。',
    '--column 指定列名或列序号(1 表示第一列)。如果为数字则按 1-based 处理。',
    '--out 可选,将统计结果写入指定文件(默认:打印到控制台)。',
    '--help 显示帮助。',
    '',
  ]
  console.log(helpLines.join('\n'))
}
/**
 * Parse the command-line arguments of the script.
 *
 * @param {string[]} argv - Full argument vector; the first two entries are skipped.
 * @returns {{file: ?string, column: ?string, out: ?string, help?: boolean}}
 *   Parsed options. `help` is only present (and true) when --help / -h was seen;
 *   parsing stops at that flag. A value flag given as the last argument yields
 *   `undefined` for that option.
 */
function parseArgs(argv) {
  const parsed = { file: null, column: null, out: null }
  const tokens = argv.slice(2)
  let pos = 0
  while (pos < tokens.length) {
    const token = tokens[pos]
    switch (token) {
      case '--help':
      case '-h':
        // Help short-circuits everything that follows.
        parsed.help = true
        return parsed
      case '--file':
        parsed.file = tokens[pos + 1]
        pos += 2
        break
      case '--column':
        parsed.column = tokens[pos + 1]
        pos += 2
        break
      case '--out':
        parsed.out = tokens[pos + 1]
        pos += 2
        break
      default:
        // The first unrecognised token acts as a positional file path;
        // later ones are ignored once a file is set.
        if (!parsed.file) parsed.file = token
        pos += 1
    }
  }
  return parsed
}
/**
 * Split one CSV line into its fields.
 *
 * A simple single-line parser: commas separate fields, a double quote
 * switches quoted mode on or off, and inside quoted mode a doubled quote
 * ("") stands for one literal quote character. Commas inside quoted mode
 * are kept as part of the field.
 *
 * @param {string} line - One line of CSV text (without the line terminator).
 * @returns {string[]} The field values; always at least one element.
 */
function parseCSVLine(line) {
  const fields = []
  let field = ''
  let quoted = false
  let pos = 0
  while (pos < line.length) {
    const c = line[pos]
    pos += 1

    if (c === '"') {
      if (!quoted) {
        quoted = true
      } else if (pos < line.length && line[pos] === '"') {
        // Doubled quote inside a quoted section: emit one quote, skip the pair.
        field += '"'
        pos += 1
      } else {
        quoted = false
      }
      continue
    }

    if (c === ',' && !quoted) {
      fields.push(field)
      field = ''
      continue
    }

    field += c
  }
  // The last field has no trailing comma, so flush it explicitly.
  fields.push(field)
  return fields
}
/**
 * Read a CSV file and split it into a header row and data rows.
 *
 * The first non-blank line is taken as the header; every later non-blank
 * line becomes one row. Lines are parsed with parseCSVLine, so a quoted
 * field that spans several physical lines is not supported (the text is
 * split on line breaks before any field parsing happens).
 *
 * @param {string} filePath - Path of the CSV file, read as UTF-8.
 * @returns {Promise<{headers: string[], rows: string[][]}>} Header fields and
 *   row fields; both arrays are empty when the file has no non-blank line.
 */
async function readCSV(filePath) {
  const raw = await fs.readFile(filePath, 'utf8')
  // Drop a leading UTF-8 byte-order mark; otherwise it ends up glued to the
  // first header name and lookups by column name silently fail.
  const txt = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw
  const lines = txt.split(/\r?\n/)

  const headerLineIndex = lines.findIndex((line) => line.trim().length > 0)
  if (headerLineIndex === -1) return { headers: [], rows: [] }

  const headers = parseCSVLine(lines[headerLineIndex])
  const rows = lines
    .slice(headerLineIndex + 1)
    .filter((line) => line.trim() !== '')
    .map((line) => parseCSVLine(line))
  return { headers, rows }
}
/**
 * Turn a cell value into the key used for counting.
 *
 * @param {*} k - Raw cell value; may be null or undefined.
 * @returns {string} Empty string for null/undefined, otherwise the value
 *   converted to a string with surrounding whitespace removed.
 */
function normalizeKey(k) {
  return k == null ? '' : String(k).trim()
}
/**
 * Locate a column by header name.
 *
 * Tries an exact match first, then a case-insensitive match, and finally a
 * case-insensitive match that ignores surrounding whitespace (String#trim
 * also removes a byte-order mark) on both sides.
 *
 * @param {string[]} headers - Header fields of the CSV file.
 * @param {string} name - Column name requested by the user.
 * @returns {number} Zero-based column index, or -1 when nothing matches.
 */
function findColumnByName(headers, name) {
  const exact = headers.indexOf(name)
  if (exact !== -1) return exact

  const lowered = name.toLowerCase()
  const ignoringCase = headers.findIndex((h) => String(h).toLowerCase() === lowered)
  if (ignoringCase !== -1) return ignoringCase

  const wanted = lowered.trim()
  return headers.findIndex((h) => String(h).trim().toLowerCase() === wanted)
}

/**
 * Quote a value for CSV output when it would otherwise be ambiguous.
 *
 * @param {*} value - Value to emit.
 * @returns {string} The value unchanged when it has no comma, double quote
 *   or line break; otherwise wrapped in double quotes with inner quotes doubled.
 */
function escapeCSVField(value) {
  const text = String(value)
  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
}

/**
 * CLI entry point: count how often each distinct value occurs in one column
 * of a CSV file and print the result or write it to the --out file.
 *
 * Options come from process.argv via parseArgs. The column is either a
 * 1-based index (when the argument is all digits) or a header name.
 *
 * Process exit codes used here: 1 missing --file/--column, 2 file cannot be
 * accessed, 3 path is not a regular file, 4 no header row, 5 column index
 * out of range, 6 column name not found.
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs(process.argv)
  if (opts.help) { usage(); return }
  if (!opts.file || !opts.column) {
    console.error('Missing --file or --column')
    usage()
    process.exit(1)
  }

  const filePath = path.resolve(opts.file)
  let stat
  try {
    stat = await fs.stat(filePath)
  } catch (err) {
    // Only ENOENT really means "not found"; report anything else (e.g. a
    // permission error) with its own message instead of mislabelling it.
    if (err && err.code === 'ENOENT') {
      console.error('File not found:', filePath)
    } else {
      console.error('Cannot access file:', filePath, `(${err && err.message ? err.message : err})`)
    }
    process.exit(2)
  }
  if (!stat.isFile()) { console.error('Not a file:', filePath); process.exit(3) }

  const { headers, rows } = await readCSV(filePath)
  if (!headers || headers.length === 0) { console.error('No header found in CSV'); process.exit(4) }

  // Resolve the column: an all-digit argument is a 1-based index, anything
  // else is matched against the header names.
  let colIndex
  if (/^\d+$/.test(opts.column)) {
    colIndex = Number.parseInt(opts.column, 10) - 1
    if (colIndex < 0 || colIndex >= headers.length) { console.error('Column index out of range'); process.exit(5) }
  } else {
    colIndex = findColumnByName(headers, opts.column)
    if (colIndex === -1) { console.error(`Column name not found: ${opts.column}`); process.exit(6) }
  }

  // Tally the normalized cell values; rows shorter than the header count
  // towards the empty-string key.
  const counts = new Map()
  for (const vals of rows) {
    const key = normalizeKey(vals[colIndex])
    counts.set(key, (counts.get(key) || 0) + 1)
  }

  // Sort by count descending, then by value ascending.
  const items = Array.from(counts.entries()).sort((a, b) => {
    if (b[1] !== a[1]) return b[1] - a[1]
    return String(a[0]).localeCompare(String(b[0]))
  })

  const outLines = [
    `File: ${filePath}`,
    `Column: ${headers[colIndex]} (index ${colIndex + 1})`,
    `Total distinct classes: ${items.length}`,
    '',
    'Value,Count',
  ]
  // Values may themselves contain commas or quotes, so escape them to keep
  // every "Value,Count" row parseable.
  for (const [val, cnt] of items) outLines.push(`${escapeCSVField(val)},${cnt}`)

  if (opts.out) {
    const outPath = path.resolve(opts.out)
    await fs.mkdir(path.dirname(outPath), { recursive: true })
    await fs.writeFile(outPath, outLines.join('\n'), 'utf8')
    console.log(`Wrote counts to ${outPath} (${items.length} distinct)`)
  } else {
    console.log(outLines.join('\n'))
  }
}
// Run the CLI; any rejection from main() is reported (with its stack when
// one is available) and the process exits with code 10.
main().catch((failure) => {
  const detail = failure && failure.stack ? failure.stack : failure
  console.error('Error:', detail)
  process.exit(10)
})
|