TeleAI-AI-Flow's picture
Upload 32 files
309320b verified
#!/usr/bin/env node
import fs from 'fs/promises'
import path from 'path'
/**
 * Print the command-line help text to stdout.
 *
 * The text is emitted with a single console.log call and ends with a
 * newline of its own (console.log then appends another one).
 */
function usage() {
  const helpLines = [
    'Usage:',
    'node scripts/count_column.js --file <file.csv> --column <columnName|index> [--out <out.txt>]',
    'Options:',
    '--file CSV 文件路径(必需)。',
    '--column 指定列名或列序号(1 表示第一列)。如果为数字则按 1-based 处理。',
    '--out 可选,将统计结果写入指定文件(默认:打印到控制台)。',
    '--help 显示帮助。',
    '' // keeps the trailing newline of the help text
  ]
  console.log(helpLines.join('\n'))
}
/**
 * Parse command-line arguments into an options object.
 *
 * Recognised options: `--file`, `--column` and `--out`, each taking a value
 * either as the next argument (`--file a.csv`) or inline (`--file=a.csv`),
 * plus `--help` / `-h`, which stops parsing immediately. The first argument
 * that is not a recognised option is used as the file path when no file has
 * been set yet; any further unrecognised arguments are ignored.
 *
 * @param {string[]} argv - Full argument vector; the first two entries
 *   (runtime and script path) are skipped.
 * @returns {{file: (string|null), column: (string|null), out: (string|null), help: boolean}}
 *   Parsed options. An option whose value is missing stays `null`.
 */
function parseArgs(argv) {
  const opts = { file: null, column: null, out: null, help: false }
  const valueOptions = new Map([
    ['--file', 'file'],
    ['--column', 'column'],
    ['--out', 'out']
  ])
  const args = argv.slice(2)
  for (let i = 0; i < args.length; i++) {
    const arg = args[i]
    if (arg === '--help' || arg === '-h') {
      opts.help = true
      break
    }
    // Split "--flag=value" at the first "=" so the value itself may contain "=".
    const eq = arg.indexOf('=')
    const flag = eq === -1 ? arg : arg.slice(0, eq)
    const key = valueOptions.get(flag)
    if (key !== undefined) {
      // A flag given as the very last argument has no value: keep null
      // rather than storing undefined.
      opts[key] = eq === -1 ? (args[++i] ?? null) : arg.slice(eq + 1)
      continue
    }
    // Positional file path
    if (!opts.file) opts.file = arg
  }
  return opts
}
/**
 * Split one CSV line into its fields.
 *
 * Commas separate fields unless they appear inside double quotes. Inside a
 * quoted section, a doubled quote (`""`) yields one literal quote character.
 * The parser is lenient: a quote met outside a quoted section simply opens
 * one (even in the middle of a field), and an unterminated quoted section
 * runs to the end of the line.
 *
 * @param {string} line - A single line of CSV text without its line ending.
 * @returns {string[]} The field values; always contains at least one entry.
 */
function parseCSVLine(line) {
  const fields = []
  let field = ''
  let quoted = false
  let pos = 0
  while (pos < line.length) {
    const c = line[pos]
    pos += 1
    if (!quoted) {
      switch (c) {
        case ',':
          fields.push(field)
          field = ''
          break
        case '"':
          quoted = true
          break
        default:
          field += c
      }
      continue
    }
    if (c !== '"') {
      field += c
      continue
    }
    // A quote inside a quoted section: either an escaped quote ("") or the end.
    if (line[pos] === '"') {
      field += '"'
      pos += 1
    } else {
      quoted = false
    }
  }
  // The last field has no trailing comma, so flush it explicitly.
  fields.push(field)
  return fields
}
/**
 * Read a CSV file and split it into a header row and data rows.
 *
 * The file is read as UTF-8. A leading byte order mark is removed so that it
 * does not become part of the first header name. The first non-blank line is
 * the header; every later non-blank line is a data row. Each line is parsed
 * with parseCSVLine.
 *
 * Limitation: the text is split on line breaks before parsing, so a quoted
 * field that contains a line break is not supported.
 *
 * @param {string} filePath - Path of the CSV file to read.
 * @returns {Promise<{headers: string[], rows: string[][]}>} Header cells and
 *   data rows; both are empty when the file has no non-blank line.
 */
async function readCSV(filePath) {
  const raw = await fs.readFile(filePath, 'utf8')
  // Decoding as UTF-8 keeps a leading BOM (U+FEFF) in the string; drop it so
  // the first header cell can be matched by name.
  const text = raw.charCodeAt(0) === 0xfeff ? raw.slice(1) : raw
  const records = text
    .split(/\r?\n/)
    .filter(line => line.trim() !== '')
    .map(line => parseCSVLine(line))
  if (records.length === 0) return { headers: [], rows: [] }
  const [headers, ...rows] = records
  return { headers, rows }
}
/**
 * Turn a cell value into the key used for counting.
 *
 * @param {*} k - Raw cell value; may be null or undefined (e.g. a short row).
 * @returns {string} An empty string for null/undefined, otherwise the value
 *   converted to a string with surrounding whitespace removed.
 */
function normalizeKey(k) {
  const isMissing = k === null || k === undefined
  return isMissing ? '' : String(k).trim()
}
/**
 * Find the header that matches a column name.
 *
 * Matching gets progressively more lenient: exact match first, then
 * case-insensitive, then case-insensitive with surrounding whitespace
 * ignored on both sides (covers headers written as `a, b, c`).
 *
 * @param {string[]} headers - Header cells of the CSV file.
 * @param {string} name - Column name requested by the user.
 * @returns {number} Zero-based index of the matching header, or -1.
 */
function findHeaderIndex(headers, name) {
  const exact = headers.indexOf(name)
  if (exact !== -1) return exact
  const lowered = name.toLowerCase()
  const caseInsensitive = headers.findIndex(h => String(h).toLowerCase() === lowered)
  if (caseInsensitive !== -1) return caseInsensitive
  const wanted = lowered.trim()
  return headers.findIndex(h => String(h).trim().toLowerCase() === wanted)
}

/**
 * Quote a value for the "Value,Count" listing when it would otherwise be
 * ambiguous: values containing a comma, a double quote or a line break are
 * wrapped in double quotes with inner quotes doubled. Other values are
 * returned unchanged.
 *
 * @param {*} value - Value to render.
 * @returns {string} The value as a single CSV field.
 */
function escapeCSVField(value) {
  const text = String(value)
  return /[",\r\n]/.test(text) ? `"${text.replace(/"/g, '""')}"` : text
}

/**
 * Command-line entry point: count how often each distinct value occurs in one
 * column of a CSV file and print the result or write it to a file.
 *
 * Exit codes set through process.exit:
 *   1 - --file or --column missing
 *   2 - the input path cannot be accessed
 *   3 - the input path is not a regular file
 *   4 - the CSV has no header line
 *   5 - numeric column index out of range
 *   6 - column name not found
 *
 * @returns {Promise<void>}
 */
async function main() {
  const opts = parseArgs(process.argv)
  if (opts.help) {
    usage()
    return
  }
  if (!opts.file || !opts.column) {
    console.error('Missing --file or --column')
    usage()
    process.exit(1)
  }

  const filePath = path.isAbsolute(opts.file) ? opts.file : path.join(process.cwd(), opts.file)
  let stat
  try {
    stat = await fs.stat(filePath)
  } catch (e) {
    // Only a missing path is "not found"; report other failures (for example
    // a permission error) with their own message.
    if (e && e.code && e.code !== 'ENOENT') {
      console.error(`Cannot access file: ${filePath} (${e.message})`)
    } else {
      console.error('File not found:', filePath)
    }
    process.exit(2)
  }
  if (!stat.isFile()) {
    console.error('Not a file:', filePath)
    process.exit(3)
  }

  const { headers, rows } = await readCSV(filePath)
  if (!headers || headers.length === 0) {
    console.error('No header found in CSV')
    process.exit(4)
  }

  // Resolve the column: an all-digit argument is a 1-based index, anything
  // else is matched against the header names.
  let colIndex = -1
  if (/^\d+$/.test(opts.column)) {
    colIndex = Number.parseInt(opts.column, 10) - 1
    if (colIndex < 0 || colIndex >= headers.length) {
      console.error('Column index out of range')
      process.exit(5)
    }
  } else {
    colIndex = findHeaderIndex(headers, opts.column)
    if (colIndex === -1) {
      console.error(`Column name not found: ${opts.column}`)
      process.exit(6)
    }
  }

  // Rows shorter than the header have no cell at colIndex; normalizeKey maps
  // that missing cell to the empty string, so it is counted under ''.
  const counts = new Map()
  for (const row of rows) {
    const key = normalizeKey(row[colIndex])
    counts.set(key, (counts.get(key) ?? 0) + 1)
  }

  // Sort by count descending, then by value ascending.
  const items = Array.from(counts.entries()).sort((a, b) => {
    if (b[1] !== a[1]) return b[1] - a[1]
    return String(a[0]).localeCompare(String(b[0]))
  })

  const outLines = [
    `File: ${filePath}`,
    `Column: ${headers[colIndex]} (index ${colIndex + 1})`,
    `Total distinct classes: ${items.length}`,
    '',
    'Value,Count'
  ]
  for (const [val, cnt] of items) outLines.push(`${escapeCSVField(val)},${cnt}`)

  if (opts.out) {
    const outPath = path.isAbsolute(opts.out) ? opts.out : path.join(process.cwd(), opts.out)
    // Create missing parent directories so --out may point into a new folder.
    await fs.mkdir(path.dirname(outPath), { recursive: true })
    await fs.writeFile(outPath, outLines.join('\n'), 'utf8')
    console.log(`Wrote counts to ${outPath} (${items.length} distinct)`)
  } else {
    console.log(outLines.join('\n'))
  }
}
// Run the CLI; any rejection is reported (with its stack when available) and
// the process exits with status 10.
main().catch((failure) => {
  const detail = (failure && failure.stack) || failure
  console.error('Error:', detail)
  process.exit(10)
})