Bentopdf / src /js /workflow /nodes /pdf-to-csv-node.ts
AUXteam's picture
Upload folder using huggingface_hub
1b756c8 verified
import { ClassicPreset } from 'rete';
import { BaseWorkflowNode } from './base-node';
import { pdfSocket } from '../sockets';
import type { SocketData } from '../types';
import { requirePdfInput, extractAllPdfs } from '../types';
import { downloadFile } from '../../utils/helpers.js';
import { loadPyMuPDF } from '../../utils/pymupdf-loader.js';
function tableToCsv(rows: (string | null)[][]): string {
return rows
.map((row) =>
row
.map((cell) => {
const str = String(cell ?? '');
if (str.includes(',') || str.includes('"') || str.includes('\n')) {
return `"${str.replace(/"/g, '""')}"`;
}
return str;
})
.join(',')
)
.join('\n');
}
export class PdfToCsvNode extends BaseWorkflowNode {
readonly category = 'Output' as const;
readonly icon = 'ph-file-csv';
readonly description = 'Extract tables from PDF to CSV';
constructor() {
super('PDF to CSV');
this.addInput('pdf', new ClassicPreset.Input(pdfSocket, 'PDF'));
}
private async extractTables(bytes: Uint8Array): Promise<(string | null)[][]> {
const pymupdf = await loadPyMuPDF();
const blob = new Blob([new Uint8Array(bytes)], { type: 'application/pdf' });
const doc = await pymupdf.open(blob);
const allRows: (string | null)[][] = [];
try {
const pageCount = doc.pageCount;
for (let i = 0; i < pageCount; i++) {
const page = doc.getPage(i);
const tables = page.findTables();
tables.forEach((table: any) => {
allRows.push(...table.rows);
});
}
} finally {
doc.close();
}
return allRows;
}
async data(
inputs: Record<string, SocketData[]>
): Promise<Record<string, SocketData>> {
const pdfInputs = requirePdfInput(inputs, 'PDF to CSV');
const allPdfs = extractAllPdfs(pdfInputs);
if (allPdfs.length === 1) {
const allRows = await this.extractTables(allPdfs[0].bytes);
if (allRows.length === 0) {
throw new Error('No tables found in PDF');
}
const csv = tableToCsv(allRows);
const csvBlob = new Blob([csv], { type: 'text/csv;charset=utf-8;' });
const name = allPdfs[0].filename.replace(/\.pdf$/i, '') + '.csv';
downloadFile(csvBlob, name);
} else {
const JSZip = (await import('jszip')).default;
const zip = new JSZip();
for (const pdf of allPdfs) {
const allRows = await this.extractTables(pdf.bytes);
if (allRows.length === 0) continue;
const csv = tableToCsv(allRows);
const name = pdf.filename.replace(/\.pdf$/i, '') + '.csv';
zip.file(name, csv);
}
const zipBlob = await zip.generateAsync({ type: 'blob' });
downloadFile(zipBlob, 'csv_files.zip');
}
return {};
}
}