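/*
 * ocr-server / server.js
 * Last commit: "Update server.js" by ajihakim201 (75331cf, verified), 2.07 kB.
 * (The original lines here were web file-viewer residue: "picture", "raw",
 * "history blame" link labels.)
 */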
const express = require("express");
const Tesseract = require("tesseract.js");
const fs = require("fs");
const path = require("path");
const PDF2Pic = require("pdf2pic");
const { PDFDocument } = require("pdf-lib"); // untuk hitung jumlah halaman
const gm = require("gm").subClass({ imageMagick: true });
// Express application. JSON bodies of up to 50 MB are accepted because the
// /ocr route receives the whole PDF as a base64 string inside the JSON body.
const app = express();
app.use(express.json({ limit: "50mb" }));

// Working folder for temporary files; it lives under /tmp so that it is
// writable on HuggingFace.
const uploadDir = path.join("/tmp", "uploads");
// With `recursive: true`, mkdirSync does not fail when the directory already
// exists, so a separate (and racy) existsSync check is unnecessary.
fs.mkdirSync(uploadDir, { recursive: true });
/**
 * POST /ocr: run OCR over every page of a base64-encoded PDF.
 *
 * Request body (JSON): `{ file: string }`, the PDF encoded as base64.
 *
 * Responses:
 *   200 `{ success: true, text }`   - recognised text of all pages; each page
 *                                     is preceded by a "--- Halaman N ---" line.
 *   400 `{ success: false, error }` - `file` is missing or not a non-empty string.
 *   500 `{ success: false, error }` - any failure while parsing, rendering or
 *                                     recognising the document.
 *
 * Each request works inside its own temporary directory under `uploadDir`, so
 * concurrent requests cannot overwrite each other's PDF or page images, and
 * the directory is removed in `finally` so nothing leaks when a step throws.
 */
app.post("/ocr", async (req, res) => {
  // `req.body` may be undefined when no JSON body was parsed.
  const { file } = req.body || {};
  if (typeof file !== "string" || file.length === 0) {
    return res.status(400).json({ success: false, error: "Base64 file string not provided" });
  }

  let jobDir;
  try {
    // Unique per-request scratch directory: <uploadDir>/job-XXXXXX
    jobDir = await fs.promises.mkdtemp(path.join(uploadDir, "job-"));

    // Decode once and reuse the buffer for both page counting and the on-disk
    // copy, instead of writing the file and reading it straight back.
    const pdfBuffer = Buffer.from(file, "base64");

    // Count the PDF pages (also rejects input that is not a parseable PDF
    // before anything is written to disk).
    const pdfDoc = await PDFDocument.load(pdfBuffer);
    const totalPages = pdfDoc.getPageCount();

    // The converter below is given a file path, so persist the PDF.
    const pdfPath = path.join(jobDir, "input.pdf");
    await fs.promises.writeFile(pdfPath, pdfBuffer);

    // pdf2pic configuration.
    // NOTE(review): `new PDF2Pic(...)` + `.convert(path, page)` looks like the
    // pdf2pic 1.x API, while these option names (saveFilename / savePath /
    // width / height) are the ones documented for 2.x+, which exports
    // `fromPath` instead of a constructor. Confirm against the installed
    // version; the call shape is intentionally left unchanged here.
    const pdf2pic = new PDF2Pic({
      density: 150,
      saveFilename: "page",
      savePath: jobDir,
      format: "png",
      width: 1024,
      height: 1024,
    });

    // Process pages one at a time, in order, so the output keeps page order.
    const sections = [];
    for (let page = 1; page <= totalPages; page++) {
      const result = await pdf2pic.convert(pdfPath, page);
      const { data: { text } } = await Tesseract.recognize(result.path, "ind+eng");
      sections.push(`\n\n--- Halaman ${page} ---\n${text}`);

      // Drop the page image right away so disk usage stays bounded for long
      // documents; the `finally` block below is the safety net.
      await fs.promises.rm(result.path, { force: true });
    }

    res.json({ success: true, text: sections.join("") });
  } catch (error) {
    // Keep a server-side trace; the client only receives the message.
    console.error("OCR request failed:", error);
    res.status(500).json({ success: false, error: error.message });
  } finally {
    if (jobDir) {
      try {
        await fs.promises.rm(jobDir, { recursive: true, force: true });
      } catch (cleanupError) {
        // The response has already been sent; a cleanup failure must not
        // surface as an unhandled rejection.
        console.error("Failed to remove temporary OCR directory:", cleanupError);
      }
    }
  }
});
// Port to listen on: the PORT environment variable when set, otherwise 7860.
// `||` (not `??`) is deliberate so that an empty PORT string also falls back.
const PORT = process.env.PORT || 7860;

// Bind to 0.0.0.0 so the server accepts connections on all interfaces.
app.listen(PORT, "0.0.0.0", () => {
  // The original log string held a mis-decoded rocket emoji; restored here.
  console.log(`🚀 OCR server running on http://0.0.0.0:${PORT}`);
});