ajihakim201 committed on
Commit
75331cf
·
verified ·
1 Parent(s): 820469a

Update server.js

Browse files
Files changed (1) hide show
  1. server.js +28 -19
server.js CHANGED
@@ -2,16 +2,15 @@ const express = require("express");
2
  const Tesseract = require("tesseract.js");
3
  const fs = require("fs");
4
  const path = require("path");
5
- const { fromPath } = require("pdf2pic");
6
- // gunakan ImageMagick mode agar gm tidak cari binary gm sendiri
7
- const gm = require('gm').subClass({ imageMagick: true });
8
-
9
 
10
  const app = express();
11
- app.use(express.json({ limit: '50mb' }));
12
 
13
  // Buat folder uploads (dalam /tmp agar bisa ditulis di HuggingFace)
14
- const uploadDir = path.join('/tmp', 'uploads');
15
  if (!fs.existsSync(uploadDir)) fs.mkdirSync(uploadDir, { recursive: true });
16
 
17
  app.post("/ocr", async (req, res) => {
@@ -23,33 +22,43 @@ app.post("/ocr", async (req, res) => {
23
  const pdfPath = path.join(uploadDir, `file_${Date.now()}.pdf`);
24
  fs.writeFileSync(pdfPath, Buffer.from(file, "base64"));
25
 
26
- // Convert PDF → PNG (halaman pertama)
27
- const options = {
 
 
 
 
 
28
  density: 150,
29
  saveFilename: "page",
30
  savePath: uploadDir,
31
  format: "png",
32
  width: 1024,
33
- height: 1024
34
- };
35
- const storeAsImage = fromPath(pdfPath, options);
36
- const pageToConvertAsImage = 1;
37
- const result = await storeAsImage(pageToConvertAsImage);
 
 
 
 
 
38
 
39
- // OCR dengan Tesseract
40
- const { data: { text } } = await Tesseract.recognize(result.path, "ind+eng");
 
41
 
42
- // Hapus file sementara
43
  fs.unlinkSync(pdfPath);
44
- fs.unlinkSync(result.path);
45
 
46
- res.json({ success: true, text });
47
  } catch (error) {
48
  res.status(500).json({ success: false, error: error.message });
49
  }
50
  });
51
 
52
  const PORT = process.env.PORT || 7860;
53
- app.listen(PORT, '0.0.0.0', () => {
54
  console.log(`🚀 OCR server running on http://0.0.0.0:${PORT}`);
55
  });
 
2
  const Tesseract = require("tesseract.js");
3
  const fs = require("fs");
4
  const path = require("path");
5
+ const PDF2Pic = require("pdf2pic");
6
+ const { PDFDocument } = require("pdf-lib"); // untuk hitung jumlah halaman
7
+ const gm = require("gm").subClass({ imageMagick: true });
 
8
 
9
  const app = express();
10
+ app.use(express.json({ limit: "50mb" }));
11
 
12
  // Buat folder uploads (dalam /tmp agar bisa ditulis di HuggingFace)
13
+ const uploadDir = path.join("/tmp", "uploads");
14
  if (!fs.existsSync(uploadDir)) fs.mkdirSync(uploadDir, { recursive: true });
15
 
16
  app.post("/ocr", async (req, res) => {
 
22
  const pdfPath = path.join(uploadDir, `file_${Date.now()}.pdf`);
23
  fs.writeFileSync(pdfPath, Buffer.from(file, "base64"));
24
 
25
+ // Hitung jumlah halaman PDF
26
+ const pdfBuffer = fs.readFileSync(pdfPath);
27
+ const pdfDoc = await PDFDocument.load(pdfBuffer);
28
+ const totalPages = pdfDoc.getPageCount();
29
+
30
+ // Konfigurasi pdf2pic
31
+ const pdf2pic = new PDF2Pic({
32
  density: 150,
33
  saveFilename: "page",
34
  savePath: uploadDir,
35
  format: "png",
36
  width: 1024,
37
+ height: 1024,
38
+ });
39
+
40
+ let allText = "";
41
+
42
+ // Loop semua halaman PDF
43
+ for (let page = 1; page <= totalPages; page++) {
44
+ const result = await pdf2pic.convert(pdfPath, page);
45
+ const { data: { text } } = await Tesseract.recognize(result.path, "ind+eng");
46
+ allText += `\n\n--- Halaman ${page} ---\n${text}`;
47
 
48
+ // Hapus file gambar sementara
49
+ if (fs.existsSync(result.path)) fs.unlinkSync(result.path);
50
+ }
51
 
52
+ // Hapus file PDF sementara
53
  fs.unlinkSync(pdfPath);
 
54
 
55
+ res.json({ success: true, text: allText });
56
  } catch (error) {
57
  res.status(500).json({ success: false, error: error.message });
58
  }
59
  });
60
 
61
  const PORT = process.env.PORT || 7860;
62
+ app.listen(PORT, "0.0.0.0", () => {
63
  console.log(`🚀 OCR server running on http://0.0.0.0:${PORT}`);
64
  });