ajihakim201 committed on
Commit
5b236d7
·
verified ·
1 Parent(s): c807ff6

Upload 3 files

Browse files
Files changed (3) hide show
  1. Dockerfile +24 -0
  2. package.json +19 -0
  3. server.js +52 -0
Dockerfile ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM node:18

# Install Tesseract (with Indonesian language data) and poppler-utils
# for OCR and PDF conversion.
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-ind \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Create the app directory and hand it to the unprivileged "node" user
# (UID 1000 in the official node image). Hugging Face Docker Spaces run the
# container as UID 1000, and server.js creates ./uploads and writes temporary
# files at runtime, so /app must be writable by that user.
WORKDIR /app
RUN chown node:node /app
USER node

# Copy package.json and install dependencies
COPY --chown=node:node package.json ./
RUN npm install

# Copy the rest of the project files
COPY --chown=node:node . .

# Expose the port the server listens on
EXPOSE 7860

# Start the server
CMD ["npm", "start"]
package.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "ocr-server",
3
+ "version": "1.0.0",
4
+ "main": "server.js",
5
+ "scripts": {
6
+ "start": "node server.js"
7
+ },
8
+ "keywords": [],
9
+ "author": "",
10
+ "license": "ISC",
11
+ "description": "",
12
+ "dependencies": {
13
+ "express": "^5.1.0",
14
+ "multer": "^2.0.2",
15
+ "pdf-parse": "^1.1.1",
16
+ "pdf-poppler": "^0.2.1",
17
+ "tesseract.js": "^6.0.1"
18
+ }
19
+ }
server.js ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ const express = require('express');
2
+ const Tesseract = require('tesseract.js');
3
+ const fs = require('fs');
4
+ const path = require('path');
5
+ const { convert } = require('pdf-poppler');
6
+
7
// Express application. JSON bodies of up to 50 MB are accepted because the
// /ocr route receives the whole PDF as a base64 string inside the JSON body.
const app = express();
const jsonBodyParser = express.json({ limit: '50mb' });
app.use(jsonBodyParser);

// Directory for temporary files; created at startup if it does not exist yet.
const uploadDir = path.join(__dirname, 'uploads');
const uploadDirExists = fs.existsSync(uploadDir);
if (uploadDirExists === false) {
  fs.mkdirSync(uploadDir);
}
13
+
14
/**
 * POST /ocr — run OCR on the first page of a base64-encoded PDF.
 *
 * Request body (JSON): { file: "<base64-encoded PDF>" }
 *
 * Responses:
 *   200 { success: true, text }     recognized text of page 1
 *   400 { success: false, error }   `file` missing, not a string, or decodes to nothing
 *   500 { success: false, error }   conversion or OCR failed
 *
 * Each request works inside its own temporary directory under `uploadDir`,
 * which is removed in `finally` whether or not the request succeeds.
 *
 * NOTE(review): pdf-poppler reportedly bundles binaries only for Windows and
 * macOS — confirm it loads inside the Linux container before relying on it.
 */
app.post('/ocr', async (req, res) => {
  // req.body may be undefined when the request carried no JSON payload.
  const { file } = req.body || {};
  if (typeof file !== 'string' || file.length === 0) {
    return res.status(400).json({ success: false, error: 'Base64 file string not provided' });
  }

  // Buffer.from() silently drops non-base64 characters, so garbage input
  // decodes to an empty buffer rather than throwing.
  const pdfBuffer = Buffer.from(file, 'base64');
  if (pdfBuffer.length === 0) {
    return res.status(400).json({ success: false, error: 'File is not valid base64 data' });
  }

  let workDir;
  try {
    // A unique directory per request avoids filename collisions between
    // concurrent requests (Date.now() alone is not unique).
    workDir = await fs.promises.mkdtemp(path.join(uploadDir, 'job-'));

    // Save base64 → PDF
    const pdfPath = path.join(workDir, 'input.pdf');
    await fs.promises.writeFile(pdfPath, pdfBuffer);

    // Convert PDF → PNG (first page only)
    await convert(pdfPath, {
      format: 'png',
      out_dir: workDir,
      out_prefix: 'page',
      page: 1, // first page only
      scale: 1024, // output resolution
    });

    // The page-number suffix of the output file may be zero-padded depending
    // on the document's page count (e.g. "page-01.png"), so look the file up
    // instead of assuming "-1.png".
    const workDirEntries = await fs.promises.readdir(workDir);
    const imageName = workDirEntries.find((entry) => entry.endsWith('.png'));
    if (!imageName) {
      throw new Error('PDF conversion produced no image');
    }

    // OCR with Tesseract (Indonesian + English)
    const { data: { text } } = await Tesseract.recognize(path.join(workDir, imageName), 'ind+eng');

    res.json({ success: true, text });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  } finally {
    // Always remove temporary files, including on failure. A cleanup error is
    // logged rather than thrown because the response has already been sent.
    if (workDir) {
      await fs.promises
        .rm(workDir, { recursive: true, force: true })
        .catch((cleanupError) => {
          console.error(`Failed to remove ${workDir}: ${cleanupError.message}`);
        });
    }
  }
});
47
+
48
// Listening port: taken from the PORT environment variable, falling back to
// 7860, the port Hugging Face Spaces uses.
const PORT = process.env.PORT || 7860;

// Logs the address once the server has started listening.
function announceStartup() {
  console.log(`🚀 OCR server running on http://0.0.0.0:${PORT}`);
}

app.listen(PORT, "0.0.0.0", announceStartup);