Spaces:
Sleeping
Sleeping
Upload 3 files
Browse files- Dockerfile +24 -0
- package.json +19 -0
- server.js +52 -0
Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM node:18

# Install tesseract + poppler-utils for OCR and PDF conversion
RUN apt-get update && apt-get install -y \
    tesseract-ocr \
    tesseract-ocr-ind \
    poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Create the app folder and hand it to the unprivileged "node" user that the
# base image ships with (UID 1000). Hugging Face Docker Spaces run the
# container as UID 1000, so a root-owned /app would make every runtime write
# (e.g. the server creating its uploads/ folder) fail with EACCES.
WORKDIR /app
RUN chown node:node /app
USER node

# Copy package.json and install dependencies
COPY --chown=node:node package.json ./
RUN npm install

# Copy all remaining files
COPY --chown=node:node . .

# Expose port
EXPOSE 7860

# Start the server
CMD ["npm", "start"]
|
package.json
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "ocr-server",
|
| 3 |
+
"version": "1.0.0",
|
| 4 |
+
"main": "server.js",
|
| 5 |
+
"scripts": {
|
| 6 |
+
"start": "node server.js"
|
| 7 |
+
},
|
| 8 |
+
"keywords": [],
|
| 9 |
+
"author": "",
|
| 10 |
+
"license": "ISC",
|
| 11 |
+
"description": "",
|
| 12 |
+
"dependencies": {
|
| 13 |
+
"express": "^5.1.0",
|
| 14 |
+
"multer": "^2.0.2",
|
| 15 |
+
"pdf-parse": "^1.1.1",
|
| 16 |
+
"pdf-poppler": "^0.2.1",
|
| 17 |
+
"tesseract.js": "^6.0.1"
|
| 18 |
+
}
|
| 19 |
+
}
|
server.js
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
const express = require('express');
|
| 2 |
+
const Tesseract = require('tesseract.js');
|
| 3 |
+
const fs = require('fs');
|
| 4 |
+
const path = require('path');
|
| 5 |
+
const { convert } = require('pdf-poppler');
|
| 6 |
+
|
| 7 |
+
// Express application; JSON bodies up to 50 MB are accepted so that
// base64-encoded PDFs fit in a single request.
const app = express();
app.use(express.json({ limit: '50mb' }));

// Scratch folder for uploaded PDFs and the images rendered from them.
// `recursive: true` makes the call a no-op when the folder already exists,
// so no separate existence check (and no check-then-create race) is needed.
const uploadDir = path.join(__dirname, 'uploads');
fs.mkdirSync(uploadDir, { recursive: true });
|
| 13 |
+
|
| 14 |
+
/**
 * POST /ocr — run OCR on the first page of a base64-encoded PDF.
 *
 * Request body (JSON): { file: "<base64-encoded PDF>" }
 *
 * Responses:
 *   200 { success: true, text }     text recognised on page 1
 *   400 { success: false, error }   `file` is missing or not a string
 *   500 { success: false, error }   PDF conversion or OCR failed
 *
 * NOTE(review): pdf-poppler appears to bundle binaries for macOS/Windows
 * only — confirm it actually loads inside the Linux container.
 */
app.post('/ocr', async (req, res) => {
  // req.body may be undefined when the request carried no parsable JSON.
  const { file } = req.body || {};
  if (typeof file !== 'string' || file.length === 0) {
    return res.status(400).json({ success: false, error: 'Base64 file string not provided' });
  }

  let workDir;
  try {
    // One private folder per request: concurrent uploads can never collide
    // on a timestamp-based name, and cleanup is a single recursive delete.
    workDir = await fs.promises.mkdtemp(path.join(uploadDir, 'job-'));

    // Save base64 → PDF
    const pdfPath = path.join(workDir, 'input.pdf');
    await fs.promises.writeFile(pdfPath, Buffer.from(file, 'base64'));

    // Convert PDF → PNG (first page only)
    await convert(pdfPath, {
      format: 'png',
      out_dir: workDir,
      out_prefix: 'page',
      page: 1,
      scale: 1024
    });

    // poppler may zero-pad the page number for long documents
    // (page-1.png, page-01.png, ...), so look the file up instead of guessing.
    const entries = await fs.promises.readdir(workDir);
    const pngName = entries.find((name) => name.startsWith('page-') && name.endsWith('.png'));
    if (!pngName) {
      throw new Error('PDF conversion produced no image');
    }

    // OCR with Tesseract (Indonesian + English)
    const { data: { text } } = await Tesseract.recognize(path.join(workDir, pngName), 'ind+eng');

    res.json({ success: true, text });
  } catch (error) {
    res.status(500).json({ success: false, error: error.message });
  } finally {
    // Always remove temporary files, including when conversion or OCR failed.
    // Cleanup is best-effort: the response has already been sent.
    if (workDir) {
      await fs.promises
        .rm(workDir, { recursive: true, force: true })
        .catch((cleanupError) => console.error('Failed to remove temp folder:', cleanupError.message));
    }
  }
});
|
| 47 |
+
|
| 48 |
+
// Hugging Face Spaces serves the app on port 7860; PORT can override it.
const PORT = process.env.PORT || 7860;

/** Logs the address once the HTTP server is accepting connections. */
function announceReady() {
  console.log(`🚀 OCR server running on http://0.0.0.0:${PORT}`);
}

// Bind on all interfaces so the server is reachable from outside the container.
app.listen(PORT, "0.0.0.0", announceReady);
|