Spaces:
Running
Running
update
Browse files- Dockerfile +5 -0
- endpoints/antibot.js +161 -176
- endpoints/ocr.py +187 -0
- package.json +1 -0
Dockerfile
CHANGED
|
@@ -1,9 +1,11 @@
|
|
| 1 |
FROM node:20-slim
|
| 2 |
|
|
|
|
| 3 |
RUN apt update && apt install -y \
|
| 4 |
wget gnupg ca-certificates xvfb \
|
| 5 |
fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 \
|
| 6 |
libatk1.0-0 libxss1 libnss3 libxcomposite1 libxdamage1 libxrandr2 libgbm1 \
|
|
|
|
| 7 |
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
|
| 8 |
&& apt install -y ./google-chrome-stable_current_amd64.deb \
|
| 9 |
&& rm google-chrome-stable_current_amd64.deb
|
|
@@ -13,6 +15,9 @@ WORKDIR /app
|
|
| 13 |
RUN mkdir -p /app/endpoints && \
|
| 14 |
mkdir -p /app/cache
|
| 15 |
|
|
|
|
|
|
|
|
|
|
| 16 |
COPY package*.json ./
|
| 17 |
RUN npm install
|
| 18 |
|
|
|
|
| 1 |
FROM node:20-slim
|
| 2 |
|
| 3 |
+
# Install system dependencies, including Python and OpenCV
|
| 4 |
RUN apt update && apt install -y \
|
| 5 |
wget gnupg ca-certificates xvfb \
|
| 6 |
fonts-liberation libappindicator3-1 libasound2 libatk-bridge2.0-0 \
|
| 7 |
libatk1.0-0 libxss1 libnss3 libxcomposite1 libxdamage1 libxrandr2 libgbm1 \
|
| 8 |
+
python3 python3-pip python3-opencv tesseract-ocr tesseract-ocr-eng \
|
| 9 |
&& wget https://dl.google.com/linux/direct/google-chrome-stable_current_amd64.deb \
|
| 10 |
&& apt install -y ./google-chrome-stable_current_amd64.deb \
|
| 11 |
&& rm google-chrome-stable_current_amd64.deb
|
|
|
|
| 15 |
RUN mkdir -p /app/endpoints && \
|
| 16 |
mkdir -p /app/cache
|
| 17 |
|
| 18 |
+
# Install Python dependencies
|
| 19 |
+
# NOTE(review): assumes the node:20-slim base is Debian bookworm, whose system
# Python is marked "externally managed" (PEP 668); without this opt-in a bare
# `pip3 install` into the system interpreter is refused. Older pip versions
# ignore the variable, so it is harmless there.
ENV PIP_BREAK_SYSTEM_PACKAGES=1
RUN pip3 install pytesseract pillow numpy opencv-python
|
| 20 |
+
|
| 21 |
COPY package*.json ./
|
| 22 |
RUN npm install
|
| 23 |
|
endpoints/antibot.js
CHANGED
|
@@ -1,193 +1,178 @@
|
|
| 1 |
-
const
|
| 2 |
-
const
|
| 3 |
-
const path = require("path");
|
| 4 |
-
const sharp = require("sharp");
|
| 5 |
-
|
| 6 |
-
const WORD_TO_NUM = {
|
| 7 |
-
zero:"0",oh:"0",one:"1",won:"1",two:"2",to:"2",too:"2",three:"3",tree:"3",
|
| 8 |
-
four:"4",for:"4",five:"5",six:"6",seven:"7",eight:"8",ate:"8",nine:"9",ten:"10"
|
| 9 |
-
};
|
| 10 |
-
|
| 11 |
-
const LEET_REPLACE = {
|
| 12 |
-
"4":"a","3":"e","1":"i","0":"o","5":"s","7":"t","@":"a","$":"s","|":"l"
|
| 13 |
-
};
|
| 14 |
|
| 15 |
const buf = b => Buffer.from(b.replace(/^data:image\/\w+;base64,/,""),"base64");
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
function wordToDigit(t){
|
| 22 |
-
if(!t)return"";
|
| 23 |
-
t=t.toLowerCase().trim();
|
| 24 |
-
if(/^\d+$/.test(t))return t;
|
| 25 |
-
const c=t.replace(/[^a-z0-9]/gi,"");
|
| 26 |
-
const f=fixLeetToText(c);
|
| 27 |
-
if(WORD_TO_NUM[f]!==undefined)return WORD_TO_NUM[f];
|
| 28 |
-
const d=t.match(/\d+/);
|
| 29 |
-
if(d)return d[0];
|
| 30 |
-
return"";
|
| 31 |
-
}
|
| 32 |
-
|
| 33 |
-
function normalizeExpression(raw){
|
| 34 |
-
if(!raw)return"";
|
| 35 |
-
let s=raw.toLowerCase().replace(/\s+/g,"");
|
| 36 |
-
s=s.replace(/[×xX·•]/g,"*");
|
| 37 |
-
const parts=s.split(/([+\-*/()])/).filter(p=>p!=="");
|
| 38 |
-
return parts.map(p=>{
|
| 39 |
-
if(/^[+\-*/()]+$/.test(p))return p;
|
| 40 |
-
let num=wordToDigit(p);
|
| 41 |
-
if(num)return num;
|
| 42 |
-
const tf=fixLeetToText(p).replace(/[^a-z0-9]/g,"");
|
| 43 |
-
num=WORD_TO_NUM[tf]??"";
|
| 44 |
-
if(num)return num;
|
| 45 |
-
const dig=p.match(/\d+/);
|
| 46 |
-
if(dig)return dig[0];
|
| 47 |
-
return p;
|
| 48 |
-
}).join("");
|
| 49 |
-
}
|
| 50 |
-
|
| 51 |
-
function safeEvalExpression(expr){
|
| 52 |
-
if(!expr)return null;
|
| 53 |
-
const n=normalizeExpression(expr);
|
| 54 |
-
if(!/^[0-9+\-*/().]+$/.test(n))return null;
|
| 55 |
-
try{
|
| 56 |
-
const v=Function(`"use strict";return(${n});`)();
|
| 57 |
-
if(typeof v==="number"&&isFinite(v)){
|
| 58 |
-
if(Math.abs(v-Math.round(v))<1e-9)return String(Math.round(v));
|
| 59 |
-
return String(v);
|
| 60 |
}
|
| 61 |
-
return null;
|
| 62 |
-
}catch(e){return null;}
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
async function writeTempAndRecognize(b,opt={}){
|
| 66 |
-
const tmp=path.join(__dirname,"tmp_"+Date.now()+"_"+Math.random()+".png");
|
| 67 |
-
fs.writeFileSync(tmp,b);
|
| 68 |
-
try{
|
| 69 |
-
const r=await Tesseract.recognize(tmp,"eng",opt);
|
| 70 |
-
fs.unlinkSync(tmp);
|
| 71 |
-
return r.data.text||"";
|
| 72 |
-
}catch(e){
|
| 73 |
-
if(fs.existsSync(tmp))fs.unlinkSync(tmp);
|
| 74 |
-
return"";
|
| 75 |
-
}
|
| 76 |
-
}
|
| 77 |
-
|
| 78 |
-
async function ocrDigit(b){
|
| 79 |
-
const t=await writeTempAndRecognize(b,{
|
| 80 |
-
tessedit_char_whitelist:"0123456789Il!|OoZSTBGg",
|
| 81 |
-
tessedit_pageseg_mode:"7",
|
| 82 |
-
tessedit_ocr_engine_mode:"1"
|
| 83 |
-
});
|
| 84 |
-
const m={"I":"1","l":"1","|":"1","!":"1","O":"0","o":"0","Q":"0","Z":"2","z":"2",
|
| 85 |
-
"S":"5","s":"5","T":"7","t":"7","B":"8","b":"8","G":"6","g":"9"};
|
| 86 |
-
return t.split("").map(c=>m[c]??c).join("").replace(/[^0-9]/g,"");
|
| 87 |
-
}
|
| 88 |
|
| 89 |
-
async
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
if(d)return d[0];
|
| 122 |
-
return"";
|
| 123 |
-
}).filter(Boolean);
|
| 124 |
-
|
| 125 |
-
const botResults=[];
|
| 126 |
-
for(const b of data.bots){
|
| 127 |
-
const bbuf=buf(b.img);
|
| 128 |
-
const prep=await preprocessForBot(bbuf);
|
| 129 |
-
let v=await ocrDigit(prep);
|
| 130 |
-
if(!v){
|
| 131 |
-
const g=await ocrGeneric(prep);
|
| 132 |
-
if(g){
|
| 133 |
-
const ev=safeEvalExpression(g);
|
| 134 |
-
if(ev!==null)v=ev;
|
| 135 |
-
else v=wordToDigit(g)||fixLeetToText(g).replace(/[^0-9]/g,"");
|
| 136 |
}
|
| 137 |
-
|
| 138 |
-
botResults.push({id:b.id,raw:v||""});
|
| 139 |
}
|
| 140 |
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
}
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
}
|
| 171 |
-
}
|
| 172 |
-
if(f){result.push(f);used.add(f);}
|
| 173 |
-
else result.push(null);
|
| 174 |
}
|
| 175 |
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
}
|
| 185 |
-
|
|
|
|
| 186 |
}
|
|
|
|
| 187 |
|
| 188 |
-
|
| 189 |
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
|
|
|
| 193 |
};
|
|
|
|
| 1 |
+
const { spawn } = require('child_process');
|
| 2 |
+
const path = require('path');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
|
| 4 |
/**
 * Decodes a base64 image string into a Buffer.
 * An optional leading `data:image/<type>;base64,` prefix is stripped first.
 *
 * @param {string} b - Base64 payload, with or without a data-URI prefix.
 * @returns {Buffer} The decoded bytes.
 */
const buf = (b) => {
  const payload = b.replace(/^data:image\/\w+;base64,/,"");
  return Buffer.from(payload, "base64");
};
|
| 5 |
|
| 6 |
+
/**
 * Solves "antibot" image challenges: the question image ("soal") and every
 * candidate image ("bot") are OCR'd by the Python helper `ocr.py` that sits
 * next to this module, and the recognised texts are paired up by
 * Levenshtein similarity.
 */
class SimpleAntibot {
  constructor() {
    // Absolute path of the Python OCR helper spawned for every image.
    this.pythonScript = path.join(__dirname, 'ocr.py');
  }

  /**
   * Extracts the `text` field from the helper's stdout.
   *
   * The whole output is parsed first; if that fails (for example because a
   * diagnostic line was printed before the JSON document) the last non-empty
   * line is parsed instead.
   *
   * @param {string} stdout - Everything the helper wrote to stdout.
   * @returns {string} The recognised text, or '' when the field is empty/absent.
   * @throws {Error} When no JSON document can be parsed from the output.
   */
  parseOcrOutput(stdout) {
    const trimmed = String(stdout).trim();
    let parsed;
    try {
      parsed = JSON.parse(trimmed);
    } catch (wholeOutputError) {
      const lastLine = trimmed
        .split(/\r?\n/)
        .filter((line) => line.trim() !== '')
        .pop();
      if (lastLine === undefined) throw wholeOutputError;
      parsed = JSON.parse(lastLine);
    }
    return parsed.text || '';
  }

  /**
   * Runs the Python OCR helper on one image.
   *
   * NOTE(review): the image is handed over as a base64 command-line argument,
   * so very large images may exceed the operating system's argument-size
   * limit; in that case the returned promise rejects.
   *
   * @param {Buffer} imageBuffer - Raw image bytes.
   * @param {string} [type='soal'] - 'soal' for the question image, 'bot' for a candidate.
   * @returns {Promise<string>} The text reported by the helper ('' when none).
   * @throws {Error} (as a rejection) when the process cannot be started, exits
   *   with a non-zero code, or prints output that is not valid JSON.
   */
  async runPythonOCR(imageBuffer, type = 'soal') {
    return new Promise((resolve, reject) => {
      const base64Image = imageBuffer.toString('base64');
      const pythonProcess = spawn('python3', [this.pythonScript, base64Image, type]);

      let result = '';
      let error = '';

      pythonProcess.stdout.on('data', (data) => {
        result += data.toString();
      });

      pythonProcess.stderr.on('data', (data) => {
        error += data.toString();
      });

      // Without this listener a missing interpreter raises an unhandled
      // 'error' event and the promise would never settle.
      pythonProcess.on('error', (spawnError) => {
        reject(new Error(`Failed to start Python process: ${spawnError.message}`));
      });

      pythonProcess.on('close', (code) => {
        if (code !== 0) {
          reject(new Error(`Python process failed: ${error}`));
          return;
        }
        try {
          resolve(this.parseOcrOutput(result));
        } catch (e) {
          reject(new Error(`JSON parse error: ${e}`));
        }
      });
    });
  }

  /**
   * Levenshtein edit distance between two strings.
   *
   * @param {string} a
   * @param {string} b
   * @returns {number} Number of single-character edits; when either argument
   *   is empty/missing, the length of the other one.
   */
  levenshtein(a, b) {
    if (!a || !b) return Math.max(a?.length || 0, b?.length || 0);

    const aLen = a.length;
    const bLen = b.length;
    const matrix = [];

    // First column / first row: distance from the empty prefix.
    for (let i = 0; i <= bLen; i++) matrix[i] = [i];
    for (let j = 0; j <= aLen; j++) matrix[0][j] = j;

    for (let i = 1; i <= bLen; i++) {
      for (let j = 1; j <= aLen; j++) {
        matrix[i][j] = b[i - 1] === a[j - 1]
          ? matrix[i - 1][j - 1]
          : Math.min(matrix[i - 1][j - 1] + 1, matrix[i][j - 1] + 1, matrix[i - 1][j] + 1);
      }
    }
    return matrix[bLen][aLen];
  }

  /**
   * Normalised similarity in [0, 1] derived from the edit distance.
   *
   * @param {string} a
   * @param {string} b
   * @returns {number} 1 for identical strings, 0 when either is empty/missing.
   */
  similarity(a, b) {
    if (!a || !b) return 0;
    const dist = this.levenshtein(a, b);
    const maxLen = Math.max(a.length, b.length);
    return maxLen === 0 ? 1 : 1 - dist / maxLen;
  }

  /**
   * Solves one challenge.
   *
   * @param {{main: string, bots: Array<{id: *, img: string}>}} data - Base64
   *   question image plus the candidate images with their ids.
   * @returns {Promise<{soal: string[], soalLeet: string[], botResults: Array<{id: *, value: string}>, result: Array}>}
   *   `result` holds, per question part, the id of the chosen candidate (or
   *   null when no candidate is left). On any failure all four arrays are empty.
   */
  async solveAntiBot(data) {
    try {
      // Question image: one OCR call, then split into (up to) three parts.
      const soalBuf = buf(data.main);
      const soalText = await this.runPythonOCR(soalBuf, 'soal');
      const soalParts = this.splitSoalText(soalText);
      // The helper already returns leet-normalised text, so this is a plain copy.
      const soalLeet = [...soalParts];

      // Candidate images, processed one after another.
      const botResults = [];
      for (const bot of data.bots) {
        const botBuf = buf(bot.img);
        const botLeet = await this.runPythonOCR(botBuf, 'bot');
        botResults.push({
          id: bot.id,
          value: botLeet
        });
      }

      const result = [];
      const usedBots = new Set();

      // Greedy matching: each question part takes its most similar unused candidate.
      for (const soal of soalLeet) {
        let bestBot = null;
        let bestScore = 0;

        for (const bot of botResults) {
          if (usedBots.has(bot.id) || !bot.value) continue;

          const score = this.similarity(soal, bot.value);
          if (score > bestScore) {
            bestScore = score;
            bestBot = bot.id;
          }
        }

        // Compare against null explicitly: a candidate id of 0 is a valid match.
        if (bestBot !== null && bestScore >= 0.3) {
          result.push(bestBot);
          usedBots.add(bestBot);
        } else {
          result.push(null);
        }
      }

      // Fill the remaining gaps with the first candidates that are still unused.
      for (let i = 0; i < result.length; i++) {
        if (result[i] !== null) continue;
        for (const bot of botResults) {
          if (!usedBots.has(bot.id)) {
            result[i] = bot.id;
            usedBots.add(bot.id);
            break;
          }
        }
      }

      return {
        soal: soalParts,        // question parts as returned by OCR
        soalLeet: soalLeet,     // same parts (already leet-normalised)
        botResults: botResults, // OCR value per candidate
        result: result          // chosen candidate id per question part
      };

    } catch (error) {
      console.error('Antibot error:', error);
      return {
        soal: [],
        soalLeet: [],
        botResults: [],
        result: []
      };
    }
  }

  /**
   * Splits the OCR text of the question into exactly three parts.
   *
   * @param {string} text - OCR text of the question image.
   * @returns {string[]} Three strings: the first three separator-delimited
   *   tokens when there are at least three, otherwise the raw text cut into
   *   three equally sized slices (trailing slices may be empty).
   */
  splitSoalText(text) {
    if (!text) return ['', '', ''];

    // Try the common separators first: comma, semicolon, pipe or whitespace.
    const separators = /[,;|]|\s+/;
    const parts = text.split(separators).filter(part => part.trim());

    if (parts.length >= 3) {
      return parts.slice(0, 3).map(p => p.trim());
    }

    // Fewer than three tokens: slice the text into three pieces by length.
    const result = [];
    const partLength = Math.ceil(text.length / 3);

    for (let i = 0; i < 3; i++) {
      const start = i * partLength;
      const end = (i + 1) * partLength;
      result.push(text.substring(start, end).trim());
    }

    return result;
  }
}
|
| 171 |
|
| 172 |
+
module.exports = SimpleAntibot;

/**
 * Function-style entry point kept for compatibility: builds a fresh
 * SimpleAntibot and forwards the challenge data to its solveAntiBot method.
 */
module.exports.solveAntiBot = async function (data) {
  const solver = new SimpleAntibot();
  const outcome = await solver.solveAntiBot(data);
  return outcome;
};
|
endpoints/ocr.py
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
import cv2
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pytesseract
|
| 5 |
+
import re
|
| 6 |
+
import sys
|
| 7 |
+
import json
|
| 8 |
+
import base64
|
| 9 |
+
from PIL import Image
|
| 10 |
+
import io
|
| 11 |
+
|
| 12 |
+
class SimpleOCR:
    """OCR helper for challenge images.

    Images are preprocessed with OpenCV, read with Tesseract, and the
    recognised text is normalised: arithmetic expressions are evaluated,
    English number words are mapped to digits, and anything else is
    converted to leet speak.
    """

    def __init__(self):
        # English number words (plus common homophones) -> digit string.
        self.word_to_num = {
            'zero': '0', 'oh': '0', 'one': '1', 'won': '1',
            'two': '2', 'to': '2', 'too': '2', 'three': '3', 'tree': '3',
            'four': '4', 'for': '4', 'five': '5', 'six': '6',
            'seven': '7', 'eight': '8', 'ate': '8', 'nine': '9', 'ten': '10'
        }

        # Letter/symbol -> leet digit.
        self.leet_map = {
            'a': '4', 'e': '3', 'g': '9', 'i': '1', 'l': '1',
            'o': '0', 's': '5', 't': '7', 'b': '8', 'z': '2',
            '@': '4', '$': '5', '&': '8'
        }

    def preprocess_image(self, image_bytes, image_type='soal'):
        """Prepare an image for OCR.

        Args:
            image_bytes: Encoded image bytes.
            image_type: 'soal' (question) or 'bot' (candidate); selects the
                maximum width the image is scaled down to (1000 / 600 px).

        Returns:
            The binarised image as a numpy array, or the untouched
            ``image_bytes`` when decoding or any processing step fails.
        """
        try:
            nparr = np.frombuffer(image_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

            if img is None:
                return image_bytes

            gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            denoised = cv2.medianBlur(gray, 3)

            # CLAHE for contrast enhancement.
            clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
            enhanced = clahe.apply(denoised)

            # Otsu threshold to a black/white image.
            _, thresh = cv2.threshold(enhanced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

            # Only ever scale down, keeping the aspect ratio.
            height, width = thresh.shape
            if image_type == 'soal' and width > 1000:
                scale = 1000 / width
                thresh = cv2.resize(thresh, (1000, int(height * scale)))
            elif image_type == 'bot' and width > 600:
                scale = 600 / width
                thresh = cv2.resize(thresh, (600, int(height * scale)))

            return thresh

        except Exception as e:
            # stdout carries the JSON result of the script, so diagnostics
            # must go to stderr.
            print(f"Preprocessing error: {e}", file=sys.stderr)
            return image_bytes

    def leetize(self, text):
        """Convert text to leet speak; characters without a mapping are kept as-is."""
        if not text:
            return ""
        return ''.join([self.leet_map.get(c.lower(), c) for c in text])

    def words_to_number(self, text):
        """Map a single English number word to its digit string.

        Returns '' when the (cleaned) text is not a known number word.
        """
        if not text:
            return ""
        text_lower = text.lower().strip()

        # Exact match.
        if text_lower in self.word_to_num:
            return self.word_to_num[text_lower]

        # Drop punctuation/whitespace and try again.
        clean_text = re.sub(r'[^a-z0-9]', '', text_lower)
        if clean_text in self.word_to_num:
            return self.word_to_num[clean_text]

        return ""

    def safe_eval_math(self, text):
        """Evaluate a simple arithmetic expression found in ``text``.

        Number words and the operator words plus/minus/times are translated
        first and whitespace is ignored. Only +, -, *, / and parentheses on
        numeric literals are evaluated, via a whitelisted AST walk rather than
        ``eval``. A candidate must contain at least one binary operator, so a
        plain number or a leet word such as "c47" is not treated as math.

        Returns:
            The result as a string (integral results without a decimal part),
            or None when no valid expression is found.
        """
        import ast
        import operator

        if not text:
            return None

        binary_ops = {
            ast.Add: operator.add, ast.Sub: operator.sub,
            ast.Mult: operator.mul, ast.Div: operator.truediv,
        }
        unary_ops = {ast.UAdd: operator.pos, ast.USub: operator.neg}

        def evaluate(node):
            if isinstance(node, ast.Expression):
                return evaluate(node.body)
            if isinstance(node, ast.Constant) and type(node.value) in (int, float):
                return node.value
            if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
                return binary_ops[type(node.op)](evaluate(node.left), evaluate(node.right))
            if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
                return unary_ops[type(node.op)](evaluate(node.operand))
            raise ValueError("unsupported expression")

        # Replace number words with digits.
        for word, num in self.word_to_num.items():
            text = re.sub(r'\b' + word + r'\b', num, text, flags=re.IGNORECASE)

        # Replace math symbols and operator words.
        text = text.lower()
        text = text.replace('×', '*').replace('÷', '/').replace('x', '*')
        text = text.replace('plus', '+').replace('minus', '-').replace('times', '*')

        # Whitespace between operands and operators must not split the expression.
        compact = re.sub(r'\s+', '', text)

        for candidate in re.findall(r'[\d+\-*/.()]+', compact):
            # Require "<digit> <operator> <operand>"; bare numbers are not math.
            if not re.search(r'\d\)*[+\-*/][+\-(]*[\d.]', candidate):
                continue
            try:
                result = evaluate(ast.parse(candidate, mode='eval'))
                return str(int(result)) if result == int(result) else str(result)
            except (SyntaxError, ValueError, ArithmeticError, RecursionError):
                continue
        return None

    def ocr_image(self, image_bytes, image_type='soal'):
        """Run Tesseract on an image and return the stripped text ('' on failure)."""
        try:
            processed_img = self.preprocess_image(image_bytes, image_type)

            config = '--oem 1 --psm 8'  # single word, used for bot images
            if image_type == 'soal':
                config = '--oem 1 --psm 6'  # block of text, used for the question

            if isinstance(processed_img, np.ndarray):
                pil_img = Image.fromarray(processed_img)
            else:
                # Preprocessing fell back to the raw bytes.
                pil_img = Image.open(io.BytesIO(processed_img))

            text = pytesseract.image_to_string(pil_img, config=config)
            return text.strip()

        except Exception as e:
            # Keep stdout clean for the JSON result.
            print(f"OCR error: {e}", file=sys.stderr)
            return ""

    def _normalize(self, text):
        """Math result if any, else number word, else leet form of ``text``."""
        math_result = self.safe_eval_math(text)
        if math_result:
            return math_result

        word_result = self.words_to_number(text)
        if word_result:
            return word_result

        return self.leetize(text)

    def process_soal(self, image_bytes):
        """OCR the question image and return its normalised text."""
        return self._normalize(self.ocr_image(image_bytes, 'soal'))

    def process_bot(self, image_bytes):
        """OCR a candidate (bot) image and return its normalised text."""
        return self._normalize(self.ocr_image(image_bytes, 'bot'))
|
| 160 |
+
|
| 161 |
+
def main():
    """Command-line entry point.

    Expects a base64-encoded image as the first argument and an optional image
    type ('soal' by default; any other value is handled as a bot image) as the
    second. Prints one JSON object with a 'text' key to stdout; when
    processing raises, the object also carries an 'error' key and 'text' is
    empty. Exits with status 1 when the image argument is missing.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python ocr.py <base64_image> [soal|bot]")
        sys.exit(1)

    base64_image = argv[1]
    image_type = 'soal' if len(argv) <= 2 else argv[2]

    try:
        ocr = SimpleOCR()
        image_bytes = base64.b64decode(base64_image)

        # Pick the processing routine for the requested image type.
        handler = ocr.process_soal if image_type == 'soal' else ocr.process_bot
        print(json.dumps({'text': handler(image_bytes)}))

    except Exception as e:
        print(json.dumps({'error': str(e), 'text': ''}))
|
| 185 |
+
|
| 186 |
+
if __name__ == '__main__':
|
| 187 |
+
main()
|
package.json
CHANGED
|
@@ -12,6 +12,7 @@
|
|
| 12 |
"sharp": "^0.32.0",
|
| 13 |
"puppeteer-real-browser": "^1.4.0",
|
| 14 |
"axios": "^1.9.0",
|
|
|
|
| 15 |
"tesseract.js": "^5.0.3",
|
| 16 |
"jimp": "^0.22.10"
|
| 17 |
},
|
|
|
|
| 12 |
"sharp": "^0.32.0",
|
| 13 |
"puppeteer-real-browser": "^1.4.0",
|
| 14 |
"axios": "^1.9.0",
|
| 15 |
+
"child_process": "*",
|
| 16 |
"tesseract.js": "^5.0.3",
|
| 17 |
"jimp": "^0.22.10"
|
| 18 |
},
|