Fourstore committed on
Commit
ca62693
·
verified ·
1 Parent(s): e5808b7

Update endpoints/antibot.js

Browse files
Files changed (1) hide show
  1. endpoints/antibot.js +173 -122
endpoints/antibot.js CHANGED
@@ -1,19 +1,38 @@
1
  const Tesseract = require("tesseract.js");
2
  const fs = require("fs");
3
  const path = require("path");
4
- const cv = require('opencv4nodejs');
5
 
6
  const buf = b => Buffer.from(b.replace(/^data:image\/\w+;base64,/, ""), "base64");
7
 
8
  // ==========================
9
- // LEET MAP
10
  // ==========================
11
  const LEET = {
12
- a:"4", e:"3", g:"9", i:"1", l:"1", o:"0", s:"5", t:"7", b:"8", z:"2",
13
- "@":"4", "$":"5", "&":"8"
14
  };
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  function leetize(str) {
 
 
 
 
 
17
  return str
18
  .toLowerCase()
19
  .split("")
@@ -21,73 +40,78 @@ function leetize(str) {
21
  .join("");
22
  }
23
 
 
 
 
 
 
 
 
24
  // ==========================
25
- // OPENCV PREPROCESSING
26
  // ==========================
27
- async function preprocessWithOpenCV(buffer) {
28
- try {
29
- // Decode image dari buffer
30
- let img = cv.imdecode(buffer);
31
-
32
- // 1. Resize untuk improve quality
33
- img = img.resize(new cv.Size(0, 0), 2, 2, cv.INTER_CUBIC);
34
-
35
- // 2. Convert ke grayscale
36
- let gray = img.bgrToGray();
37
-
38
- // 3. Gaussian blur untuk noise reduction
39
- let blurred = gray.gaussianBlur(new cv.Size(3, 3), 0);
40
-
41
- // 4. Adaptive threshold (lebih baik dari global threshold)
42
- let thresh = blurred.adaptiveThreshold(
43
- 255,
44
- cv.ADAPTIVE_THRESH_GAUSSIAN_C,
45
- cv.THRESH_BINARY,
46
- 11,
47
- 2
48
- );
49
-
50
- // 5. Morphological operations untuk bersihkan noise
51
- let kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(2, 2));
52
- let cleaned = thresh.morphologyEx(kernel, cv.MORPH_CLOSE);
53
-
54
- // 6. Dilate untuk teks lebih tebal
55
- kernel = cv.getStructuringElement(cv.MORPH_RECT, new cv.Size(1, 1));
56
- let dilated = cleaned.dilate(kernel);
57
-
58
- // Encode kembali ke buffer
59
- return cv.imencode('.png', dilated).toString('base64');
60
-
61
- } catch (error) {
62
- console.error("OpenCV Preprocessing error:", error);
63
- return buffer; // Fallback ke original
64
  }
 
 
 
 
 
 
 
 
65
  }
66
 
67
  // ==========================
68
- // OCR WITH OPENCV PREPROCESSING
69
  // ==========================
70
- async function ocrWithOpenCV(buffer, mode = "soal") {
71
- // Preprocess dengan OpenCV
72
- const processedBase64 = await preprocessWithOpenCV(buffer);
73
- const processedBuffer = Buffer.from(processedBase64, 'base64');
74
-
75
- const tmp = path.join(__dirname, "opencv_processed_" + Date.now() + ".png");
76
- fs.writeFileSync(tmp, processedBuffer);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  try {
79
- const config = {
 
80
  tessedit_pageseg_mode: mode === "soal" ? "7" : "8",
81
  tessedit_ocr_engine_mode: "1"
82
- };
83
-
84
- if (mode === "soal") {
85
- config.tessedit_char_whitelist = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@$&";
86
- } else {
87
- config.tessedit_char_whitelist = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789";
88
- }
89
-
90
- const r = await Tesseract.recognize(tmp, "eng", config);
91
  fs.unlinkSync(tmp);
92
  return r.data.text.trim();
93
  } catch (error) {
@@ -98,46 +122,56 @@ async function ocrWithOpenCV(buffer, mode = "soal") {
98
  }
99
 
100
  // ==========================
101
- // TEXT CLEANING
102
  // ==========================
103
- function cleanText(text) {
104
- return text.replace(/[^A-Za-z0-9]/g, '');
105
- }
106
 
107
- // ==========================
108
- // LEVENSHTEIN SIMILARITY
109
- // ==========================
110
- function levenshtein(a, b) {
111
- const m = [];
112
- for (let i = 0; i <= a.length; i++) {
113
- m[i] = [i];
114
- for (let j = 1; j <= b.length; j++) {
115
- m[i][j] =
116
- i === 0
117
- ? j
118
- : Math.min(
119
- m[i - 1][j] + 1,
120
- m[i][j - 1] + 1,
121
- m[i - 1][j - 1] + (a[i - 1] === b[j - 1] ? 0 : 1)
122
- );
123
  }
124
  }
125
- return m[a.length][b.length];
 
126
  }
127
 
128
- function similarity(a, b) {
129
- const dist = levenshtein(a, b);
130
- return 1 - dist / Math.max(a.length, b.length);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  }
132
 
133
  // ==========================
134
- // MAIN FUNCTION - OPENCV + LEET
135
  // ==========================
136
  module.exports = async data => {
137
  try {
138
- // SOAL - Pakai OpenCV preprocessing + Leet
139
  const soalImg = buf(data.main);
140
- const soalText = await ocrWithOpenCV(soalImg, "soal");
 
141
 
142
  const soalRaw = soalText
143
  .split(/\s+/)
@@ -145,65 +179,80 @@ module.exports = async data => {
145
  .slice(0, 3);
146
 
147
  const soalLeet = soalRaw.map(leetize);
 
 
 
148
 
149
- // BOT - Pakai OpenCV preprocessing (TANPA LEET)
150
  const botResults = [];
151
  for (const b of data.bots) {
152
- const botImg = buf(b.img);
153
- const botText = await ocrWithOpenCV(botImg, "bot");
 
154
 
155
- const cleanText = cleanText(botText);
 
156
 
157
  botResults.push({
158
  id: b.id,
159
- text: cleanText,
160
- value: cleanText, // Tetap asli, tidak di-leet
161
- originalOCR: botText
162
  });
163
  }
164
 
165
- console.log("=== DEBUG OPENCV ===");
166
- console.log("Soal Raw:", soalRaw);
167
- console.log("Soal Leet:", soalLeet);
168
- console.log("Bot Results:", botResults.map(b => ({ id: b.id, value: b.value })));
169
-
170
- // MATCHING LOGIC
171
  const result = [];
172
  const usedBots = new Set();
173
 
174
- // First pass: exact match atau similarity tinggi
175
- for (const s of soalLeet) {
176
- let bestBot = null;
177
- let bestScore = 0;
178
-
179
- for (const bot of botResults) {
180
- if (usedBots.has(bot.id) || !bot.value) continue;
181
-
182
- const score = similarity(s, bot.value);
183
- console.log(`Matching "${s}" with bot ${bot.id} ("${bot.value}") = ${score}`);
184
-
185
- if (score > bestScore) {
186
- bestScore = score;
187
- bestBot = bot.id;
188
- }
189
  }
190
 
191
- // Threshold lebih rendah karena OpenCV sudah lebih akurat
192
- if (bestBot && bestScore >= 0.2) {
 
 
193
  usedBots.add(bestBot);
194
  result.push(bestBot);
195
  } else {
 
196
  result.push(null);
197
  }
198
  }
199
 
200
- // Auto-fill remaining
 
 
 
 
 
 
 
 
 
 
 
 
201
  for (let i = 0; i < result.length; i++) {
202
  if (result[i] === null) {
203
  for (const bot of botResults) {
204
- if (!usedBots.has(bot.id) && bot.value) {
205
  result[i] = bot.id;
206
  usedBots.add(bot.id);
 
207
  break;
208
  }
209
  }
@@ -213,8 +262,9 @@ module.exports = async data => {
213
  return {
214
  soal: soalRaw,
215
  soalLeet,
216
- botResults,
217
- result
 
218
  };
219
 
220
  } catch (error) {
@@ -223,7 +273,8 @@ module.exports = async data => {
223
  soal: [],
224
  soalLeet: [],
225
  botResults: [],
226
- result: []
 
227
  };
228
  }
229
  };
 
1
  const Tesseract = require("tesseract.js");
2
  const fs = require("fs");
3
  const path = require("path");
4
+ const sharp = require("sharp");
5
 
6
  const buf = b => Buffer.from(b.replace(/^data:image\/\w+;base64,/, ""), "base64");
7
 
8
  // ==========================
9
+ // LEET MAP & UTILITIES
10
  // ==========================
11
  const LEET = {
12
+ a: "4", e: "3", g: "9", i: "1", l: "1", o: "0", s: "5", t: "7", b: "8", z: "2",
13
+ "@": "4", "$": "5", "&": "8"
14
  };
15
 
16
+ const REVERSE_LEET = {
17
+ "4": "a", "3": "e", "9": "g", "1": "i", "0": "o", "5": "s", "7": "t", "8": "b", "2": "z"
18
+ };
19
+
20
/**
 * Reports whether ASCII digits make up at least `threshold` of a string.
 *
 * @param {string} str - Text to inspect. Empty or non-string values yield false.
 * @param {number} [threshold=0.7] - Minimum fraction of characters that must be digits 0-9.
 * @returns {boolean} True when digitCount >= str.length * threshold.
 */
function isMostlyNumbers(str, threshold = 0.7) {
  // Non-strings have no .match(); treat them like empty input instead of throwing.
  if (typeof str !== "string" || str.length === 0) return false;

  const digitCount = (str.match(/[0-9]/g) || []).length;
  // With the default threshold: 70% or more of the characters are digits.
  return digitCount >= str.length * threshold;
}
25
+
26
/**
 * Reports whether a string contains at least one ASCII letter or one of the
 * symbols `@`, `$`, `&`.
 *
 * @param {string} str - Text to inspect.
 * @returns {boolean} True when a matching character is present; false for
 *   non-string input.
 */
function containsAlphabet(str) {
  // RegExp.prototype.test() stringifies its argument, so null/undefined would
  // turn into "null"/"undefined" and match. Only real strings are inspected.
  return typeof str === "string" && /[a-zA-Z@$&]/.test(str);
}
29
+
30
  function leetize(str) {
31
+ if (!str || isMostlyNumbers(str) && !containsAlphabet(str)) {
32
+ // Jika string mostly numbers dan tidak ada alfabet, return as-is
33
+ return str;
34
+ }
35
+
36
  return str
37
  .toLowerCase()
38
  .split("")
 
40
  .join("");
41
  }
42
 
43
/**
 * Strips every character that is not an ASCII letter, a digit, or one of
 * `@`, `$`, `&`.
 *
 * @param {string} text - Raw text to clean; null/undefined yield "".
 * @returns {string} The text with all other characters (whitespace included) removed.
 */
function normalizeText(text) {
  if (text == null) return "";
  // One pass is enough: whitespace is outside the allowed set, so no separate
  // whitespace removal or trim is needed afterwards.
  return String(text).replace(/[^A-Za-z0-9@$&]/g, "");
}
49
+
50
  // ==========================
51
+ // LEVENSHTEIN
52
  // ==========================
53
/**
 * Computes the Levenshtein edit distance between two strings.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} Number of single-character insertions, deletions and
 *   substitutions needed to turn `a` into `b`. When either argument is
 *   falsy, the length of the other one (or 0) is returned.
 */
function levenshtein(a, b) {
  if (!a || !b) {
    const lengthA = a?.length || 0;
    const lengthB = b?.length || 0;
    return Math.max(lengthA, lengthB);
  }

  const rowCount = a.length;
  const colCount = b.length;

  // Row 0 of the distance table: turning "" into b[0..col] costs `col` inserts.
  let previousRow = Array.from({ length: colCount + 1 }, (_, col) => col);

  for (let row = 1; row <= rowCount; row++) {
    // Column 0: turning a[0..row] into "" costs `row` deletions.
    const currentRow = [row];
    for (let col = 1; col <= colCount; col++) {
      const deletion = previousRow[col] + 1;
      const insertion = currentRow[col - 1] + 1;
      const substitution =
        previousRow[col - 1] + (a[row - 1] === b[col - 1] ? 0 : 1);
      currentRow[col] = Math.min(deletion, insertion, substitution);
    }
    previousRow = currentRow;
  }

  return previousRow[colCount];
}
72
+
73
/**
 * Scores how alike two strings are, based on their Levenshtein distance.
 *
 * @param {string} a - First string.
 * @param {string} b - Second string.
 * @returns {number} `1 - distance / max(a.length, b.length)`; 0 when either
 *   argument is empty or missing (two empty strings also score 0).
 */
function similarity(a, b) {
  if (!a || !b) return 0;

  // Both strings are non-empty past the guard, so the longer length is at
  // least 1 and no zero-division check is needed.
  const longest = Math.max(a.length, b.length);
  return 1 - levenshtein(a, b) / longest;
}
79
 
80
  // ==========================
81
+ // OCR PREP
82
  // ==========================
83
/**
 * Runs the main ("soal") image through a sharp pipeline before OCR:
 * resize to width 1000, grayscale, normalize, sharpen (sigma 2), median(2).
 *
 * @param {Buffer} b - Image data handed to `sharp()`.
 * @returns {Promise<Buffer>} Whatever the pipeline's `toBuffer()` resolves to.
 */
async function preprocessSoal(b) {
  const resized = sharp(b).resize({ width: 1000 });
  const contrasted = resized.grayscale().normalize();
  const cleaned = contrasted.sharpen({ sigma: 2 }).median(2);
  return cleaned.toBuffer();
}
92
+
93
/**
 * Runs a bot image through a sharp pipeline before OCR:
 * resize to width 800, grayscale, normalize, sharpen (sigma 1.5), median(1).
 *
 * @param {Buffer} b - Image data handed to `sharp()`.
 * @returns {Promise<Buffer>} Whatever the pipeline's `toBuffer()` resolves to.
 */
async function preprocessBot(b) {
  const resized = sharp(b).resize({ width: 800 });
  const contrasted = resized.grayscale().normalize();
  const cleaned = contrasted.sharpen({ sigma: 1.5 }).median(1);
  return cleaned.toBuffer();
}
102
+
103
+ async function ocr(buf, mode = "soal") {
104
+ const tmp = path.join(__dirname, "tmp_" + Date.now() + ".png");
105
+ fs.writeFileSync(tmp, buf);
106
+
107
+ const whitelist = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789@$&";
108
 
109
  try {
110
+ const r = await Tesseract.recognize(tmp, "eng", {
111
+ tessedit_char_whitelist: whitelist,
112
  tessedit_pageseg_mode: mode === "soal" ? "7" : "8",
113
  tessedit_ocr_engine_mode: "1"
114
+ });
 
 
 
 
 
 
 
 
115
  fs.unlinkSync(tmp);
116
  return r.data.text.trim();
117
  } catch (error) {
 
122
  }
123
 
124
  // ==========================
125
+ // MATCHING STRATEGIES
126
  // ==========================
127
/**
 * Picks the unused bot whose `value` is most similar to the given text.
 *
 * @param {string} soalText - Text to compare against each bot's `value`.
 * @param {Array<{id: *, value: string}>} botResults - Candidate bots.
 * @param {Set} usedBots - Ids of bots that must not be picked again.
 * @param {number} [minScore=0.3] - Lowest similarity that counts as a match.
 * @returns {{bestBot: *, bestScore: number}} Id and score of the winner, or
 *   `{ bestBot: null, bestScore: 0 }` when no candidate qualifies.
 */
function findBestMatch(soalText, botResults, usedBots, minScore = 0.3) {
  const best = { bestBot: null, bestScore: 0 };

  for (const candidate of botResults) {
    // Only bots that are still free and actually have text are scored.
    const isEligible = !usedBots.has(candidate.id) && candidate.value;
    if (!isEligible) continue;

    const score = similarity(soalText, candidate.value);
    const isImprovement = score >= minScore && score > best.bestScore;
    if (isImprovement) {
      best.bestScore = score;
      best.bestBot = candidate.id;
    }
  }

  return best;
}
143
 
144
/**
 * Classifies a list of texts and summarizes the counts.
 *
 * A text containing a letter or one of `@`, `$`, `&` (per `containsAlphabet`)
 * counts as alpha; otherwise it counts as numeric when `isMostlyNumbers`
 * accepts it, and as mixed when neither applies.
 *
 * @param {string[]} texts - Texts to classify.
 * @returns {{total: number, numericCount: number, alphaCount: number,
 *   mixedCount: number, isMostlyNumeric: boolean}} Counts per category, plus
 *   a flag that is true when more than half of the texts are numeric.
 */
function analyzeTextType(texts) {
  const total = texts.length;
  let numericCount = 0;
  let alphaCount = 0;
  let mixedCount = 0;

  for (const entry of texts) {
    if (containsAlphabet(entry)) {
      alphaCount += 1;
    } else if (isMostlyNumbers(entry)) {
      numericCount += 1;
    } else {
      mixedCount += 1;
    }
  }

  return {
    total,
    numericCount,
    alphaCount,
    mixedCount,
    isMostlyNumeric: numericCount > total * 0.5
  };
}
165
 
166
  // ==========================
167
+ // MAIN - IMPROVED MATCHING
168
  // ==========================
169
  module.exports = async data => {
170
  try {
171
+ // SOAL OCR
172
  const soalImg = buf(data.main);
173
+ const soalProcessed = await preprocessSoal(soalImg);
174
+ const soalText = await ocr(soalProcessed, "soal");
175
 
176
  const soalRaw = soalText
177
  .split(/\s+/)
 
179
  .slice(0, 3);
180
 
181
  const soalLeet = soalRaw.map(leetize);
182
+
183
+ // Analisis tipe teks soal
184
+ const soalAnalysis = analyzeTextType(soalRaw);
185
 
186
+ // BOT OCR
187
  const botResults = [];
188
  for (const b of data.bots) {
189
+ const d = buf(b.img);
190
+ const p = await preprocessBot(d);
191
+ const t = await ocr(p, "bot");
192
 
193
+ const clean = normalizeText(t);
194
+ const leet = leetize(clean);
195
 
196
  botResults.push({
197
  id: b.id,
198
+ text: clean,
199
+ value: leet,
200
+ isNumeric: isMostlyNumbers(clean) && !containsAlphabet(clean)
201
  });
202
  }
203
 
204
+ // IMPROVED MATCHING LOGIC
 
 
 
 
 
205
  const result = [];
206
  const usedBots = new Set();
207
 
208
+ console.log("Soal Analysis:", soalAnalysis);
209
+ console.log("Soal Raw:", soalRaw);
210
+ console.log("Soal Leet:", soalLeet);
211
+ console.log("Bot Results:", botResults.map(b => ({ id: b.id, text: b.text, value: b.value, isNumeric: b.isNumeric })));
212
+
213
+ // First pass: cari match terbaik dengan threshold yang disesuaikan
214
+ for (let i = 0; i < soalLeet.length; i++) {
215
+ const currentSoal = soalLeet[i];
216
+ const currentSoalRaw = soalRaw[i];
217
+
218
+ // Adjust threshold berdasarkan tipe konten
219
+ let minScore = 0.3;
220
+ if (isMostlyNumbers(currentSoalRaw) && !containsAlphabet(currentSoalRaw)) {
221
+ minScore = 0.6; // Lebih ketat untuk angka murni
 
222
  }
223
 
224
+ const { bestBot, bestScore } = findBestMatch(currentSoal, botResults, usedBots, minScore);
225
+
226
+ if (bestBot) {
227
+ console.log(`Match found: Soal "${currentSoal}" -> Bot ${bestBot} (score: ${bestScore.toFixed(3)})`);
228
  usedBots.add(bestBot);
229
  result.push(bestBot);
230
  } else {
231
+ console.log(`No good match for: "${currentSoal}", will auto-fill later`);
232
  result.push(null);
233
  }
234
  }
235
 
236
+ // Second pass: auto-fill remaining slots dengan unused bots
237
+ const unusedBots = botResults.filter(bot => !usedBots.has(bot.id) && bot.value);
238
+
239
+ for (let i = 0; i < result.length; i++) {
240
+ if (result[i] === null && unusedBots.length > 0) {
241
+ const bot = unusedBots.shift();
242
+ result[i] = bot.id;
243
+ usedBots.add(bot.id);
244
+ console.log(`Auto-filled slot ${i} with bot ${bot.id}`);
245
+ }
246
+ }
247
+
248
+ // Final fallback: jika masih ada null, isi dengan bot yang tersisa (termasuk yang sudah digunakan jika perlu)
249
  for (let i = 0; i < result.length; i++) {
250
  if (result[i] === null) {
251
  for (const bot of botResults) {
252
+ if (!usedBots.has(bot.id) || result.filter(x => x === bot.id).length === 0) {
253
  result[i] = bot.id;
254
  usedBots.add(bot.id);
255
+ console.log(`Fallback filled slot ${i} with bot ${bot.id}`);
256
  break;
257
  }
258
  }
 
262
  return {
263
  soal: soalRaw,
264
  soalLeet,
265
+ botResults: botResults.map(b => ({ id: b.id, text: b.text, value: b.value })),
266
+ result,
267
+ analysis: soalAnalysis
268
  };
269
 
270
  } catch (error) {
 
273
  soal: [],
274
  soalLeet: [],
275
  botResults: [],
276
+ result: [],
277
+ analysis: {}
278
  };
279
  }
280
  };