doniramdani820 committed on
Commit
de22214
Β·
verified Β·
1 Parent(s): b4e54e7

Upload 4 files

Browse files
Files changed (4) hide show
  1. app.py +428 -0
  2. model55x140.onnx +3 -0
  3. model90x280.onnx +3 -0
  4. requirements.txt +6 -0
app.py ADDED
@@ -0,0 +1,428 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import cv2
3
+ import numpy as np
4
+ import base64
5
+ import onnxruntime as ort
6
+ from PIL import Image, ImageDraw, ImageFont
7
+ import io
8
+ import json
9
+ import logging
10
+ from difflib import SequenceMatcher
11
+
12
+ # Set up logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
+ # Global variables for ONNX models
17
+ title_model = None
18
+ button_model = None
19
+
20
def load_models():
    """Load ONNX models for title and button detection.

    Populates the module-level ``title_model`` and ``button_model``
    sessions; re-raises any load failure so callers can abort startup.
    """
    global title_model, button_model

    try:
        # Title OCR first, then button OCR; either failure aborts the load
        # so the app never runs half-initialized.
        title_model = ort.InferenceSession("model90x280.onnx")
        logger.info("βœ… Title model (model90x280.onnx) loaded successfully")

        button_model = ort.InferenceSession("model55x140.onnx")
        logger.info("βœ… Button model (model55x140.onnx) loaded successfully")
    except Exception as exc:
        logger.error(f"❌ Error loading models: {str(exc)}")
        raise exc
36
+
37
def decode_base64_image(base64_str):
    """Convert base64 string to numpy array.

    Accepts either a bare base64 payload or a full ``data:image/...``
    data URL; always returns an RGB HxWx3 uint8 array.
    """
    try:
        # Strip a "data:image/...;base64," prefix when present.
        if base64_str.startswith('data:image'):
            base64_str = base64_str.split(',')[1]

        raw_bytes = base64.b64decode(base64_str)
        pil_img = Image.open(io.BytesIO(raw_bytes))

        # Force RGB so downstream slicing/resizing sees 3 channels.
        if pil_img.mode != 'RGB':
            pil_img = pil_img.convert('RGB')

        return np.array(pil_img)
    except Exception as exc:
        logger.error(f"❌ Error decoding base64 image: {str(exc)}")
        raise exc
61
+
62
def crop_title_area(image):
    """Return the fixed title region of the captcha image.

    The title occupies a 280x100 px box anchored at (x=0, y=220),
    i.e. rows 220..319 and columns 0..279 of the decoded image.
    """
    try:
        cropped = image[220:320, 0:280]  # numpy slicing is [y1:y2, x1:x2]
        logger.info(f"πŸ“ Title area cropped: {cropped.shape}")
        return cropped
    except Exception as exc:
        logger.error(f"❌ Error cropping title area: {str(exc)}")
        raise exc
76
+
77
def crop_button_areas(image):
    """Slice the answer-button regions out of the captcha image.

    The button grid is 280x320 px; each layout entry is (x, y, w, h)
    relative to the grid's top-left corner, which itself sits at
    y=350 in the full image.  Returns {button_id: crop-array}.
    NOTE(review): the original doc mentioned buttons 1-9 but only
    positions 1-8 are defined — confirm against the actual captcha grid.
    """
    try:
        # (x, y, width, height) for each button, relative to the grid origin.
        layout = {
            1: (0, 0, 140, 60),
            2: (140, 0, 140, 60),
            3: (0, 60, 140, 60),
            4: (140, 60, 140, 50),
            5: (0, 115, 140, 50),
            6: (140, 110, 140, 60),
            7: (0, 170, 140, 50),
            8: (140, 170, 140, 50),
        }

        # Vertical offset of the grid inside the full image (adjust as needed).
        grid_top = 350

        crops = {}
        for btn_id, (x, y, w, h) in layout.items():
            top = grid_top + y
            crops[btn_id] = image[top:top + h, x:x + w]
            logger.info(f"πŸ”² Button {btn_id} cropped: {crops[btn_id].shape}")

        return crops
    except Exception as exc:
        logger.error(f"❌ Error cropping button areas: {str(exc)}")
        raise exc
115
+
116
def preprocess_for_ocr(image, target_size):
    """Resize, grayscale and normalize a crop into an NCHW float32 tensor.

    ``target_size`` is (width, height) as cv2.resize expects; the returned
    tensor has shape (1, 1, height, width) with values scaled to [0, 1].
    """
    try:
        resized = cv2.resize(image, target_size)

        # Collapse RGB to a single channel; pass grayscale input through.
        if len(resized.shape) == 3:
            gray = cv2.cvtColor(resized, cv2.COLOR_RGB2GRAY)
        else:
            gray = resized

        scaled = gray.astype(np.float32) / 255.0

        # (H, W) -> (batch=1, channel=1, H, W); note target_size is (W, H).
        return scaled.reshape(1, 1, target_size[1], target_size[0])
    except Exception as exc:
        logger.error(f"❌ Error preprocessing image: {str(exc)}")
        raise exc
138
+
139
def predict_title(title_crop):
    """Run the title OCR model on the cropped title area and return its text.

    Raises ValueError when load_models() has not populated ``title_model``.
    """
    try:
        if title_model is None:
            raise ValueError("Title model not loaded")

        # Model expects 280x90 (width x height, per the model file name).
        tensor = preprocess_for_ocr(title_crop, (280, 90))

        feed = {title_model.get_inputs()[0].name: tensor}
        raw_outputs = title_model.run(None, feed)

        # Decode the raw network output into a string; the exact decoding
        # depends on the model's output format (see process_model_output).
        text = process_model_output(raw_outputs[0])

        logger.info(f"πŸ”€ Title prediction: '{text}'")
        return text
    except Exception as exc:
        logger.error(f"❌ Error predicting title: {str(exc)}")
        raise exc
163
+
164
def predict_button_text(button_crop):
    """Run the button OCR model on a cropped button area and return its text.

    Raises ValueError when load_models() has not populated ``button_model``.
    """
    try:
        if button_model is None:
            raise ValueError("Button model not loaded")

        # Model expects 140x55 (width x height, per the model file name).
        tensor = preprocess_for_ocr(button_crop, (140, 55))

        feed = {button_model.get_inputs()[0].name: tensor}
        raw_outputs = button_model.run(None, feed)

        return process_model_output(raw_outputs[0])
    except Exception as exc:
        logger.error(f"❌ Error predicting button text: {str(exc)}")
        raise exc
186
+
187
def process_model_output(output):
    """
    Decode an ONNX model output array into text.

    Two layouts are supported:
      * (batch, timesteps, classes): per-timestep character probabilities,
        decoded greedily (argmax per timestep) with consecutive duplicate
        characters collapsed, CTC-style.
      * (batch, classes): a single character prediction.

    Any class index outside the known alphabet is treated as a blank and
    skipped.  Returns "" for unrecognized layouts or on any error.
    NOTE(review): placeholder decoding — adjust to the real model's output
    format (alphabet order, blank index) once it is known.
    """
    # Index -> character mapping; defined once instead of per-branch.
    chars = "abcdefghijklmnopqrstuvwxyz0123456789"
    try:
        if isinstance(output, np.ndarray):
            if len(output.shape) == 3:  # sequence of character probabilities
                # Greedy decode: argmax per timestep, dropping out-of-alphabet
                # (blank) indices.  Vectorized argmax + join replaces the
                # original quadratic string concatenation.
                indices = np.argmax(output[0], axis=-1)
                text = "".join(chars[i] for i in indices if i < len(chars))

                # Collapse runs of repeated characters (CTC-like cleanup);
                # groupby is the stdlib equivalent of the manual prev-char loop.
                from itertools import groupby
                return "".join(ch for ch, _ in groupby(text)).strip()

            elif len(output.shape) == 2:  # single prediction
                char_idx = int(np.argmax(output[0]))
                if char_idx < len(chars):
                    return chars[char_idx]

        # Unknown layout or out-of-alphabet single prediction.
        return ""
    except Exception as e:
        logger.error(f"❌ Error processing model output: {str(e)}")
        return ""
226
+
227
def split_title(title_text):
    """
    Break the detected title into the two fragments printed on the buttons.

    Split rules (mirrors the captcha layout):
      6+ chars: split at the midpoint (abcdef -> abc / def)
      5 chars:  first 3 / last 2     (abcde  -> abc / de)
      4 chars:  split in half        (abcd   -> ab / cd)
      <4 chars: best-effort midpoint split (part2 may be empty for 1 char)

    Returns ("", "") if anything goes wrong.
    """
    try:
        n = len(title_text)

        # Pick the cut position, then slice once.
        if n == 5:
            cut = 3
        elif n == 4:
            cut = 2
        elif n >= 6:
            cut = n // 2
        else:
            # Edge case for very short titles.
            cut = max(1, n // 2)

        part1, part2 = title_text[:cut], title_text[cut:]

        logger.info(f"βœ‚οΈ Title split: '{title_text}' β†’ '{part1}' + '{part2}'")
        return part1, part2
    except Exception as exc:
        logger.error(f"❌ Error splitting title: {str(exc)}")
        return "", ""
261
+
262
def find_matching_buttons(part1, part2, button_predictions):
    """
    Find the buttons whose OCR'd text matches the two title fragments.

    Uses difflib.SequenceMatcher fuzzy matching (threshold 0.6) so small
    OCR errors still match.  Returns a list of up to two button ids: the
    best match for part1, then the best *distinct* match for part2.

    Fix: previously, when part2's top match was the button already chosen
    for part1 it was discarded outright; now we fall back to part2's
    next-best distinct candidate so two buttons can still be returned.
    """
    try:
        matching_buttons = []

        # Normalize both fragments for comparison.
        part1_lower = part1.lower().strip()
        part2_lower = part2.lower().strip()

        logger.info(f"πŸ” Looking for buttons matching: '{part1_lower}' and '{part2_lower}'")

        threshold = 0.6  # minimum similarity ratio to count as a match
        part1_matches = []
        part2_matches = []

        for button_id, button_text in button_predictions.items():
            button_text_lower = button_text.lower().strip()

            # Similarity of this button's text against each title fragment.
            part1_similarity = SequenceMatcher(None, part1_lower, button_text_lower).ratio()
            part2_similarity = SequenceMatcher(None, part2_lower, button_text_lower).ratio()

            if part1_similarity >= threshold:
                part1_matches.append((button_id, part1_similarity, button_text))
                logger.info(f"  πŸ“ Button {button_id} matches part1 '{part1_lower}': '{button_text_lower}' (similarity: {part1_similarity:.2f})")

            if part2_similarity >= threshold:
                part2_matches.append((button_id, part2_similarity, button_text))
                logger.info(f"  πŸ“ Button {button_id} matches part2 '{part2_lower}': '{button_text_lower}' (similarity: {part2_similarity:.2f})")

        # Highest similarity first.
        part1_matches.sort(key=lambda x: x[1], reverse=True)
        part2_matches.sort(key=lambda x: x[1], reverse=True)

        if part1_matches:
            best_part1_match = part1_matches[0]
            matching_buttons.append(best_part1_match[0])
            logger.info(f"🎯 Best match for part1: Button {best_part1_match[0]} ('{best_part1_match[2]}', score: {best_part1_match[1]:.2f})")

        # Take the best part2 candidate that is not already used for part1,
        # rather than dropping part2 entirely on a collision.
        for candidate in part2_matches:
            if candidate[0] not in matching_buttons:
                matching_buttons.append(candidate[0])
                logger.info(f"🎯 Best match for part2: Button {candidate[0]} ('{candidate[2]}', score: {candidate[1]:.2f})")
                break

        logger.info(f"βœ… Final matching buttons: {matching_buttons}")
        return matching_buttons
    except Exception as e:
        logger.error(f"❌ Error finding matching buttons: {str(e)}")
        return []
320
+
321
def solve_assemble_captcha(base64_image):
    """
    End-to-end solver for the "assemble from 2 elements" captcha.

    Pipeline: decode image -> OCR the title -> split it into two fragments
    -> OCR every button -> fuzzy-match fragments to buttons.  Returns a
    dict with a success flag, the detected texts and the button ids to
    click; on any failure returns {"success": False, "error": ..., "message": ...}.
    """
    try:
        logger.info("πŸš€ Starting assemble captcha solving...")

        image = decode_base64_image(base64_image)
        logger.info(f"πŸ“Έ Image decoded: {image.shape}")

        # Steps 1-2: locate and read the title.
        title_text = predict_title(crop_title_area(image))
        if not title_text:
            raise ValueError("Could not detect title text")

        # Step 3: split the title into the two fragments shown on buttons.
        part1, part2 = split_title(title_text)
        if not part1 or not part2:
            raise ValueError("Could not split title into valid parts")

        # Steps 4-5: read the text on every button.
        button_predictions = {}
        for button_id, button_crop in crop_button_areas(image).items():
            text = predict_button_text(button_crop)
            button_predictions[button_id] = text
            logger.info(f"πŸ”² Button {button_id} prediction: '{text}'")

        # Step 6: match fragments to buttons.
        matching_buttons = find_matching_buttons(part1, part2, button_predictions)
        if not matching_buttons:
            raise ValueError("No matching buttons found")

        result = {
            "success": True,
            "title_detected": title_text,
            "title_part1": part1,
            "title_part2": part2,
            "button_predictions": button_predictions,
            "buttons_to_click": matching_buttons,
            "message": f"Found {len(matching_buttons)} matching buttons for '{title_text}' ('{part1}' + '{part2}')"
        }

        logger.info("βœ… Assemble captcha solved successfully!")
        logger.info(f"πŸ“Š Result: {json.dumps(result, indent=2)}")
        return result

    except Exception as e:
        logger.error(f"❌ Error solving assemble captcha: {str(e)}")
        return {
            "success": False,
            "error": str(e),
            "message": "Failed to solve assemble captcha"
        }
384
+
385
+ # Initialize models when app starts
386
+ try:
387
+ load_models()
388
+ except Exception as e:
389
+ logger.error(f"❌ Failed to initialize models: {str(e)}")
390
+
391
+ # Gradio interface
392
def gradio_solve(base64_image):
    """Gradio wrapper: run the solver and return its result as pretty JSON.

    Never raises — any unexpected failure is folded into an error payload
    so the UI always shows valid JSON.
    """
    try:
        return json.dumps(solve_assemble_captcha(base64_image), indent=2)
    except Exception as exc:
        payload = {
            "success": False,
            "error": str(exc),
            "message": "Internal server error"
        }
        return json.dumps(payload, indent=2)
404
+
405
+ # Create Gradio interface
406
+ iface = gr.Interface(
407
+ fn=gradio_solve,
408
+ inputs=gr.Textbox(
409
+ label="Base64 Image",
410
+ placeholder="Paste base64 encoded captcha image here...",
411
+ lines=3
412
+ ),
413
+ outputs=gr.Textbox(
414
+ label="Solution Result",
415
+ lines=10
416
+ ),
417
+ title="XCaptcha2 Assemble Solver",
418
+ description="Solve 'Assemble from 2 elements' type captchas by detecting title and matching buttons",
419
+ examples=[]
420
+ )
421
+
422
+ # For Hugging Face Spaces API
423
+ def solve(image_base64):
424
+ """API endpoint for solving captcha"""
425
+ return solve_assemble_captcha(image_base64)
426
+
427
+ if __name__ == "__main__":
428
+ iface.launch()
model55x140.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1cb0b7a5aa05fe95f7110a99dfc7a210151229744c7a4b1bf3ca279e8cdc1cea
3
+ size 1935908
model90x280.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:161aa69dce99ef1c5c291d9b35163479808ea228ba6fe8903c926e5ca2bc7a77
3
+ size 1938087
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ gradio
2
+ opencv-python
3
+ numpy
4
+ onnxruntime
5
+ pillow
6
+ # NOTE: "difflib" removed from requirements — it is part of the Python standard library, not a pip package