Initial Commit
Browse files- README.md +211 -0
- __pycache__/configuration_captcha.cpython-313.pyc +0 -0
- __pycache__/modeling_captcha.cpython-313.pyc +0 -0
- __pycache__/processing_captcha.cpython-313.pyc +0 -0
- config.json +21 -0
- configuration_captcha.py +7 -0
- images/3eplzv.jpg +0 -0
- images/46CN5W.jpg +0 -0
- images/5820.jpg +0 -0
- images/6521.jpg +0 -0
- images/67qas.jpg +0 -0
- images/75ke.jpg +0 -0
- images/8JKM.jpg +0 -0
- images/8jpwt0.jpg +0 -0
- images/B1QAZ6.jpg +0 -0
- images/CAPTCHA.png +0 -0
- images/CCX8.jpg +0 -0
- images/EPOD.jpg +0 -0
- images/ER6Y.jpg +0 -0
- images/EWSP.jpg +0 -0
- images/GIOGp.jpg +0 -0
- images/HCDS.jpg +0 -0
- images/JBWkEs.jpg +0 -0
- images/KKh8Q.jpg +0 -0
- images/MFMH.jpg +0 -0
- images/NJSEX.jpg +0 -0
- images/R6AB.jpg +0 -0
- images/TVHF.jpg +0 -0
- images/Vb4cG.jpg +0 -0
- images/XaNqQx.jpg +0 -0
- images/YULM.jpg +0 -0
- images/abfsh.jpg +0 -0
- images/b6yc.jpg +0 -0
- images/bCWaLR.jpg +0 -0
- images/confusion-matrix-no-diagonal.png +0 -0
- images/confusion-matrix.png +0 -0
- images/d3no.jpg +0 -0
- images/iq1sZo.jpg +0 -0
- images/kJtOfk.jpg +0 -0
- images/prediction.png +0 -0
- model.safetensors +3 -0
- modeling_captcha.py +41 -0
- pipeline.py +19 -0
- processing_captcha.py +51 -0
- processor_config.json +7 -0
README.md
ADDED
|
@@ -0,0 +1,211 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
tags:
|
| 3 |
+
- ocr
|
| 4 |
+
- pytorch
|
| 5 |
+
license: mit
|
| 6 |
+
datasets:
|
| 7 |
+
- hammer888/captcha-data
|
| 8 |
+
metrics:
|
| 9 |
+
- accuracy
|
| 10 |
+
- cer
|
| 11 |
+
pipeline_tag: image-to-text
|
| 12 |
+
library_name: transformers
|
| 13 |
+
---
|
| 14 |
+
|
| 15 |
+
<div align="center">
|
| 16 |
+
|
| 17 |
+
# ✨ DeepCaptcha-CRNN: Sequential Vision for OCR
|
| 18 |
+
### CRNN Base
|
| 19 |
+
|
| 20 |
+
[](https://opensource.org/licenses/MIT)
|
| 21 |
+
[](https://www.python.org/downloads/release/python-3130/)
|
| 22 |
+
[](https://huggingface.co/Graf-J/captcha-crnn-finetuned)
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
<img src="images/CAPTCHA.png" alt="Captcha Example" width="500">
|
| 27 |
+
|
| 28 |
+
*Advanced sequence recognition using a Convolutional Recurrent Neural Network (CRNN) with Connectionist Temporal Classification (CTC) loss.*
|
| 29 |
+
|
| 30 |
+
</div>
|
| 31 |
+
|
| 32 |
+
---
|
| 33 |
+
|
| 34 |
+
## 📋 Model Details
|
| 35 |
+
- **Task:** Alphanumeric Captcha Recognition
|
| 36 |
+
- **Input:** Images
|
| 37 |
+
- **Output:** String sequences (Length 1–8 characters)
|
| 38 |
+
- **Vocabulary:** Alphanumeric (`a-z`, `A-Z`, `0-9`)
|
| 39 |
+
- **Architecture:** CRNN (CNN + Bi-LSTM)
|
| 40 |
+
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
## 📊 Performance Metrics
|
| 44 |
+
|
| 45 |
+
### **Test Set Results**
|
| 46 |
+
|
| 47 |
+
| Dataset | Sequence Accuracy | Character Error Rate (CER) |
|
| 48 |
+
| --- | --- | --- |
|
| 49 |
+
| **[hammer888/captcha-data](https://huggingface.co/datasets/hammer888/captcha-data)** | `96.81%` | `0.70%` |
|
| 50 |
+
|
| 51 |
+
### **Hardware & Efficiency**
|
| 52 |
+
| Metric | Value |
|
| 53 |
+
| --- | --- |
|
| 54 |
+
| **Model Parameters** | `3,570,943` |
|
| 55 |
+
| **Model Size (Disk)** | `14.3 MB` |
|
| 56 |
+
| **Throughput (Images/sec)** | `447.26 – 467.29` |
|
| 57 |
+
| **Compute Hardware** | **NVIDIA RTX A6000** |
|
| 58 |
+
|
| 59 |
+
---
|
| 60 |
+
|
| 61 |
+
## 🧪 Try It With Sample Images
|
| 62 |
+
|
| 63 |
+
The following images are sampled from the test set of the [hammer888/captcha-data](https://huggingface.co/datasets/hammer888/captcha-data) dataset. Click any image below to download it and test the model locally.
|
| 64 |
+
|
| 65 |
+
<div align="center">
|
| 66 |
+
<table>
|
| 67 |
+
<tr>
|
| 68 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/46CN5W.jpg"><img src="images/46CN5W.jpg" width="120"/></a></td>
|
| 69 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/5820.jpg"><img src="images/5820.jpg" width="120"/></a></td>
|
| 70 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/6521.jpg"><img src="images/6521.jpg" width="120"/></a></td>
|
| 71 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/abfsh.jpg"><img src="images/abfsh.jpg" width="120"/></a></td>
|
| 72 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/67qas.jpg"><img src="images/67qas.jpg" width="120"/></a></td>
|
| 73 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/75ke.jpg"><img src="images/75ke.jpg" width="120"/></a></td>
|
| 74 |
+
</tr>
|
| 75 |
+
<tr>
|
| 76 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/8JKM.jpg"><img src="images/8JKM.jpg" width="120"/></a></td>
|
| 77 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/8jpwt0.jpg"><img src="images/8jpwt0.jpg" width="120"/></a></td>
|
| 78 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/B1QAZ6.jpg"><img src="images/B1QAZ6.jpg" width="120"/></a></td>
|
| 79 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/CCX8.jpg"><img src="images/CCX8.jpg" width="120"/></a></td>
|
| 80 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/EPOD.jpg"><img src="images/EPOD.jpg" width="120"/></a></td>
|
| 81 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/ER6Y.jpg"><img src="images/ER6Y.jpg" width="120"/></a></td>
|
| 82 |
+
</tr>
|
| 83 |
+
<tr>
|
| 84 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/EWSP.jpg"><img src="images/EWSP.jpg" width="120"/></a></td>
|
| 85 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/GIOGp.jpg"><img src="images/GIOGp.jpg" width="120"/></a></td>
|
| 86 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/HCDS.jpg"><img src="images/HCDS.jpg" width="120"/></a></td>
|
| 87 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/JBWkEs.jpg"><img src="images/JBWkEs.jpg" width="120"/></a></td>
|
| 88 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/kJtOfk.jpg"><img src="images/kJtOfk.jpg" width="120"/></a></td>
|
| 89 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/MFMH.jpg"><img src="images/MFMH.jpg" width="120"/></a></td>
|
| 90 |
+
</tr>
|
| 91 |
+
<tr>
|
| 92 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/NJSEX.jpg"><img src="images/NJSEX.jpg" width="120"/></a></td>
|
| 93 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/R6AB.jpg"><img src="images/R6AB.jpg" width="120"/></a></td>
|
| 94 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/TVHF.jpg"><img src="images/TVHF.jpg" width="120"/></a></td>
|
| 95 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/Vb4cG.jpg"><img src="images/Vb4cG.jpg" width="120"/></a></td>
|
| 96 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/XaNqQx.jpg"><img src="images/XaNqQx.jpg" width="120"/></a></td>
|
| 97 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/YULM.jpg"><img src="images/YULM.jpg" width="120"/></a></td>
|
| 98 |
+
</tr>
|
| 99 |
+
<tr>
|
| 100 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/b6yc.jpg"><img src="images/b6yc.jpg" width="120"/></a></td>
|
| 101 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/bCWaLR.jpg"><img src="images/bCWaLR.jpg" width="120"/></a></td>
|
| 102 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/d3no.jpg"><img src="images/d3no.jpg" width="120"/></a></td>
|
| 103 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/3eplzv.jpg"><img src="images/3eplzv.jpg" width="120"/></a></td>
|
| 104 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/iq1sZo.jpg"><img src="images/iq1sZo.jpg" width="120"/></a></td>
|
| 105 |
+
<td><a href="https://huggingface.co/Graf-J/captcha-crnn-finetuned/resolve/main/images/KKh8Q.jpg"><img src="images/KKh8Q.jpg" width="120"/></a></td>
|
| 106 |
+
</tr>
|
| 107 |
+
</table>
|
| 108 |
+
</div>
|
| 109 |
+
|
| 110 |
+
---
|
| 111 |
+
|
| 112 |
+
## 🚀 Quick Start (Pipeline - Recommended)
|
| 113 |
+
|
| 114 |
+
The easiest way to perform inference is using the custom Hugging Face pipeline.
|
| 115 |
+
|
| 116 |
+
```python
|
| 117 |
+
from transformers import pipeline
|
| 118 |
+
from PIL import Image
|
| 119 |
+
|
| 120 |
+
# Initialize the pipeline
|
| 121 |
+
pipe = pipeline(
|
| 122 |
+
task="captcha-recognition",
|
| 123 |
+
model="Graf-J/captcha-crnn-base",
|
| 124 |
+
trust_remote_code=True
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
# Load and predict
|
| 128 |
+
img = Image.open("path/to/image.png")
|
| 129 |
+
result = pipe(img)
|
| 130 |
+
print(f"Decoded Text: {result['prediction']}")
|
| 131 |
+
|
| 132 |
+
```
|
| 133 |
+
|
| 134 |
+
## 🔬 Advanced Usage (Raw Logits & Custom Decoding)
|
| 135 |
+
|
| 136 |
+
Use this method if you need access to the raw logits or internal hidden states.
|
| 137 |
+
|
| 138 |
+
```python
|
| 139 |
+
import torch
|
| 140 |
+
from PIL import Image
|
| 141 |
+
from transformers import AutoModel, AutoProcessor
|
| 142 |
+
|
| 143 |
+
# Load Model & Custom Processor
|
| 144 |
+
repo_id = "Graf-J/captcha-crnn-base"
|
| 145 |
+
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
|
| 146 |
+
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True)
|
| 147 |
+
|
| 148 |
+
model.eval()
|
| 149 |
+
|
| 150 |
+
# Load and process image
|
| 151 |
+
img = Image.open("path/to/image.png")
|
| 152 |
+
inputs = processor(img)
|
| 153 |
+
|
| 154 |
+
# Inference
|
| 155 |
+
with torch.no_grad():
|
| 156 |
+
outputs = model(inputs["pixel_values"])
|
| 157 |
+
logits = outputs.logits
|
| 158 |
+
|
| 159 |
+
# Decode the prediction via CTC logic
|
| 160 |
+
prediction = processor.batch_decode(logits)[0]
|
| 161 |
+
print(f"Prediction: '{prediction}'")
|
| 162 |
+
|
| 163 |
+
```
|
| 164 |
+
|
| 165 |
+
---
|
| 166 |
+
|
| 167 |
+
## ⚙️ Training
|
| 168 |
+
The base model was trained on a refined version of the [hammer888/captcha-data](https://huggingface.co/datasets/hammer888/captcha-data) (1,365,874 images). This dataset underwent a specialized cleaning process where multiple pre-trained models were used to identify and prune inconsistent data. Specifically, images where models were "confidently incorrect" regarding casing (upper/lower-case errors) were removed to ensure high-fidelity ground truth for the final training run.
|
| 169 |
+
|
| 170 |
+
### **Parameters**
|
| 171 |
+
- **Optimizer:** Adam (lr=0.002)
|
| 172 |
+
- **Scheduler:** ReduceLROnPlateau (factor=0.5, patience=3)
|
| 173 |
+
- **Batch Size:** 128
|
| 174 |
+
- **Loss Function:** CTCLoss
|
| 175 |
+
- **Augmentations:** ElasticTransform, Random Rotation, Grayscale Resize
|
| 176 |
+
|
| 177 |
+
---
|
| 178 |
+
|
| 179 |
+
## 🔍 Error Analysis
|
| 180 |
+
|
| 181 |
+
The following confusion matrices illustrate the character-level performance across the alphanumeric vocabulary on the test set of the Python-generated images.
|
| 182 |
+
|
| 183 |
+
### **Full Confusion Matrix**
|
| 184 |
+

|
| 185 |
+
|
| 186 |
+
### **Misclassification Deep Dive**
|
| 187 |
+
|
| 188 |
+
This matrix highlights only the misclassification patterns, stripping away correct predictions to visualize which character pairs (such as '0' vs 'O' or '1' vs 'l') the model most frequently confuses.
|
| 189 |
+

|
| 190 |
+
|
| 191 |
+
---
|
| 192 |
+
|
| 193 |
+
## ⚖️ **License & Citation**
|
| 194 |
+
|
| 195 |
+
This project is licensed under the **MIT License**. If you use this model in your research, portfolio, or applications, please attribute the author.
|
| 196 |
+
|
| 197 |
+
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
__pycache__/configuration_captcha.cpython-313.pyc
ADDED
|
Binary file (806 Bytes). View file
|
|
|
__pycache__/modeling_captcha.cpython-313.pyc
ADDED
|
Binary file (2.92 kB). View file
|
|
|
__pycache__/processing_captcha.cpython-313.pyc
ADDED
|
Binary file (3.08 kB). View file
|
|
|
config.json
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"CaptchaCRNN"
|
| 4 |
+
],
|
| 5 |
+
"dtype": "float32",
|
| 6 |
+
"model_type": "captcha_crnn",
|
| 7 |
+
"num_chars": 63,
|
| 8 |
+
"transformers_version": "5.1.0",
|
| 9 |
+
"auto_map": {
|
| 10 |
+
"AutoConfig": "configuration_captcha.CaptchaConfig",
|
| 11 |
+
"AutoModel": "modeling_captcha.CaptchaCRNN",
|
| 12 |
+
"AutoProcessor": "processing_captcha.CaptchaProcessor"
|
| 13 |
+
},
|
| 14 |
+
"custom_pipelines": {
|
| 15 |
+
"captcha-recognition": {
|
| 16 |
+
"impl": "pipeline.CaptchaPipeline",
|
| 17 |
+
"pt": ["AutoModel"],
|
| 18 |
+
"type": "multimodal"
|
| 19 |
+
}
|
| 20 |
+
}
|
| 21 |
+
}
|
configuration_captcha.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import PretrainedConfig
|
| 2 |
+
|
| 3 |
+
class CaptchaConfig(PretrainedConfig):
    """Configuration for the CaptchaCRNN model.

    Args:
        num_chars: Size of the output vocabulary *including* the CTC blank
            token (62 alphanumeric characters + 1 blank = 63 by default).
    """

    model_type = "captcha_crnn"

    def __init__(self, num_chars=63, **kwargs):
        self.num_chars = num_chars
        super().__init__(**kwargs)
|
images/3eplzv.jpg
ADDED
|
images/46CN5W.jpg
ADDED
|
images/5820.jpg
ADDED
|
images/6521.jpg
ADDED
|
images/67qas.jpg
ADDED
|
images/75ke.jpg
ADDED
|
images/8JKM.jpg
ADDED
|
images/8jpwt0.jpg
ADDED
|
images/B1QAZ6.jpg
ADDED
|
images/CAPTCHA.png
ADDED
|
images/CCX8.jpg
ADDED
|
images/EPOD.jpg
ADDED
|
images/ER6Y.jpg
ADDED
|
images/EWSP.jpg
ADDED
|
images/GIOGp.jpg
ADDED
|
images/HCDS.jpg
ADDED
|
images/JBWkEs.jpg
ADDED
|
images/KKh8Q.jpg
ADDED
|
images/MFMH.jpg
ADDED
|
images/NJSEX.jpg
ADDED
|
images/R6AB.jpg
ADDED
|
images/TVHF.jpg
ADDED
|
images/Vb4cG.jpg
ADDED
|
images/XaNqQx.jpg
ADDED
|
images/YULM.jpg
ADDED
|
images/abfsh.jpg
ADDED
|
images/b6yc.jpg
ADDED
|
images/bCWaLR.jpg
ADDED
|
images/confusion-matrix-no-diagonal.png
ADDED
|
images/confusion-matrix.png
ADDED
|
images/d3no.jpg
ADDED
|
images/iq1sZo.jpg
ADDED
|
images/kJtOfk.jpg
ADDED
|
images/prediction.png
ADDED
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ff93abaec8ddbf2a5979a2c350178ba71c7bf8ca78873ab187a21bd2678df35b
|
| 3 |
+
size 14290964
|
modeling_captcha.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from transformers import PreTrainedModel
|
| 4 |
+
from transformers.modeling_outputs import SequenceClassifierOutput
|
| 5 |
+
from .configuration_captcha import CaptchaConfig
|
| 6 |
+
|
| 7 |
+
class CaptchaCRNN(PreTrainedModel):
    """CRNN captcha recognizer: CNN feature extractor + bidirectional LSTM.

    Produces per-timestep character logits intended for greedy CTC decoding.
    NOTE: the module layout (``conv_layer`` Sequential indices, ``lstm``,
    ``classifier``) must not change — the released checkpoint's state_dict
    keys depend on it.
    """

    config_class = CaptchaConfig

    def __init__(self, config):
        super().__init__(config)
        # Single-channel (grayscale) input. With the processor's 40-px-high
        # images, the three height-halving pools reduce height 40 -> 5 while
        # the final pool keeps the width (time) axis untouched.
        self.conv_layer = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.SiLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm2d(64),
            nn.SiLU(),
            nn.MaxPool2d(2, 2),
            nn.Conv2d(64, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.SiLU(),
            nn.MaxPool2d(kernel_size=(2, 1)),  # halve height only, preserve width
            nn.Conv2d(128, 256, kernel_size=3, padding=1),
            nn.BatchNorm2d(256),
            nn.SiLU()
        )
        # 256 channels * 5 remaining height rows = 1280 features per time step.
        self.lstm = nn.LSTM(input_size=1280, hidden_size=256, bidirectional=True, batch_first=True)
        # 512 = 2 * hidden_size because the LSTM is bidirectional.
        self.classifier = nn.Linear(512, config.num_chars)
        self.post_init()

    def forward(self, x, labels=None):
        """Return ``SequenceClassifierOutput`` with logits of shape
        (batch, time_steps, num_chars).

        ``labels`` is accepted for Trainer/API compatibility but is unused;
        no loss is computed here.
        """
        features = self.conv_layer(x)
        # Use the width axis as the sequence dimension:
        # (B, C, H, W) -> (B, W, C, H).
        features = features.permute(0, 3, 1, 2)
        batch_size, seq_len = features.size(0), features.size(1)
        # Flatten channels * height into one feature vector per time step.
        features = features.view(batch_size, seq_len, -1)
        sequence, _ = self.lstm(features)
        return SequenceClassifierOutput(logits=self.classifier(sequence))
|
pipeline.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from transformers import Pipeline
|
| 2 |
+
import torch
|
| 3 |
+
|
| 4 |
+
class CaptchaPipeline(Pipeline):
    """Custom ``captcha-recognition`` pipeline: PIL image in, decoded string out."""

    def _sanitize_parameters(self, **kwargs):
        # No user-tunable knobs: nothing is routed to preprocess/forward/postprocess.
        return {}, {}, {}

    def preprocess(self, image):
        # Delegate grayscale conversion, resizing and tensorization to the
        # custom processor; yields {"pixel_values": tensor}.
        return self.processor(image)

    def _forward(self, model_inputs):
        # Inference only — no gradients needed.
        with torch.no_grad():
            return self.model(model_inputs["pixel_values"])

    def postprocess(self, model_outputs):
        # Greedy CTC decode; a single image means a single batch element.
        return {"prediction": self.processor.batch_decode(model_outputs.logits)[0]}
|
processing_captcha.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import string
|
| 2 |
+
import torch
|
| 3 |
+
import torchvision.transforms.functional as F
|
| 4 |
+
from transformers.processing_utils import ProcessorMixin
|
| 5 |
+
|
| 6 |
+
class CaptchaProcessor(ProcessorMixin):
    """Image preprocessing and greedy CTC decoding for the captcha CRNN.

    Index 0 is reserved for the CTC blank; vocabulary characters map to
    indices 1..len(vocab).
    """

    attributes = []

    def __init__(self, vocab=None, **kwargs):
        super().__init__(**kwargs)
        self.vocab = vocab or (string.ascii_lowercase + string.ascii_uppercase + string.digits)
        # 0 -> blank (decodes to the empty string), 1.. -> vocabulary characters.
        self.idx_to_char = {index + 1: char for index, char in enumerate(self.vocab)}
        self.idx_to_char[0] = ""

    def __call__(self, images):
        """Convert one PIL image (or a list of them) into model-ready tensors.

        Returns:
            dict with "pixel_values": float tensor of shape (B, 1, 40, 150),
            values scaled to [0, 1].
        """
        if not isinstance(images, list):
            images = [images]

        # Grayscale, fixed (width=150, height=40) size, then [0, 1] tensor.
        tensors = [F.to_tensor(img.convert("L").resize((150, 40))) for img in images]
        return {"pixel_values": torch.stack(tensors)}

    def batch_decode(self, logits):
        """Greedy (best-path) CTC decode: collapse repeats, then drop blanks.

        Accepts logits of shape (B, T, num_chars) or (T, num_chars) and
        returns a list of decoded strings, one per batch element.
        """
        tokens = torch.argmax(logits, dim=-1)
        if tokens.dim() == 1:
            tokens = tokens.unsqueeze(0)

        decoded_strings = []
        for sequence in tokens.tolist():
            chars = []
            previous = None
            for token in sequence:
                # Keep a non-blank token only when it differs from the raw
                # predecessor; a blank between duplicates resets `previous`,
                # so genuine repeated characters survive.
                if token != 0 and token != previous:
                    chars.append(self.idx_to_char.get(token, ""))
                previous = token
            decoded_strings.append("".join(chars))
        return decoded_strings
|
processor_config.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"processor_class": "CaptchaProcessor",
|
| 3 |
+
"vocab": "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789",
|
| 4 |
+
"auto_map": {
|
| 5 |
+
"AutoProcessor": "processing_captcha.CaptchaProcessor"
|
| 6 |
+
}
|
| 7 |
+
}
|