Devam0 committed on
Commit
c25b15a
·
1 Parent(s): 4a390bf

Initialize project with proper LFS tracking

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. README.md +253 -11
  3. app.py +32 -0
  4. models/FSRCNN_x3.pb +3 -0
  5. models/depth_anything_v2/.gitattributes +35 -0
  6. models/depth_anything_v2/README.md +108 -0
  7. models/depth_anything_v2/config.json +53 -0
  8. models/depth_anything_v2/model.safetensors +3 -0
  9. models/depth_anything_v2/preprocessor_config.json +44 -0
  10. models/helmet_v11.pt +3 -0
  11. models/license.pt +3 -0
  12. models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/.gitattributes +36 -0
  13. models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/README.md +152 -0
  14. models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/config.json +102 -0
  15. models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/inference.json +0 -0
  16. models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/inference.pdiparams +3 -0
  17. models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/inference.yml +48 -0
  18. models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/.gitattributes +36 -0
  19. models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/README.md +104 -0
  20. models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/config.json +98 -0
  21. models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/inference.json +0 -0
  22. models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/inference.pdiparams +3 -0
  23. models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/inference.yml +46 -0
  24. models/paddleocr/official_models/PP-OCRv5_mobile_det/.msc +0 -0
  25. models/paddleocr/official_models/PP-OCRv5_mobile_det/.mv +1 -0
  26. models/paddleocr/official_models/PP-OCRv5_mobile_det/README.md +219 -0
  27. models/paddleocr/official_models/PP-OCRv5_mobile_det/config.json +111 -0
  28. models/paddleocr/official_models/PP-OCRv5_mobile_det/inference.json +0 -0
  29. models/paddleocr/official_models/PP-OCRv5_mobile_det/inference.pdiparams +3 -0
  30. models/paddleocr/official_models/PP-OCRv5_mobile_det/inference.yml +53 -0
  31. models/paddleocr/official_models/UVDoc/.gitattributes +36 -0
  32. models/paddleocr/official_models/UVDoc/README.md +131 -0
  33. models/paddleocr/official_models/UVDoc/config.json +57 -0
  34. models/paddleocr/official_models/UVDoc/inference.json +0 -0
  35. models/paddleocr/official_models/UVDoc/inference.pdiparams +3 -0
  36. models/paddleocr/official_models/UVDoc/inference.yml +16 -0
  37. models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/.gitattributes +36 -0
  38. models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/README.md +169 -0
  39. models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/config.json +533 -0
  40. models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/inference.json +0 -0
  41. models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/inference.pdiparams +3 -0
  42. models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/inference.yml +479 -0
  43. models/stage1_best.pt +3 -0
  44. models/yolov8s.pt +3 -0
  45. patch_safetensors.py +13 -0
  46. requirements.txt +48 -0
  47. run_inference.py +25 -0
  48. solution.py +405 -0
  49. testimages/1.jpg +0 -0
  50. testimages/2.webp +0 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ *.pdiparams filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,14 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- title: Gridlock
3
- emoji: 👁
4
- colorFrom: gray
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 6.19.0
8
- python_version: '3.13'
9
- app_file: app.py
10
- pinned: false
11
- short_description: flipkart gridlock hackathon
 
 
 
 
 
 
 
 
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AID 728 — Traffic Rule Violation Detection
2
+
3
+ **IIIT Bangalore**
4
+
5
+ Detects traffic rule violations involving two-wheelers from single RGB street-camera images. Identifies **helmet violations**, **over-riding (>2 riders on one bike)**, and extracts the **license plate text** of every violating vehicle.
6
+
7
+ ---
8
+
9
+ ## Submission Files
10
+
11
+ ```
12
+ final_submission/
13
+ ├── solution.py # Core detection pipeline (TrafficViolationDetector class)
14
+ ├── requirements.txt # All Python dependencies
15
+ ├── README.md # This file
16
+ └── models/ # All model weights (bundled, fully offline)
17
+ ├── yolov8s.pt # COCO primary detector (21.54 MB)
18
+ ├── stage1_best.pt # Custom two-wheeler detector (21.49 MB)
19
+ ├── helmet_v11.pt # Helmet classifier (5.22 MB)
20
+ ├── license.pt # License plate localiser (42.77 MB)
21
+ ├── FSRCNN_x3.pb # Super-resolution for plates (0.04 MB)
22
+ ├── depth_anything_v2/ # Depth-Anything V2 Small (HF) (47.31 MB fp16)
23
+ └── paddleocr/ # Bundled PaddleOCR models
24
+ └── official_models/
25
+ └── ...
26
+ ```
27
+ The pipeline also uses the `inference_sdk` to query the Roboflow API for:
28
+ - **Wrong-way driving detection** (`wrong-way-driving-detection-gqdmg/1`)
29
+ - **Seatbelt classification** (`seat-belt-detection-udcfg/5`)
30
+
31
+ Total model size: 194.59 MB (limit: 250 MB)
32
+
33
+
34
+ ---
35
+
36
+ ## Quick Start
37
+
38
+ ### Install dependencies
39
+ ```bash
40
+ pip install -r requirements.txt
41
+ ```
42
+
43
+ ### Run inference
44
+ ```python
45
+ from solution import TrafficViolationDetector
46
+
47
+ detector = TrafficViolationDetector(model_dir="./models")
48
+ result = detector.predict("path/to/image.jpg")
49
+ print(result)
50
+ ```
51
+
52
+ ### Output format
53
+ ```json
54
+ {
55
+ "violations": [
56
+ {
57
+ "vehicle_type": "two_wheeler",
58
+ "num_riders": 2,
59
+ "helmet_violations": 1,
60
+ "wrong_way": false,
61
+ "license_plate": "DL 7S AF 8144"
62
+ },
63
+ {
64
+ "vehicle_type": "four_wheeler",
65
+ "seatbelt_violations": 1,
66
+ "wrong_way": true,
67
+ "license_plate": "MH 12 AB 1234"
68
+ }
69
+ ]
70
+ }
71
+ ```
72
+
73
+ - One entry per **violating** vehicle only (two-wheeler or four-wheeler)
74
+ - `violations` is an empty list `[]` if no violations are found
75
+ - `license_plate` is `"UNKNOWN"` when the plate cannot be read
76
+ - `num_riders` counts riders per bike; `helmet_violations` counts those without a helmet
77
+
78
+ ---
79
+
80
+ ## Pipeline Architecture
81
+
82
+ The pipeline runs in 9 sequential stages per image:
83
+
84
+ ```
85
+ Input Image
86
+
87
+
88
+ ┌─────────────────────────────────────────────────────────────────┐
89
+ │ Stage 1 — Primary Detection (yolov8s.pt, COCO) │
90
+ │ Detects: persons (cls 0), motorcycles (cls 3) │
91
+ └────────────────────────┬────────────────────────────────────────┘
92
+
93
+ ┌────────────────────▼────────────────────┐
94
+ │ Stage 2 — Supplemental Bike Detection │
95
+ │ (stage1_best.pt — custom trained) │
96
+ │ Merged with Stage 1 bikes via NMS │
97
+ └────────────────────┬────────────────────┘
98
+
99
+ ┌────────────────────▼────────────────────┐
100
+ │ Stage 3 — Monocular Depth Estimation │
101
+ │ (Depth-Anything V2 Small, fp16 stored) │
102
+ │ Produces normalised depth map [0,1] │
103
+ └────────────────────┬────────────────────┘
104
+
105
+ ┌────────────────────▼────────────────────┐
106
+ │ Stage 4 — Person → Bike Association │
107
+ │ Criteria: IoU overlap + column align │
108
+ │ + depth proximity check │
109
+ └────────────────────┬────────────────────┘
110
+
111
+ ┌──────────▼──────────┐
112
+ │ Per-bike loop │
113
+ └──────────┬──────────┘
114
+
115
+ ┌────────────────────▼────────────────────┐
116
+ │ Stage 5 — Helmet Classification │
117
+ │ (helmet_v11.pt — YOLOv11 custom) │
118
+ │ Crops top 45% of each rider bbox │
119
+ │ (head region), runs cls 0=helmet │
120
+ └────────────────────┬────────────────────┘
121
+
122
+ ┌────────────────────▼────────────────────┐
123
+ │ Stage 6 — Wrong Way Detection (API) │
124
+ │ (wrong-way-driving-detection-gqdmg/1) │
125
+ │ Flags vehicle bounding boxes that │
126
+ │ overlap with 'wrong-side' detections │
127
+ └────────────────────┬────────────────────┘
128
+
129
+ ┌────────────────────▼────────────────────┐
130
+ │ Stage 7 — Seatbelt Detection (API) │
131
+ │ (seat-belt-detection-udcfg/5) │
132
+ │ Runs only on four-wheeler crops │
133
+ └────────────────────┬────────────────────┘
134
+
135
+ ┌────────────────────▼────────────────────┐
136
+ │ Stage 8 — License Plate Localisation │
137
+ │ (license.pt — YOLO custom) │
138
+ │ Runs on violating vehicles │
139
+ └────────────────────┬────────────────────┘
140
+
141
+ ┌────────────────────▼────────────────────┐
142
+ │ Stage 9 — OCR (PaddleOCR 3.5.0) │
143
+ │ FSRCNN x3 super-resolution → CLAHE │
144
+ │ sharpening → PP-OCRv5 mobile det+rec │
145
+ │ Text cleaned: uppercase alphanumeric │
146
+ └────────────────────┬────────────────────┘
147
+
148
+
149
+ Output: violations list
150
+ ```
151
+
152
+ ### Violation Logic
153
+ - A bike is flagged as a **violation** if:
154
+ - `num_riders >= 3` (over-riding), **OR**
155
+ - `helmet_violations > 0` (at least one rider without a helmet)
156
+ - Only violating bikes appear in the output list
157
+
158
+ ---
159
+
160
+ ## Model Details
161
+
162
+ ### `yolov8s.pt` — COCO Primary Detector
163
+ - **Type**: YOLOv8 Small, pretrained on COCO
164
+ - **Used for**: Detecting persons (class 0) and motorcycles (class 3)
165
+ - **Confidence**: 0.30, IoU: 0.45
166
+
167
+ ### `stage1_best.pt` — Custom Two-Wheeler Detector
168
+ - **Type**: YOLOv8-based, custom trained
169
+ - **Used for**: Supplementing COCO detections with domain-specific two-wheeler types (scooters, three-wheelers, etc. that COCO misses)
170
+ - **Merge**: Combined with COCO bike boxes via IoU-based NMS (threshold 0.45)
171
+ - **Augmented inference** (`augment=True`) for improved recall
172
+
173
+ ### `depth_anything_v2/` — Monocular Depth Estimation
174
+ - **Type**: Depth-Anything V2 Small (Hugging Face Transformers)
175
+ - **Used for**: Filtering out background pedestrians that share column overlap with a detected bike but are at a different depth plane
176
+ - **Storage**: fp16 safetensors on disk (47.3 MB vs 94.6 MB fp32) — loaded as fp32 at runtime for CPU inference speed
177
+ - **Output**: Normalised depth map [0, 1] resized to match the input image
178
+
179
+ ### `helmet_v11.pt` — Helmet Classifier
180
+ - **Type**: YOLOv11-based, custom trained on merged dataset
181
+ - **Training data**: 4 merged Kaggle datasets (andrewmvd, aneesarom, roboflow ×2) — all remapped to 2 classes: `with_helmet (0)`, `without_helmet (1)`
182
+ - **Input**: Top 45% of each rider bounding box (head crop) with 5% lateral padding
183
+ - **Confidence**: 0.25
184
+
185
+ ### `license.pt` — License Plate Localiser
186
+ - **Type**: YOLO custom, trained on Indian license plates
187
+ - **Used for**: Detecting the tight bounding box of the license plate within a bike crop
188
+ - **Confidence**: 0.20 (low threshold to catch partially visible plates)
189
+
190
+ ### `FSRCNN_x3.pb` — Super-Resolution
191
+ - **Type**: FSRCNN (Fast Super-Resolution CNN), ×3 scale, TensorFlow/OpenCV DNN
192
+ - **Used for**: Upscaling small plate crops (often <100px tall) 3× before OCR to improve recognition accuracy
193
+
194
+ ### `paddleocr/` — OCR Engine (PaddleOCR 3.5.0)
195
+ - **Detection**: `PP-OCRv5_mobile_det` (4.7 MB) — finds text line bounding boxes within the plate crop
196
+ - **Recognition**: `en_PP-OCRv5_mobile_rec` (7.6 MB) — reads each text line
197
+ - **Orientation models**: `PP-LCNet_x1_0_doc_ori`, `PP-LCNet_x1_0_textline_ori` — handle rotated plates
198
+ - **Unwarping**: `UVDoc` — corrects perspective distortion
199
+ - **API**: Uses the legacy `.ocr()` method (not `.predict()`). Both call the same underlying pipeline, but `.ocr()` uses a compatible inference backend on Windows/Linux CPU without triggering the OneDNN fused_conv2d operator crash present in the newer `.predict()` path
200
+ - **Post-processing**: Text is uppercased, non-alphanumeric characters stripped, tokens shorter than 2 characters discarded
201
+
202
+ ---
203
+
204
+ ## Offline Operation
205
+
206
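+ All model weights are bundled in `./models/`, so the local models need no internet connection at runtime. The wrong-way and seatbelt stages are the exception: they query the Roboflow API through `inference_sdk` and therefore require network access.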
207
+
208
+ PaddleOCR 3.5.0 uses [paddlex](https://github.com/PaddlePaddle/PaddleX) internally and looks for models via the `PADDLE_PDX_CACHE_HOME` environment variable. `solution.py` sets this variable to `./models/paddleocr/` **before** any paddle import, so paddlex resolves all models from the bundled path:
209
+
210
+ ```python
211
+ os.environ["PADDLE_PDX_CACHE_HOME"] = str(Path(__file__).parent / "models" / "paddleocr")
212
+ ```
213
+
214
  ---
215
+
216
+ ## Design Decisions
217
+
218
+ ### Why two bike detectors?
219
+ COCO's `motorcycle` class (cls 3) misses many Indian two-wheeler types. The custom `stage1_best.pt` trained on traffic footage recovers these. Boxes from both are merged via NMS.
220
+
221
+ ### Why depth filtering?
222
+ In busy street scenes, COCO frequently detects pedestrians on the footpath who share horizontal overlap with a detected bike. Depth-Anything V2 provides a proxy for Z-distance; persons whose median depth differs from the bike's median depth by more than 35% are excluded from association.
223
+
224
+ ### Why not use PaddleOCR's server detection model?
225
+ `PP-OCRv5_server_det` is 84.3 MB — bundling it would push the total over 250 MB. Instead, `license.pt` performs the coarse plate localisation (narrowing the search area to ~125×90 px), then `PP-OCRv5_mobile_det` (4.7 MB) finds individual text lines within that small crop, and `en_PP-OCRv5_mobile_rec` reads them. This two-stage localisation gives equivalent quality at a fraction of the size.
226
+
227
+ ### Why store depth model as fp16?
228
+ `model.safetensors` was converted from fp32 (94.6 MB) to fp16 (47.3 MB) at submission time using `safetensors.torch`. At runtime the model is loaded as fp32 (`dtype=torch.float32`) because x86 CPUs have no native fp16 compute units — running fp16 tensors on CPU causes a 10× slowdown. The disk saving is free; the compute cost is zero.
229
+
230
+ ### Fallback for missing riders
231
+ If no COCO person is associated with a detected bike (e.g., very small image, occluded rider), one rider with no helmet is assumed. This is a conservative choice — it risks a false positive but never misses a genuine violation.
232
+
233
  ---
234
 
235
+ ## Constraints Compliance
236
+
237
+ | Constraint | Status |
238
+ |---|---|
239
+ | Model size ≤ 250 MB | ✅ 194.6 MB |
240
+ | No VLMs > 1B parameters | ✅ Largest model is Depth-Anything V2 Small (~24M params) |
241
+ | Fully offline execution | ✅ All weights in `./models/`, `PADDLE_PDX_CACHE_HOME` redirected |
242
+ | `TrafficViolationDetector` interface | ✅ `__init__(model_dir)` + `predict(image_path) → dict` |
243
+ | Stateless `predict()` | ✅ No mutable shared state between calls |
244
+ | Error handling | ✅ All exceptions caught; returns `{"violations": []}` on failure |
245
+
246
+ ---
247
+
248
+ ## Performance (Local Windows CPU)
249
+
250
+ | Metric | Value |
251
+ |---|---|
252
+ | Init time (cold start) | ~3–4 s |
253
+ | Inference — simple scene (1–2 bikes) | ~4–5 s |
254
+ | Inference — dense scene (8+ bikes) | ~10–12 s |
255
+
256
+ > **Note**: The evaluation server runs Linux with a faster CPU; inference times are expected to be lower. Depth estimation (Depth-Anything V2) is the primary bottleneck on CPU.
app.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from solution import TrafficViolationDetector
3
+
4
+ # Initialize the detector once at startup to keep models loaded in memory
5
+ print("Loading models for Hugging Face Space...")
6
+ detector = TrafficViolationDetector(model_dir="./models")
7
+ print("Models loaded successfully!")
8
+
9
+ def detect_violations(image_path):
10
+ if image_path is None:
11
+ return {"error": "No image provided"}
12
+
13
+ try:
14
+ # The detector.predict expects a path to the image
15
+ result = detector.predict(image_path)
16
+ return result
17
+ except Exception as e:
18
+ return {"error": str(e)}
19
+
20
+ # Create the Gradio interface
21
+ iface = gr.Interface(
22
+ fn=detect_violations,
23
+ inputs=gr.Image(type="filepath", label="Upload Traffic Image"),
24
+ outputs=gr.JSON(label="Violation Results"),
25
+ title="Traffic Rule Violation Detection API",
26
+ description="Upload an image to detect traffic violations. Supports two-wheelers (helmet, over-riding, wrong-way) and four-wheelers (seatbelt, wrong-way). Detects and runs OCR on the license plates of violating vehicles.\n\nThis application can be accessed programmatically via its built-in API.",
27
+ allow_flagging="never"
28
+ )
29
+
30
+ if __name__ == "__main__":
31
+ # Launch on 0.0.0.0 to allow Hugging Face to route traffic
32
+ iface.launch(server_name="0.0.0.0", server_port=7860)
models/FSRCNN_x3.pb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:efd38655a815908c6c8954db6052f128e76a735f1de657894c477d0dc0b64481
3
+ size 40093
models/depth_anything_v2/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/depth_anything_v2/README.md ADDED
@@ -0,0 +1,108 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ tags:
4
+ - depth
5
+ - relative depth
6
+ pipeline_tag: depth-estimation
7
+ library: transformers
8
+ widget:
9
+ - inference: false
10
+ ---
11
+
12
+ # Depth Anything V2 Small – Transformers Version
13
+
14
+ Depth Anything V2 is trained from 595K synthetic labeled images and 62M+ real unlabeled images, providing the most capable monocular depth estimation (MDE) model with the following features:
15
+ - more fine-grained details than Depth Anything V1
16
+ - more robust than Depth Anything V1 and SD-based models (e.g., Marigold, Geowizard)
17
+ - more efficient (10x faster) and more lightweight than SD-based models
18
+ - impressive fine-tuned performance with our pre-trained models
19
+
20
+ This model checkpoint is compatible with the transformers library.
21
+
22
+ Depth Anything V2 was introduced in [the paper of the same name](https://arxiv.org/abs/2406.09414) by Lihe Yang et al. It uses the same architecture as the original Depth Anything release, but uses synthetic data and a larger capacity teacher model to achieve much finer and more robust depth predictions. The original Depth Anything model was introduced in the paper [Depth Anything: Unleashing the Power of Large-Scale Unlabeled Data](https://arxiv.org/abs/2401.10891) by Lihe Yang et al., and was first released in [this repository](https://github.com/LiheYoung/Depth-Anything).
23
+
24
+ [Online demo](https://huggingface.co/spaces/depth-anything/Depth-Anything-V2).
25
+
26
+ ## Model description
27
+
28
+ Depth Anything V2 leverages the [DPT](https://huggingface.co/docs/transformers/model_doc/dpt) architecture with a [DINOv2](https://huggingface.co/docs/transformers/model_doc/dinov2) backbone.
29
+
30
+ The model is trained on ~600K synthetic labeled images and ~62 million real unlabeled images, obtaining state-of-the-art results for both relative and absolute depth estimation.
31
+
32
+ <img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/model_doc/depth_anything_overview.jpg"
33
+ alt="drawing" width="600"/>
34
+
35
+ <small> Depth Anything overview. Taken from the <a href="https://arxiv.org/abs/2401.10891">original paper</a>.</small>
36
+
37
+ ## Intended uses & limitations
38
+
39
+ You can use the raw model for tasks like zero-shot depth estimation. See the [model hub](https://huggingface.co/models?search=depth-anything) to look for
40
+ other versions on a task that interests you.
41
+
42
+ ### How to use
43
+
44
+ Here is how to use this model to perform zero-shot depth estimation:
45
+
46
+ ```python
47
+ from transformers import pipeline
48
+ from PIL import Image
49
+ import requests
50
+
51
+ # load pipe
52
+ pipe = pipeline(task="depth-estimation", model="depth-anything/Depth-Anything-V2-Small-hf")
53
+
54
+ # load image
55
+ url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
56
+ image = Image.open(requests.get(url, stream=True).raw)
57
+
58
+ # inference
59
+ depth = pipe(image)["depth"]
60
+ ```
61
+
62
+ Alternatively, you can use the model and processor classes:
63
+
64
+ ```python
65
+ from transformers import AutoImageProcessor, AutoModelForDepthEstimation
66
+ import torch
67
+ import numpy as np
68
+ from PIL import Image
69
+ import requests
70
+
71
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
72
+ image = Image.open(requests.get(url, stream=True).raw)
73
+
74
+ image_processor = AutoImageProcessor.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf")
75
+ model = AutoModelForDepthEstimation.from_pretrained("depth-anything/Depth-Anything-V2-Small-hf")
76
+
77
+ # prepare image for the model
78
+ inputs = image_processor(images=image, return_tensors="pt")
79
+
80
+ with torch.no_grad():
81
+ outputs = model(**inputs)
82
+ predicted_depth = outputs.predicted_depth
83
+
84
+ # interpolate to original size
85
+ prediction = torch.nn.functional.interpolate(
86
+ predicted_depth.unsqueeze(1),
87
+ size=image.size[::-1],
88
+ mode="bicubic",
89
+ align_corners=False,
90
+ )
91
+ ```
92
+
93
+ For more code examples, please refer to the [documentation](https://huggingface.co/transformers/main/model_doc/depth_anything.html#).
94
+
95
+
96
+ ### Citation
97
+
98
+ ```bibtex
99
+ @misc{yang2024depth,
100
+ title={Depth Anything V2},
101
+ author={Lihe Yang and Bingyi Kang and Zilong Huang and Zhen Zhao and Xiaogang Xu and Jiashi Feng and Hengshuang Zhao},
102
+ year={2024},
103
+ eprint={2406.09414},
104
+ archivePrefix={arXiv},
105
+ primaryClass={cs.CV}
106
+ }
107
+ ```
108
+
models/depth_anything_v2/config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_commit_hash": null,
3
+ "architectures": [
4
+ "DepthAnythingForDepthEstimation"
5
+ ],
6
+ "backbone": null,
7
+ "backbone_config": {
8
+ "architectures": [
9
+ "Dinov2Model"
10
+ ],
11
+ "hidden_size": 384,
12
+ "image_size": 518,
13
+ "model_type": "dinov2",
14
+ "num_attention_heads": 6,
15
+ "out_features": [
16
+ "stage3",
17
+ "stage6",
18
+ "stage9",
19
+ "stage12"
20
+ ],
21
+ "out_indices": [
22
+ 3,
23
+ 6,
24
+ 9,
25
+ 12
26
+ ],
27
+ "patch_size": 14,
28
+ "reshape_hidden_states": false,
29
+ "torch_dtype": "float32"
30
+ },
31
+ "fusion_hidden_size": 64,
32
+ "head_hidden_size": 32,
33
+ "head_in_index": -1,
34
+ "initializer_range": 0.02,
35
+ "model_type": "depth_anything",
36
+ "neck_hidden_sizes": [
37
+ 48,
38
+ 96,
39
+ 192,
40
+ 384
41
+ ],
42
+ "patch_size": 14,
43
+ "reassemble_factors": [
44
+ 4,
45
+ 2,
46
+ 1,
47
+ 0.5
48
+ ],
49
+ "reassemble_hidden_size": 384,
50
+ "torch_dtype": "float32",
51
+ "transformers_version": null,
52
+ "use_pretrained_backbone": false
53
+ }
models/depth_anything_v2/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:453e3d7ffaa5d89985ea6593e87af0091026e098d028ae7ab7f929780c3a2f85
3
+ size 49603410
models/depth_anything_v2/preprocessor_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_valid_processor_keys": [
3
+ "images",
4
+ "do_resize",
5
+ "size",
6
+ "keep_aspect_ratio",
7
+ "ensure_multiple_of",
8
+ "resample",
9
+ "do_rescale",
10
+ "rescale_factor",
11
+ "do_normalize",
12
+ "image_mean",
13
+ "image_std",
14
+ "do_pad",
15
+ "size_divisor",
16
+ "return_tensors",
17
+ "data_format",
18
+ "input_data_format"
19
+ ],
20
+ "do_normalize": true,
21
+ "do_pad": false,
22
+ "do_rescale": true,
23
+ "do_resize": true,
24
+ "ensure_multiple_of": 14,
25
+ "image_mean": [
26
+ 0.485,
27
+ 0.456,
28
+ 0.406
29
+ ],
30
+ "image_processor_type": "DPTImageProcessor",
31
+ "image_std": [
32
+ 0.229,
33
+ 0.224,
34
+ 0.225
35
+ ],
36
+ "keep_aspect_ratio": true,
37
+ "resample": 3,
38
+ "rescale_factor": 0.00392156862745098,
39
+ "size": {
40
+ "height": 518,
41
+ "width": 518
42
+ },
43
+ "size_divisor": null
44
+ }
models/helmet_v11.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a228e55c20f452a1e19ca42d5c9fd2f115611667a7cddce3c68046aa6c590e43
3
+ size 5478490
models/license.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e30a2bfd1f8342eb7f21d9f7d3bb9452c8570eb597df3eb8bbe04e66f8fde0f6
3
+ size 44849729
models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ inference.pdiparams filter=lfs diff=lfs merge=lfs -text
models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/README.md ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: PaddleOCR
4
+ language:
5
+ - en
6
+ - zh
7
+ pipeline_tag: image-to-text
8
+ tags:
9
+ - OCR
10
+ - PaddlePaddle
11
+ - PaddleOCR
12
+ - doc_img_orientation_classification
13
+ ---
14
+
15
+ # PP-LCNet_x1_0_doc_ori
16
+
17
+ ## Introduction
18
+
19
+ The Document Image Orientation Classification Module is primarily designed to distinguish the orientation of document images and correct them through post-processing. During processes such as document scanning or ID photo capturing, the device might be rotated to achieve clearer images, resulting in images with various orientations. Standard OCR pipelines may not handle these images effectively. By leveraging image classification techniques, the orientation of documents or IDs containing text regions can be pre-determined and adjusted, thereby improving the accuracy of OCR processing. The key accuracy metrics are as follows:
20
+
21
+ <table>
22
+ <tr>
23
+ <th>Model</th>
24
+ <th>Recognition Avg Accuracy(%)</th>
25
+ <th>Model Storage Size (M)</th>
26
+ <th>Introduction</th>
27
+ </tr>
28
+ <tr>
29
+ <td>PP-LCNet_x1_0_doc_ori</td>
30
+ <td>99.06</td>
31
+ <td>7</td>
32
+ <td>A document image classification model based on PP-LCNet_x1_0, with four categories: 0°, 90°, 180°, and 270°.</td>
33
+ </tr>
34
+ </table>
35
+
36
+ ## Quick Start
37
+
38
+ ### Installation
39
+
40
+ 1. PaddlePaddle
41
+
42
+ Please refer to the following commands to install PaddlePaddle using pip:
43
+
44
+ ```bash
45
+ # for CUDA11.8
46
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
47
+
48
+ # for CUDA12.6
49
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
50
+
51
+ # for CPU
52
+ python -m pip install paddlepaddle==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
53
+ ```
54
+
55
+ For details about PaddlePaddle installation, please refer to the [PaddlePaddle official website](https://www.paddlepaddle.org.cn/en/install/quick).
56
+
57
+ 2. PaddleOCR
58
+
59
+ Install the latest version of the PaddleOCR inference package from PyPI:
60
+
61
+ ```bash
62
+ python -m pip install paddleocr
63
+ ```
64
+
65
+ ### Model Usage
66
+
67
+ You can quickly experience the functionality with a single command:
68
+
69
+ ```bash
70
+ paddleocr doc_img_orientation_classification \
71
+ --model_name PP-LCNet_x1_0_doc_ori \
72
+ -i https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/4ifXaBJmFByG_mAnF86Vv.png
73
+ ```
74
+
75
+ You can also integrate the model inference of the document image orientation classification module into your project. Before running the following code, please download the sample image to your local machine.
76
+
77
+ ```python
78
+ from paddleocr import DocImgOrientationClassification
79
+ model = DocImgOrientationClassification(model_name="PP-LCNet_x1_0_doc_ori")
80
+ output = model.predict(input="4ifXaBJmFByG_mAnF86Vv.png", batch_size=1)
81
+ for res in output:
82
+ res.print()
83
+ res.save_to_img(save_path="./output/")
84
+ res.save_to_json(save_path="./output/res.json")
85
+ ```
86
+
87
+ After running, the obtained result is as follows:
88
+
89
+ ```json
90
+ {'res': {'input_path': '/root/.paddlex/predict_input/4ifXaBJmFByG_mAnF86Vv.png', 'page_index': None, 'class_ids': array([2], dtype=int32), 'scores': array([0.90971], dtype=float32), 'label_names': ['180']}}
91
+ ```
92
+
93
+ The visualized image is as follows:
94
+
95
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/DU_k30fxijLXFdXl179-0.png)
96
+
97
+ For details about usage command and descriptions of parameters, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/module_usage/text_recognition.html#iii-quick-start).
98
+
99
+ ### Pipeline Usage
100
+
101
+ The ability of a single model is limited, but a pipeline consisting of several models can provide more capacity to solve difficult problems in real-world scenarios.
102
+
103
+ #### doc_preprocessor
104
+
105
+ The Document Image Preprocessing Pipeline integrates two key functions: document orientation classification and geometric distortion correction. The document orientation classification module automatically identifies the four possible orientations of a document (0°, 90°, 180°, 270°), ensuring that the document is processed in the correct direction. The text image unwarping model is designed to correct geometric distortions that occur during document photography or scanning, restoring the document's original shape and proportions. This pipeline is suitable for digital document management, preprocessing tasks for OCR, and any scenario requiring improved document image quality. By automating orientation correction and geometric distortion correction, this module significantly enhances the accuracy and efficiency of document processing, providing a more reliable foundation for image analysis. The pipeline also offers flexible service-oriented deployment options, supporting calls from various programming languages on multiple hardware platforms. Additionally, the pipeline supports secondary development, allowing you to fine-tune the models on your own datasets and seamlessly integrate the trained models. And there are 2 modules in the pipeline:
106
+ * Document Image Orientation Classification Module (Optional)
107
+ * Text Image Unwarping Module (Optional)
108
+
109
+ Run a single command to quickly experience the document image preprocessing pipeline:
110
+
111
+ ```bash
112
+ paddleocr doc_preprocessor -i https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/pY6sY6wLDuoHF1-cGUvDr.png \
113
+ --use_doc_orientation_classify True \
114
+ --use_doc_unwarping True \
115
+ --doc_orientation_classify_model_name PP-LCNet_x1_0_doc_ori \
116
+ --save_path ./output \
117
+ --device gpu:0
118
+ ```
119
+
120
+ Results are printed to the terminal:
121
+
122
+ ```json
123
+ {'res': {'input_path': '/root/.paddlex/predict_input/pY6sY6wLDuoHF1-cGUvDr.png', 'page_index': None, 'model_settings': {'use_doc_orientation_classify': True, 'use_doc_unwarping': True}, 'angle': 180}}
124
+ ```
125
+
126
+ If save_path is specified, the visualization results will be saved under `save_path`. The visualization output is shown below:
127
+
128
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/HM8xQKtyBHx-CNVGk2ZJd.png)
129
+
130
+ The command-line method is for a quick trial. For project integration, only a few lines of code are needed:
131
+
132
+ ```python
133
+ from paddleocr import DocPreprocessor
134
+
135
+ ocr = DocPreprocessor(
136
+ doc_orientation_classify_model_name="PP-LCNet_x1_0_doc_ori",
137
+ use_doc_orientation_classify=True, # Use use_doc_orientation_classify to enable/disable document orientation classification model
138
+ use_doc_unwarping=True, # Use use_doc_unwarping to enable/disable document unwarping module
139
+ device="gpu:0", # Use device to specify GPU for model inference
140
+ )
141
+ result = ocr.predict("https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/pY6sY6wLDuoHF1-cGUvDr.png")
142
+ for res in result:
143
+ res.print()
144
+ res.save_to_img("output")
145
+ res.save_to_json("output")
146
+ ```
147
+
148
+ ## Links
149
+
150
+ [PaddleOCR Repo](https://github.com/paddlepaddle/paddleocr)
151
+
152
+ [PaddleOCR Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/config.json ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Global": {
3
+ "model_name": "PP-LCNet_x1_0_doc_ori"
4
+ },
5
+ "Hpi": {
6
+ "backend_configs": {
7
+ "paddle_infer": {
8
+ "trt_dynamic_shapes": {
9
+ "x": [
10
+ [
11
+ 1,
12
+ 3,
13
+ 224,
14
+ 224
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 224,
20
+ 224
21
+ ],
22
+ [
23
+ 8,
24
+ 3,
25
+ 224,
26
+ 224
27
+ ]
28
+ ]
29
+ }
30
+ },
31
+ "tensorrt": {
32
+ "dynamic_shapes": {
33
+ "x": [
34
+ [
35
+ 1,
36
+ 3,
37
+ 224,
38
+ 224
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 224,
44
+ 224
45
+ ],
46
+ [
47
+ 8,
48
+ 3,
49
+ 224,
50
+ 224
51
+ ]
52
+ ]
53
+ }
54
+ }
55
+ }
56
+ },
57
+ "PreProcess": {
58
+ "transform_ops": [
59
+ {
60
+ "ResizeImage": {
61
+ "resize_short": 256
62
+ }
63
+ },
64
+ {
65
+ "CropImage": {
66
+ "size": 224
67
+ }
68
+ },
69
+ {
70
+ "NormalizeImage": {
71
+ "channel_num": 3,
72
+ "mean": [
73
+ 0.485,
74
+ 0.456,
75
+ 0.406
76
+ ],
77
+ "order": "",
78
+ "scale": 0.00392156862745098,
79
+ "std": [
80
+ 0.229,
81
+ 0.224,
82
+ 0.225
83
+ ]
84
+ }
85
+ },
86
+ {
87
+ "ToCHWImage": null
88
+ }
89
+ ]
90
+ },
91
+ "PostProcess": {
92
+ "Topk": {
93
+ "topk": 1,
94
+ "label_list": [
95
+ "0",
96
+ "90",
97
+ "180",
98
+ "270"
99
+ ]
100
+ }
101
+ }
102
+ }
models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/inference.json ADDED
The diff for this file is too large to render. See raw diff
 
models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/inference.pdiparams ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8d6e7c5d264507e40e58a655779059d616b20d7441ea22047d829eb3931989c
3
+ size 6754166
models/paddleocr/official_models/PP-LCNet_x1_0_doc_ori/inference.yml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ model_name: PP-LCNet_x1_0_doc_ori
3
+ Hpi:
4
+ backend_configs:
5
+ paddle_infer:
6
+ trt_dynamic_shapes: &id001
7
+ x:
8
+ - - 1
9
+ - 3
10
+ - 224
11
+ - 224
12
+ - - 1
13
+ - 3
14
+ - 224
15
+ - 224
16
+ - - 8
17
+ - 3
18
+ - 224
19
+ - 224
20
+ tensorrt:
21
+ dynamic_shapes: *id001
22
+ PreProcess:
23
+ transform_ops:
24
+ - ResizeImage:
25
+ resize_short: 256
26
+ - CropImage:
27
+ size: 224
28
+ - NormalizeImage:
29
+ channel_num: 3
30
+ mean:
31
+ - 0.485
32
+ - 0.456
33
+ - 0.406
34
+ order: ''
35
+ scale: 0.00392156862745098
36
+ std:
37
+ - 0.229
38
+ - 0.224
39
+ - 0.225
40
+ - ToCHWImage: null
41
+ PostProcess:
42
+ Topk:
43
+ topk: 1
44
+ label_list:
45
+ - '0'
46
+ - '90'
47
+ - '180'
48
+ - '270'
models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ inference.pdiparams filter=lfs diff=lfs merge=lfs -text
models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/README.md ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: PaddleOCR
4
+ language:
5
+ - en
6
+ - zh
7
+ pipeline_tag: image-to-text
8
+ tags:
9
+ - OCR
10
+ - PaddlePaddle
11
+ - PaddleOCR
12
+ - textline_orientation_classification
13
+ ---
14
+
15
+ # PP-LCNet_x1_0_textline_ori
16
+
17
+ ## Introduction
18
+
19
+ The text line orientation classification module primarily distinguishes the orientation of text lines and corrects them using post-processing. In processes such as document scanning and license/certificate photography, to capture clearer images, the capture device may be rotated, resulting in text lines in various orientations. Standard OCR pipelines cannot handle such data well. By utilizing image classification technology, the orientation of text lines can be predetermined and adjusted, thereby enhancing the accuracy of OCR processing. The key accuracy metrics are as follows:
20
+
21
+ <table>
22
+ <tr>
23
+ <th>Model</th>
24
+ <th>Recognition Avg Accuracy(%)</th>
25
+ <th>Model Storage Size (M)</th>
26
+ <th>Introduction</th>
27
+ </tr>
28
+ <tr>
29
+ <td>PP-LCNet_x1_0_textline_ori</td>
30
+ <td>98.85</td>
31
+ <td>0.96</td>
32
+ <td>Text line classification model based on PP-LCNet_x1_0, with two classes: 0 degrees and 180 degrees</td>
33
+ </tr>
34
+ </table>
35
+
36
+ ## Quick Start
37
+
38
+ ### Installation
39
+
40
+ 1. PaddlePaddle
41
+
42
+ Please refer to the following commands to install PaddlePaddle using pip:
43
+
44
+ ```bash
45
+ # for CUDA11.8
46
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
47
+
48
+ # for CUDA12.6
49
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
50
+
51
+ # for CPU
52
+ python -m pip install paddlepaddle==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
53
+ ```
54
+
55
+ For details about PaddlePaddle installation, please refer to the [PaddlePaddle official website](https://www.paddlepaddle.org.cn/en/install/quick).
56
+
57
+ 2. PaddleOCR
58
+
59
+ Install the latest version of the PaddleOCR inference package from PyPI:
60
+
61
+ ```bash
62
+ python -m pip install paddleocr
63
+ ```
64
+
65
+ ### Model Usage
66
+
67
+ You can quickly experience the functionality with a single command:
68
+
69
+ ```bash
70
+ paddleocr text_line_orientation_classification \
71
+ --model_name PP-LCNet_x1_0_textline_ori \
72
+ -i https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/m3ZmUPAnst1f9xXvTVLKS.png
73
+ ```
74
+
75
+ You can also integrate the model inference of the text line orientation classification module into your project. Before running the following code, please download the sample image to your local machine.
76
+
77
+ ```python
78
+ from paddleocr import TextLineOrientationClassification
79
+ model = TextLineOrientationClassification(model_name="PP-LCNet_x1_0_textline_ori")
80
+ output = model.predict(input="m3ZmUPAnst1f9xXvTVLKS.png", batch_size=1)
81
+ for res in output:
82
+ res.print()
83
+ res.save_to_img(save_path="./output/")
84
+ res.save_to_json(save_path="./output/res.json")
85
+ ```
86
+
87
+ After running, the obtained result is as follows:
88
+
89
+ ```json
90
+ {'res': {'input_path': '/root/.paddlex/predict_input/m3ZmUPAnst1f9xXvTVLKS.png', 'page_index': None, 'class_ids': array([1], dtype=int32), 'scores': array([0.99829], dtype=float32), 'label_names': ['180_degree']}}
91
+ ```
92
+
93
+ The visualized image is as follows:
94
+
95
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/0y5rEbMTzgsqP6Ptnj-Er.png)
96
+
97
+ For details about usage command and descriptions of parameters, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/module_usage/text_recognition.html#iii-quick-start).
98
+
99
+
100
+ ## Links
101
+
102
+ [PaddleOCR Repo](https://github.com/paddlepaddle/paddleocr)
103
+
104
+ [PaddleOCR Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/config.json ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Global": {
3
+ "model_name": "PP-LCNet_x1_0_textline_ori"
4
+ },
5
+ "Hpi": {
6
+ "backend_configs": {
7
+ "paddle_infer": {
8
+ "trt_dynamic_shapes": {
9
+ "x": [
10
+ [
11
+ 1,
12
+ 3,
13
+ 80,
14
+ 160
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 80,
20
+ 160
21
+ ],
22
+ [
23
+ 8,
24
+ 3,
25
+ 80,
26
+ 160
27
+ ]
28
+ ]
29
+ }
30
+ },
31
+ "tensorrt": {
32
+ "dynamic_shapes": {
33
+ "x": [
34
+ [
35
+ 1,
36
+ 3,
37
+ 80,
38
+ 160
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 80,
44
+ 160
45
+ ],
46
+ [
47
+ 8,
48
+ 3,
49
+ 80,
50
+ 160
51
+ ]
52
+ ]
53
+ }
54
+ }
55
+ }
56
+ },
57
+ "PreProcess": {
58
+ "transform_ops": [
59
+ {
60
+ "ResizeImage": {
61
+ "size": [
62
+ 160,
63
+ 80
64
+ ]
65
+ }
66
+ },
67
+ {
68
+ "NormalizeImage": {
69
+ "channel_num": 3,
70
+ "mean": [
71
+ 0.485,
72
+ 0.456,
73
+ 0.406
74
+ ],
75
+ "order": "",
76
+ "scale": 0.00392156862745098,
77
+ "std": [
78
+ 0.229,
79
+ 0.224,
80
+ 0.225
81
+ ]
82
+ }
83
+ },
84
+ {
85
+ "ToCHWImage": null
86
+ }
87
+ ]
88
+ },
89
+ "PostProcess": {
90
+ "Topk": {
91
+ "topk": 1,
92
+ "label_list": [
93
+ "0_degree",
94
+ "180_degree"
95
+ ]
96
+ }
97
+ }
98
+ }
models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/inference.json ADDED
The diff for this file is too large to render. See raw diff
 
models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/inference.pdiparams ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0de2bcf996cf553e2b848dd7b1769dafffc6917b1ccdf55c1d8efe7909fbf743
3
+ size 6743918
models/paddleocr/official_models/PP-LCNet_x1_0_textline_ori/inference.yml ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ model_name: PP-LCNet_x1_0_textline_ori
3
+ Hpi:
4
+ backend_configs:
5
+ paddle_infer:
6
+ trt_dynamic_shapes: &id001
7
+ x:
8
+ - - 1
9
+ - 3
10
+ - 80
11
+ - 160
12
+ - - 1
13
+ - 3
14
+ - 80
15
+ - 160
16
+ - - 8
17
+ - 3
18
+ - 80
19
+ - 160
20
+ tensorrt:
21
+ dynamic_shapes: *id001
22
+ PreProcess:
23
+ transform_ops:
24
+ - ResizeImage:
25
+ size:
26
+ - 160
27
+ - 80
28
+ - NormalizeImage:
29
+ channel_num: 3
30
+ mean:
31
+ - 0.485
32
+ - 0.456
33
+ - 0.406
34
+ order: ''
35
+ scale: 0.00392156862745098
36
+ std:
37
+ - 0.229
38
+ - 0.224
39
+ - 0.225
40
+ - ToCHWImage: null
41
+ PostProcess:
42
+ Topk:
43
+ topk: 1
44
+ label_list:
45
+ - 0_degree
46
+ - 180_degree
models/paddleocr/official_models/PP-OCRv5_mobile_det/.msc ADDED
Binary file (366 Bytes). View file
 
models/paddleocr/official_models/PP-OCRv5_mobile_det/.mv ADDED
@@ -0,0 +1 @@
 
 
1
+ Revision:master,CreatedAt:1751518563
models/paddleocr/official_models/PP-OCRv5_mobile_det/README.md ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: PaddleOCR
4
+ language:
5
+ - en
6
+ - zh
7
+ pipeline_tag: image-to-text
8
+ tags:
9
+ - OCR
10
+ - PaddlePaddle
11
+ - PaddleOCR
12
+ ---
13
+
14
+ # PP-OCRv5_mobile_det
15
+
16
+ ## Introduction
17
+
18
+ PP-OCRv5_mobile_det is one of the PP-OCRv5_det series, the latest generation of text detection models developed by the PaddleOCR team. It aims to efficiently and accurately support the detection of text in diverse scenarios—including handwriting, vertical, rotated, and curved text—across multiple languages such as Simplified Chinese, Traditional Chinese, English, and Japanese. Key features include robust handling of complex layouts, varying text sizes, and challenging backgrounds, making it suitable for practical applications like document analysis, license plate recognition, and scene text detection. The key accuracy metrics are as follows:
19
+
20
+ | Handwritten Chinese | Handwritten English | Printed Chinese | Printed English | Traditional Chinese | Ancient Text | Japanese | General Scenario | Pinyin | Rotation | Distortion | Artistic Text | Average |
21
+ | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
22
+ | 0.744 | 0.777 | 0.905 | 0.910 | 0.823 | 0.581 | 0.727 | 0.721 | 0.575 | 0.647 | 0.827 | 0.525 | 0.770 |
23
+
24
+ ## Quick Start
25
+
26
+ ### Installation
27
+
28
+ 1. PaddlePaddle
29
+
30
+ Please refer to the following commands to install PaddlePaddle using pip:
31
+
32
+ ```bash
33
+ # for CUDA11.8
34
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
35
+
36
+ # for CUDA12.6
37
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
38
+
39
+ # for CPU
40
+ python -m pip install paddlepaddle==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
41
+ ```
42
+
43
+ For details about PaddlePaddle installation, please refer to the [PaddlePaddle official website](https://www.paddlepaddle.org.cn/en/install/quick).
44
+
45
+ 2. PaddleOCR
46
+
47
+ Install the latest version of the PaddleOCR inference package from PyPI:
48
+
49
+ ```bash
50
+ python -m pip install paddleocr
51
+ ```
52
+
53
+ ### Model Usage
54
+
55
+ You can quickly experience the functionality with a single command:
56
+
57
+ ```bash
58
+ paddleocr text_detection \
59
+ --model_name PP-OCRv5_mobile_det \
60
+ -i https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/3ul2Rq4Sk5Cn-l69D695U.png
61
+ ```
62
+
63
+ You can also integrate the model inference of the text detection module into your project. Before running the following code, please download the sample image to your local machine.
64
+
65
+ ```python
66
+ from paddleocr import TextDetection
67
+ model = TextDetection(model_name="PP-OCRv5_mobile_det")
68
+ output = model.predict(input="3ul2Rq4Sk5Cn-l69D695U.png", batch_size=1)
69
+ for res in output:
70
+ res.print()
71
+ res.save_to_img(save_path="./output/")
72
+ res.save_to_json(save_path="./output/res.json")
73
+ ```
74
+
75
+ After running, the obtained result is as follows:
76
+
77
+ ```json
78
+ {'res': {'input_path': '/root/.paddlex/predict_input/3ul2Rq4Sk5Cn-l69D695U.png', 'page_index': None, 'dt_polys': array([[[ 105, 1431],
79
+ ...,
80
+ [ 105, 1452]],
81
+
82
+ ...,
83
+
84
+ [[ 353, 106],
85
+ ...,
86
+ [ 353, 129]]], dtype=int16), 'dt_scores': [0.8306416015066644, 0.7603795581201811, ..., 0.8819806867477359]}}
87
+ ```
88
+
89
+ The visualized image is as follows:
90
+
91
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/x7iTnr_hOnfTdyblW0qcb.jpeg)
92
+
93
+ For details about usage command and descriptions of parameters, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/module_usage/text_detection.html#iii-quick-start).
94
+
95
+ ### Pipeline Usage
96
+
97
+ The ability of a single model is limited, but a pipeline consisting of several models can provide more capacity to solve difficult problems in real-world scenarios.
98
+
99
+ #### PP-OCRv5
100
+
101
+ The general OCR pipeline is used to solve text recognition tasks by extracting text information from images and outputting it in string format. And there are 5 modules in the pipeline:
102
+ * Document Image Orientation Classification Module (Optional)
103
+ * Text Image Unwarping Module (Optional)
104
+ * Text Line Orientation Classification Module (Optional)
105
+ * Text Detection Module
106
+ * Text Recognition Module
107
+
108
+ Run a single command to quickly experience the OCR pipeline:
109
+
110
+ ```bash
111
+ paddleocr ocr -i https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/3ul2Rq4Sk5Cn-l69D695U.png \
112
+ --text_detection_model_name PP-OCRv5_mobile_det \
113
+ --use_doc_orientation_classify False \
114
+ --use_doc_unwarping False \
115
+ --use_textline_orientation True \
116
+ --save_path ./output \
117
+ --device gpu:0
118
+ ```
119
+
120
+ Results are printed to the terminal:
121
+
122
+ ```json
123
+ {'res': {'input_path': 'printing_en/3ul2Rq4Sk5Cn-l69D695U.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': True}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[ 352, 105],
124
+ ...,
125
+ [ 352, 128]],
126
+
127
+ ...,
128
+
129
+ [[ 632, 1431],
130
+ ...,
131
+ [ 632, 1447]]], dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([0, ..., 0]), 'text_rec_score_thresh': 0.0, 'rec_texts': ['Algorithms for the Markov Entropy Decomposition', 'Andrew J. Ferris and David Poulin', 'Département de Physique, Université de Sherbrooke, Québec, JI K 2R1, Canada', '(Dated: October 31, 2018)', 'The Markov entropy decomposition (MED) is a recently-proposed, cluster-based simulation method for fi -', 'nite temperature quantum systems with arbitrary geometry. In this paper, we detail numerical algorithms for', 'performing the required steps of the MED, principally solving a minimization problem with a preconditioned', 'arXiv:1212.1442v1 [cond-mat.stat-mech] 6 Dec 2012', "Newton's algorithm, as well as how to extract global susceptibilities and thermal responses. We demonstrate", 'the power of the method with the spin-1/2 XXZ model on the 2D square lattice, including the extraction of', 'critical points and details of each phase. Although the method shares some qualitative similarities with exact-', 'diagonalization, we show the MED is both more accurate and significantly more flexible.', 'PACS numbers: 05.10.—a, 02.50.Ng, 03.67.–a, 74.40.Kb', 'I. INTRODUCTION', 'This approximation becomes exact in the case of a 1D quan-', 'tum (or classical) Markov chain [1O], and leads to an expo-', 'Although the equations governing quantum many-body', 'nential reduction of cost for exact entropy calculations when', 'systems are simple to write down, finding solutions for the', 'the global density matrix is a higher-dimensional Markov net-', 'majority of systems remains incredibly difficult. 
Modern', 'work state [12, 13].', 'physics finds itself in need of new tools to compute the emer-', 'The second approximation used in the MED approach is', 'gent behavior of large, many-body systems.', 'related to the N-representibility problem. Given a set of lo-', 'There has been a great variety of tools developed to tackle', 'cal but overlapping reduced density matrices { ρi }, it is a very', 'many-body problems, but in general, large 2D and 3D quan-', 'challenging problem to determine if there exists a global den.', 'tum systems remain hard to deal with. Most systems are', 'sity operator which is positive semi-definite and whose partial', 'thought to be non-integrable, so exact analytic solutions are', 'trace agrees with each ρi. This problem is QMA-hard (the', 'not usually expected. Direct numerical diagonalization can be', 'quantum analogue of NP) [14, 15], and is hopelessly diffi-', 'performed for relatively small systems — however the emer-', 'cult to enforce. Thus, the second approximation employed', 'gent behavior of a system in the thermodynamic limit may be', 'involves ignoring global consistency with a positive opera-', 'difficult to extract, especially in systems with large correlation', 'tor, while requiring local consistency on any overlapping re-', 'lengths. Monte Carlo approaches are technically exact (up to', 'gions between the ρi. At the zero-temperature limit, the MED', 'sampling error), but suffer from the so-called sign problem', 'approach becomes analogous to the variational nth-order re-', 'for fermionic, frustrated, or dynamical problems. 
Thus we are', 'duced density matrix approach, where positivity is enforced', 'limited to search for clever approximations to solve the ma-', 'on all reduced density matrices of size n [16–18].', 'jority of many-body problems.', 'The MED approach is an extremely flexible cluster method.', 'Over the past century, hundreds of such approximations', 'applicable to both translationally invariant systems of any di-', 'have been proposed, and we will mention just a few notable', 'mension in the thermodynamic limit, as well as finite systems', 'examples applicable to quantum lattice models. Mean-field', 'or systems without translational invariance (e.g. disordered', 'theory is simple and frequently arrives at the correct quali-', 'lattices, or harmonically trapped atoms in optical lattices).', 'tative description, but often fails when correlations are im-', 'The free energy given by MED is guaranteed to lower bound', 'portant. Density-matrix renormalisation group (DMRG) [1]', 'the true free energy, which in turn lower-bounds the ground', 'is efficient and extremely accurate at solving 1D problems,', 'state energy — thus providing a natural complement to varia-', 'but the computational cost grows exponentially with system', 'tional approaches which upper-bound the ground state energy.', 'size in two- or higher-dimensions [2, 3]. Related tensor-', 'The ability to provide a rigorous ground-state energy window', 'network techniques designed for 2D systems are still in their', 'is a powerful validation tool, creating a very compelling rea-', 'infancy [4–6]. Series-expansion methods [7] can be success-', 'son to use this approach.', 'ful, but may diverge or otherwise converge slowly, obscuring', 'In this paper we paper we present a pedagogical introduc-', 'the state in certain regimes. 
There exist a variety of cluster-', 'tion to MED, including numerical implementation issues and', 'based techniques, such as dynamical-mean-field theory [8]', 'applications to 2D quantum lattice models in the thermody-', 'and density-matrix embedding [9]', 'namic limit. In Sec. II. we giye a brief deriyation of the', 'Here we discuss the so-called Markov entropy decompo-', 'Markov entropy decomposition. Section III outlines a robust', 'sition (MED), recently proposed by Poulin & Hastings [10]', 'numerical strategy for optimizing the clusters that make up', '(and analogous to a slightly earlier classical algorithm [11]).', 'the decomposition. In Sec. IV we show how we can extend', 'This is a self-consistent cluster method for fi nite temperature', 'these algorithms to extract non-trivial information, such as', 'systems that takes advantage of an approximation of the (von', 'specific heat and susceptibilities. We present an application of', 'Neumann) entropy. In [10], it was shown that the entropy', 'the method to the spin-1/2 XXZ model on a 2D square lattice', 'per site can be rigorously upper bounded using only local in-', 'in Sec. V, describing how to characterize the phase diagram', 'formation — a local, reduced density matrix on N sites, say.', 'and determine critical points, before concluding in Sec. VI.'], 'rec_scores': array([0.99388635, ..., 0.99304372]), 'rec_polys': array([[[ 352, 105],
132
+ ...,
133
+ [ 352, 128]],
134
+
135
+ ...,
136
+
137
+ [[ 632, 1431],
138
+ ...,
139
+ [ 632, 1447]]], dtype=int16), 'rec_boxes': array([[ 352, ..., 128],
140
+ ...,
141
+ [ 632, ..., 1447]], dtype=int16)}}
142
+ ```
143
+
144
+ If save_path is specified, the visualization results will be saved under `save_path`. The visualization output is shown below:
145
+
146
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/4lLYO_jQJwz3qWuv7CAyf.png)
147
+
148
+ The command-line method is for a quick trial. For project integration, only a few lines of code are needed:
149
+
150
+ ```python
151
+ from paddleocr import PaddleOCR
152
+
153
+ ocr = PaddleOCR(
154
+ text_detection_model_name="PP-OCRv5_mobile_det",
155
+ use_doc_orientation_classify=False, # Use use_doc_orientation_classify to enable/disable document orientation classification model
156
+ use_doc_unwarping=False, # Use use_doc_unwarping to enable/disable document unwarping module
157
+ use_textline_orientation=True, # Use use_textline_orientation to enable/disable textline orientation classification model
158
+ device="gpu:0", # Use device to specify GPU for model inference
159
+ )
160
+ result = ocr.predict("https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/3ul2Rq4Sk5Cn-l69D695U.png")
161
+ for res in result:
162
+ res.print()
163
+ res.save_to_img("output")
164
+ res.save_to_json("output")
165
+ ```
166
+
167
+ The default text detection model used in the pipeline is `PP-OCRv5_server_det`, so you need to specify `PP-OCRv5_mobile_det` explicitly via the `text_detection_model_name` argument. You can also use a local model file via the `text_detection_model_dir` argument. For details about the usage commands and parameter descriptions, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/OCR.html#2-quick-start).
168
+
169
+ #### PP-StructureV3
170
+
171
+ Layout analysis is a technique used to extract structured information from document images. PP-StructureV3 includes the following six modules:
172
+ * Layout Detection Module
173
+ * General OCR Pipeline
174
+ * Document Image Preprocessing Pipeline (Optional)
175
+ * Table Recognition Pipeline (Optional)
176
+ * Seal Recognition Pipeline (Optional)
177
+ * Formula Recognition Pipeline (Optional)
178
+
179
+ Run a single command to quickly experience the PP-StructureV3 pipeline:
180
+
181
+ ```bash
182
+ paddleocr pp_structurev3 -i https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/mG4tnwfrvECoFMu-S9mxo.png \
183
+ --text_detection_model_name PP-OCRv5_mobile_det \
184
+ --use_doc_orientation_classify False \
185
+ --use_doc_unwarping False \
186
+ --use_textline_orientation False \
187
+ --device gpu:0
188
+ ```
189
+
190
+ Results are printed to the terminal. If `save_path` is specified, the results will be saved under `save_path`. The predicted markdown visualization is shown below:
191
+
192
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/SfxF0X4drBTNGnfFOtZij.png)
193
+
194
+ You can run inference with the pipeline in just a few lines of code. Taking the PP-StructureV3 pipeline as an example:
195
+
196
+ ```python
197
+ from paddleocr import PPStructureV3
198
+
199
+ pipeline = PPStructureV3(
200
+ text_detection_model_name="PP-OCRv5_mobile_det",
201
+ use_doc_orientation_classify=False, # Use use_doc_orientation_classify to enable/disable document orientation classification model
202
+ use_doc_unwarping=False, # Use use_doc_unwarping to enable/disable document unwarping module
203
+ use_textline_orientation=False, # Use use_textline_orientation to enable/disable textline orientation classification model
204
+ device="gpu:0", # Use device to specify GPU for model inference
205
+ )
206
+ output = pipeline.predict("./pp_structure_v3_demo.png")
207
+ for res in output:
208
+ res.print() # Print the structured prediction output
209
+ res.save_to_json(save_path="output") ## Save the current image's structured result in JSON format
210
+ res.save_to_markdown(save_path="output") ## Save the current image's result in Markdown format
211
+ ```
212
+
213
+ The default text detection model used in the pipeline is `PP-OCRv5_server_det`, so you need to specify `PP-OCRv5_mobile_det` explicitly via the `text_detection_model_name` argument. You can also use a local model file via the `text_detection_model_dir` argument. For details about the usage commands and parameter descriptions, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-StructureV3.html#2-quick-start).
214
+
215
+ ## Links
216
+
217
+ [PaddleOCR Repo](https://github.com/paddlepaddle/paddleocr)
218
+
219
+ [PaddleOCR Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
models/paddleocr/official_models/PP-OCRv5_mobile_det/config.json ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Global": {
3
+ "model_name": "PP-OCRv5_mobile_det"
4
+ },
5
+ "Hpi": {
6
+ "backend_configs": {
7
+ "paddle_infer": {
8
+ "trt_dynamic_shapes": {
9
+ "x": [
10
+ [
11
+ 1,
12
+ 3,
13
+ 32,
14
+ 32
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 736,
20
+ 736
21
+ ],
22
+ [
23
+ 1,
24
+ 3,
25
+ 4000,
26
+ 4000
27
+ ]
28
+ ]
29
+ }
30
+ },
31
+ "tensorrt": {
32
+ "dynamic_shapes": {
33
+ "x": [
34
+ [
35
+ 1,
36
+ 3,
37
+ 32,
38
+ 32
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 736,
44
+ 736
45
+ ],
46
+ [
47
+ 1,
48
+ 3,
49
+ 4000,
50
+ 4000
51
+ ]
52
+ ]
53
+ }
54
+ }
55
+ }
56
+ },
57
+ "PreProcess": {
58
+ "transform_ops": [
59
+ {
60
+ "DecodeImage": {
61
+ "channel_first": false,
62
+ "img_mode": "BGR"
63
+ }
64
+ },
65
+ {
66
+ "DetLabelEncode": null
67
+ },
68
+ {
69
+ "DetResizeForTest": {
70
+ "resize_long": 960
71
+ }
72
+ },
73
+ {
74
+ "NormalizeImage": {
75
+ "mean": [
76
+ 0.485,
77
+ 0.456,
78
+ 0.406
79
+ ],
80
+ "order": "hwc",
81
+ "scale": "1./255.",
82
+ "std": [
83
+ 0.229,
84
+ 0.224,
85
+ 0.225
86
+ ]
87
+ }
88
+ },
89
+ {
90
+ "ToCHWImage": null
91
+ },
92
+ {
93
+ "KeepKeys": {
94
+ "keep_keys": [
95
+ "image",
96
+ "shape",
97
+ "polys",
98
+ "ignore_tags"
99
+ ]
100
+ }
101
+ }
102
+ ]
103
+ },
104
+ "PostProcess": {
105
+ "name": "DBPostProcess",
106
+ "thresh": 0.3,
107
+ "box_thresh": 0.6,
108
+ "max_candidates": 1000,
109
+ "unclip_ratio": 1.5
110
+ }
111
+ }
models/paddleocr/official_models/PP-OCRv5_mobile_det/inference.json ADDED
The diff for this file is too large to render. See raw diff
 
models/paddleocr/official_models/PP-OCRv5_mobile_det/inference.pdiparams ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:afa1820cb16c1fd0dad589d0f8b389139061c1ef6d68019685fd07be997dda5b
3
+ size 4692937
models/paddleocr/official_models/PP-OCRv5_mobile_det/inference.yml ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ model_name: PP-OCRv5_mobile_det
3
+ Hpi:
4
+ backend_configs:
5
+ paddle_infer:
6
+ trt_dynamic_shapes: &id001
7
+ x:
8
+ - - 1
9
+ - 3
10
+ - 32
11
+ - 32
12
+ - - 1
13
+ - 3
14
+ - 736
15
+ - 736
16
+ - - 1
17
+ - 3
18
+ - 4000
19
+ - 4000
20
+ tensorrt:
21
+ dynamic_shapes: *id001
22
+ PreProcess:
23
+ transform_ops:
24
+ - DecodeImage:
25
+ channel_first: false
26
+ img_mode: BGR
27
+ - DetLabelEncode: null
28
+ - DetResizeForTest:
29
+ resize_long: 960
30
+ - NormalizeImage:
31
+ mean:
32
+ - 0.485
33
+ - 0.456
34
+ - 0.406
35
+ order: hwc
36
+ scale: 1./255.
37
+ std:
38
+ - 0.229
39
+ - 0.224
40
+ - 0.225
41
+ - ToCHWImage: null
42
+ - KeepKeys:
43
+ keep_keys:
44
+ - image
45
+ - shape
46
+ - polys
47
+ - ignore_tags
48
+ PostProcess:
49
+ name: DBPostProcess
50
+ thresh: 0.3
51
+ box_thresh: 0.6
52
+ max_candidates: 1000
53
+ unclip_ratio: 1.5
models/paddleocr/official_models/UVDoc/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ inference.pdiparams filter=lfs diff=lfs merge=lfs -text
models/paddleocr/official_models/UVDoc/README.md ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: PaddleOCR
4
+ language:
5
+ - en
6
+ - zh
7
+ pipeline_tag: image-to-text
8
+ tags:
9
+ - OCR
10
+ - PaddlePaddle
11
+ - PaddleOCR
12
+ - doc_img_unwarping
13
+ ---
14
+
15
+ # UVDoc
16
+
17
+ ## Introduction
18
+
19
+ The main purpose of text image correction is to carry out geometric transformation on the image to correct the document distortion, inclination, perspective deformation and other problems in the image, so that the subsequent text recognition can be more accurate.
20
+
21
+ | Model| CER |
22
+ | --- | --- |
23
+ |UVDoc | 0.179 |
24
+
25
+ **Note**: Test data set: docunet benchmark data set.
26
+
27
+ ## Quick Start
28
+
29
+ ### Installation
30
+
31
+ 1. PaddlePaddle
32
+
33
+ Please refer to the following commands to install PaddlePaddle using pip:
34
+
35
+ ```bash
36
+ # for CUDA11.8
37
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
38
+
39
+ # for CUDA12.6
40
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
41
+
42
+ # for CPU
43
+ python -m pip install paddlepaddle==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
44
+ ```
45
+
46
+ For details about PaddlePaddle installation, please refer to the [PaddlePaddle official website](https://www.paddlepaddle.org.cn/en/install/quick).
47
+
48
+ 2. PaddleOCR
49
+
50
+ Install the latest version of the PaddleOCR inference package from PyPI:
51
+
52
+ ```bash
53
+ python -m pip install paddleocr
54
+ ```
55
+
56
+ ### Model Usage
57
+
58
+ You can quickly experience the functionality with a single command:
59
+
60
+ ```bash
61
+ paddleocr text_image_unwarping --model_name UVDoc -i https://cdn-uploads.huggingface.co/production/uploads/63d7b8ee07cd1aa3c49a2026/SfMVKd0xnMII5KBDV6Mfz.jpeg
62
+ ```
63
+
64
+ You can also integrate the model inference of the TextImageUnwarping module into your project. Before running the following code, please download the sample image to your local machine.
65
+
66
+ ```python
67
+ from paddleocr import TextImageUnwarping
68
+
69
+ model = TextImageUnwarping(model_name="UVDoc")
70
+ output = model.predict("SfMVKd0xnMII5KBDV6Mfz.jpeg", batch_size=1)
71
+ for res in output:
72
+ res.print()
73
+ res.save_to_img(save_path="./output/")
74
+ res.save_to_json(save_path="./output/res.json")
75
+ ```
76
+
77
+ After running, the obtained result is as follows:
78
+
79
+ ```json
80
+ {'res': {'input_path': 'doc_test.jpg', 'page_index': None, 'doctr_img': '...'}}
81
+ ```
82
+
83
+ The visualized image is as follows:
84
+
85
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/63d7b8ee07cd1aa3c49a2026/1405yNIYq_hA9VL3_8Itn.jpeg)
86
+
87
+ For details about usage command and descriptions of parameters, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/module_usage/text_image_unwarping.html#iii-quick-integration).
88
+
89
+
90
+ ### Pipeline Usage
91
+
92
+ The ability of a single model is limited, but a pipeline consisting of several models can provide more capacity to resolve difficult problems in real-world scenarios.
93
+
94
+
95
+ #### PP-StructureV3
96
+
97
+ Layout analysis is a technique used to extract structured information from document images. PP-StructureV3 includes the following six modules:
98
+ * Layout Detection Module
99
+ * General OCR Sub-pipeline
100
+ * Document Image Preprocessing Sub-pipeline (Optional)
101
+ * Table Recognition Sub-pipeline (Optional)
102
+ * Seal Recognition Sub-pipeline (Optional)
103
+ * Formula Recognition Sub-pipeline (Optional)
104
+
105
+ You can quickly experience the PP-StructureV3 pipeline with a single command.
106
+
107
+ ```bash
108
+ paddleocr pp_structurev3 --use_doc_unwarping True -i https://cdn-uploads.huggingface.co/production/uploads/63d7b8ee07cd1aa3c49a2026/KP10tiSZfAjMuwZUSLtRp.png
109
+ ```
110
+
111
+ You can experience the inference of the pipeline with just a few lines of code. Taking the PP-StructureV3 pipeline as an example:
112
+
113
+ ```python
114
+ from paddleocr import PPStructureV3
115
+
116
+ pipeline = PPStructureV3(use_doc_unwarping=True) # Use use_doc_unwarping to enable/disable document unwarping module
117
+ output = pipeline.predict("./KP10tiSZfAjMuwZUSLtRp.png")
118
+ for res in output:
119
+ res.print() ## Print the structured prediction output
120
+ res.save_to_json(save_path="output") ## Save the current image's structured result in JSON format
121
+ res.save_to_markdown(save_path="output") ## Save the current image's result in Markdown format
122
+ ```
123
+
124
+ For details about usage command and descriptions of parameters, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/PP-StructureV3.html#2-quick-start).
125
+
126
+ ## Links
127
+
128
+ [PaddleOCR Repo](https://github.com/paddlepaddle/paddleocr)
129
+
130
+ [PaddleOCR Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
131
+
models/paddleocr/official_models/UVDoc/config.json ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Global": {
3
+ "model_name": "UVDoc"
4
+ },
5
+ "Hpi": {
6
+ "backend_configs": {
7
+ "paddle_infer": {
8
+ "trt_dynamic_shapes": {
9
+ "img": [
10
+ [
11
+ 1,
12
+ 3,
13
+ 128,
14
+ 64
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 256,
20
+ 128
21
+ ],
22
+ [
23
+ 8,
24
+ 3,
25
+ 512,
26
+ 256
27
+ ]
28
+ ]
29
+ }
30
+ },
31
+ "tensorrt": {
32
+ "dynamic_shapes": {
33
+ "img": [
34
+ [
35
+ 1,
36
+ 3,
37
+ 128,
38
+ 64
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 256,
44
+ 128
45
+ ],
46
+ [
47
+ 8,
48
+ 3,
49
+ 512,
50
+ 256
51
+ ]
52
+ ]
53
+ }
54
+ }
55
+ }
56
+ }
57
+ }
models/paddleocr/official_models/UVDoc/inference.json ADDED
The diff for this file is too large to render. See raw diff
 
models/paddleocr/official_models/UVDoc/inference.pdiparams ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:810488899520e0da843b9bd9769ba4949f1c81e357f0eceb12d4a7da459c3eca
3
+ size 32054311
models/paddleocr/official_models/UVDoc/inference.yml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ model_name: UVDoc
3
+ Hpi:
4
+ backend_configs:
5
+ paddle_infer:
6
+ trt_dynamic_shapes:
7
+ img:
8
+ - [1, 3, 128, 64]
9
+ - [1, 3, 256, 128]
10
+ - [8, 3, 512, 256]
11
+ tensorrt:
12
+ dynamic_shapes:
13
+ img:
14
+ - [1, 3, 128, 64]
15
+ - [1, 3, 256, 128]
16
+ - [8, 3, 512, 256]
models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/.gitattributes ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ inference.pdiparams filter=lfs diff=lfs merge=lfs -text
models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/README.md ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ library_name: PaddleOCR
4
+ language:
5
+ - en
6
+ pipeline_tag: image-to-text
7
+ tags:
8
+ - OCR
9
+ - PaddlePaddle
10
+ - PaddleOCR
11
+ - textline_recognition
12
+ ---
13
+
14
+ # en_PP-OCRv5_mobile_rec
15
+
16
+ ## Introduction
17
+
18
+ en_PP-OCRv5_mobile_rec is one of the PP-OCRv5_rec models, the latest generation of text line recognition models developed by the PaddleOCR team. It aims to efficiently and accurately support the recognition of English. The key accuracy metrics are as follows:
19
+
20
+ | Model | Accuracy (%) |
21
+ |-|-|
22
+ | en_PP-OCRv5_mobile_rec | 85.3|
23
+
24
+
25
+
26
+ **Note**: If any character (including punctuation) in a line was incorrect, the entire line was marked as wrong. This ensures higher accuracy in practical applications.
27
+
28
+ ## Quick Start
29
+
30
+ ### Installation
31
+
32
+ 1. PaddlePaddle
33
+
34
+ Please refer to the following commands to install PaddlePaddle using pip:
35
+
36
+ ```bash
37
+ # for CUDA11.8
38
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu118/
39
+
40
+ # for CUDA12.6
41
+ python -m pip install paddlepaddle-gpu==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cu126/
42
+
43
+ # for CPU
44
+ python -m pip install paddlepaddle==3.0.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
45
+ ```
46
+
47
+ For details about PaddlePaddle installation, please refer to the [PaddlePaddle official website](https://www.paddlepaddle.org.cn/en/install/quick).
48
+
49
+ 2. PaddleOCR
50
+
51
+ Install the latest version of the PaddleOCR inference package from PyPI:
52
+
53
+ ```bash
54
+ python -m pip install paddleocr
55
+ ```
56
+
57
+ ### Model Usage
58
+
59
+ You can quickly experience the functionality with a single command:
60
+
61
+ ```bash
62
+ paddleocr text_recognition \
63
+ --model_name en_PP-OCRv5_mobile_rec \
64
+ -i https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/QmaPtftqwOgCtx0AIvU2z.png
65
+ ```
66
+
67
+ You can also integrate the model inference of the text recognition module into your project. Before running the following code, please download the sample image to your local machine.
68
+
69
+ ```python
70
+ from paddleocr import TextRecognition
71
+ model = TextRecognition(model_name="en_PP-OCRv5_mobile_rec")
72
+ output = model.predict(input="QmaPtftqwOgCtx0AIvU2z.png", batch_size=1)
73
+ for res in output:
74
+ res.print()
75
+ res.save_to_img(save_path="./output/")
76
+ res.save_to_json(save_path="./output/res.json")
77
+ ```
78
+
79
+ After running, the obtained result is as follows:
80
+
81
+ ```json
82
+ {'res': {'input_path': '/root/.paddlex/predict_input/QmaPtftqwOgCtx0AIvU2z.png', 'page_index': None, 'rec_text': 'the number of model parameters and FLOPs get larger, it', 'rec_score': 0.993655264377594}}
83
+ ```
84
+
85
+ The visualized image is as follows:
86
+
87
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/Xe-blNpCl-X-U1o3L4Rav.png)
88
+
89
+ For details about usage command and descriptions of parameters, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/module_usage/text_recognition.html#iii-quick-start).
90
+
91
+ ### Pipeline Usage
92
+
93
+ The ability of a single model is limited, but a pipeline consisting of several models can provide more capacity to resolve difficult problems in real-world scenarios.
94
+
95
+ #### PP-OCRv5
96
+
97
+ The general OCR pipeline is used to solve text recognition tasks by extracting text information from images and outputting it in string format. And there are 5 modules in the pipeline:
98
+ * Document Image Orientation Classification Module (Optional)
99
+ * Text Image Unwarping Module (Optional)
100
+ * Text Line Orientation Classification Module (Optional)
101
+ * Text Detection Module
102
+ * Text Recognition Module
103
+
104
+ Run a single command to quickly experience the OCR pipeline:
105
+
106
+ ```bash
107
+ paddleocr ocr -i https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/c3hSldnYVQXp48T5V0Ze4.png \
108
+ --text_recognition_model_name en_PP-OCRv5_mobile_rec \
109
+ --use_doc_orientation_classify False \
110
+ --use_doc_unwarping False \
111
+ --use_textline_orientation True \
112
+ --save_path ./output \
113
+ --device gpu:0
114
+ ```
115
+
116
+ Results are printed to the terminal:
117
+
118
+ ```json
119
+ {'res': {'input_path': '/root/.paddlex/predict_input/c3hSldnYVQXp48T5V0Ze4.png', 'page_index': None, 'model_settings': {'use_doc_preprocessor': True, 'use_textline_orientation': False}, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'model_settings': {'use_doc_orientation_classify': False, 'use_doc_unwarping': False}, 'angle': -1}, 'dt_polys': array([[[252, 172],
120
+ ...,
121
+ [254, 241]],
122
+
123
+ ...,
124
+
125
+ [[665, 566],
126
+ ...,
127
+ [663, 601]]], dtype=int16), 'text_det_params': {'limit_side_len': 64, 'limit_type': 'min', 'thresh': 0.3, 'max_side_limit': 4000, 'box_thresh': 0.6, 'unclip_ratio': 1.5}, 'text_type': 'general', 'textline_orientation_angles': array([-1, ..., -1]), 'text_rec_score_thresh': 0.0, 'return_word_box': False, 'rec_texts': ['The moon tells the sky', 'The sky tells the sea', 'The sea tells the tide', 'And the tide tells me', 'Lemn Sissay'], 'rec_scores': array([0.98405874, ..., 0.9837752 ]), 'rec_polys': array([[[252, 172],
128
+ ...,
129
+ [254, 241]],
130
+
131
+ ...,
132
+
133
+ [[665, 566],
134
+ ...,
135
+ [663, 601]]], dtype=int16), 'rec_boxes': array([[252, ..., 241],
136
+ ...,
137
+ [663, ..., 612]], dtype=int16)}}
138
+ ```
139
+
140
+ If save_path is specified, the visualization results will be saved under `save_path`. The visualization output is shown below:
141
+
142
+ ![image/jpeg](https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/DcAem61DifjkUQK9f-0iZ.png)
143
+
144
+ The command-line method is for quick experience. For project integration, only a few lines of code are needed as well:
145
+
146
+ ```python
147
+ from paddleocr import PaddleOCR
148
+
149
+ ocr = PaddleOCR(
150
+ text_recognition_model_name="en_PP-OCRv5_mobile_rec",
151
+ use_doc_orientation_classify=False, # Use use_doc_orientation_classify to enable/disable document orientation classification model
152
+ use_doc_unwarping=False, # Use use_doc_unwarping to enable/disable document unwarping module
153
+ use_textline_orientation=True, # Use use_textline_orientation to enable/disable textline orientation classification model
154
+ device="gpu:0", # Use device to specify GPU for model inference
155
+ )
156
+ result = ocr.predict("https://cdn-uploads.huggingface.co/production/uploads/681c1ecd9539bdde5ae1733c/6KQKOS42DKVEUnrticvhd.png")
157
+ for res in result:
158
+ res.print()
159
+ res.save_to_img("output")
160
+ res.save_to_json("output")
161
+ ```
162
+
163
+ The default model used in the pipeline is `PP-OCRv5_server_rec`, so you need to specify `en_PP-OCRv5_mobile_rec` via the `text_recognition_model_name` argument. You can also use a local model file via the `text_recognition_model_dir` argument. For details about usage command and descriptions of parameters, please refer to the [Document](https://paddlepaddle.github.io/PaddleOCR/latest/en/version3.x/pipeline_usage/OCR.html#2-quick-start).
164
+
165
+ ## Links
166
+
167
+ [PaddleOCR Repo](https://github.com/paddlepaddle/paddleocr)
168
+
169
+ [PaddleOCR Documentation](https://paddlepaddle.github.io/PaddleOCR/latest/en/index.html)
models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/config.json ADDED
@@ -0,0 +1,533 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "Global": {
3
+ "model_name": "en_PP-OCRv5_mobile_rec"
4
+ },
5
+ "Hpi": {
6
+ "backend_configs": {
7
+ "paddle_infer": {
8
+ "trt_dynamic_shapes": {
9
+ "x": [
10
+ [
11
+ 1,
12
+ 3,
13
+ 48,
14
+ 160
15
+ ],
16
+ [
17
+ 1,
18
+ 3,
19
+ 48,
20
+ 320
21
+ ],
22
+ [
23
+ 8,
24
+ 3,
25
+ 48,
26
+ 3200
27
+ ]
28
+ ]
29
+ }
30
+ },
31
+ "tensorrt": {
32
+ "dynamic_shapes": {
33
+ "x": [
34
+ [
35
+ 1,
36
+ 3,
37
+ 48,
38
+ 160
39
+ ],
40
+ [
41
+ 1,
42
+ 3,
43
+ 48,
44
+ 320
45
+ ],
46
+ [
47
+ 8,
48
+ 3,
49
+ 48,
50
+ 3200
51
+ ]
52
+ ]
53
+ }
54
+ }
55
+ }
56
+ },
57
+ "PreProcess": {
58
+ "transform_ops": [
59
+ {
60
+ "DecodeImage": {
61
+ "channel_first": false,
62
+ "img_mode": "BGR"
63
+ }
64
+ },
65
+ {
66
+ "MultiLabelEncode": {
67
+ "gtc_encode": "NRTRLabelEncode"
68
+ }
69
+ },
70
+ {
71
+ "RecResizeImg": {
72
+ "image_shape": [
73
+ 3,
74
+ 48,
75
+ 320
76
+ ]
77
+ }
78
+ },
79
+ {
80
+ "KeepKeys": {
81
+ "keep_keys": [
82
+ "image",
83
+ "label_ctc",
84
+ "label_gtc",
85
+ "length",
86
+ "valid_ratio"
87
+ ]
88
+ }
89
+ }
90
+ ]
91
+ },
92
+ "PostProcess": {
93
+ "name": "CTCLabelDecode",
94
+ "character_dict": [
95
+ "0",
96
+ "1",
97
+ "2",
98
+ "3",
99
+ "4",
100
+ "5",
101
+ "6",
102
+ "7",
103
+ "8",
104
+ "9",
105
+ "A",
106
+ "B",
107
+ "C",
108
+ "D",
109
+ "E",
110
+ "F",
111
+ "G",
112
+ "H",
113
+ "I",
114
+ "J",
115
+ "K",
116
+ "L",
117
+ "M",
118
+ "N",
119
+ "O",
120
+ "P",
121
+ "Q",
122
+ "R",
123
+ "S",
124
+ "T",
125
+ "U",
126
+ "V",
127
+ "W",
128
+ "X",
129
+ "Y",
130
+ "Z",
131
+ "a",
132
+ "b",
133
+ "c",
134
+ "d",
135
+ "e",
136
+ "f",
137
+ "g",
138
+ "h",
139
+ "i",
140
+ "j",
141
+ "k",
142
+ "l",
143
+ "m",
144
+ "n",
145
+ "o",
146
+ "p",
147
+ "q",
148
+ "r",
149
+ "s",
150
+ "t",
151
+ "u",
152
+ "v",
153
+ "w",
154
+ "x",
155
+ "y",
156
+ "z",
157
+ "!",
158
+ "\"",
159
+ "#",
160
+ "$",
161
+ "%",
162
+ "&",
163
+ "'",
164
+ "(",
165
+ ")",
166
+ "*",
167
+ "+",
168
+ ",",
169
+ "-",
170
+ ".",
171
+ "/",
172
+ ":",
173
+ ";",
174
+ "<",
175
+ "=",
176
+ ">",
177
+ "?",
178
+ "@",
179
+ "[",
180
+ "\\",
181
+ "]",
182
+ "_",
183
+ "`",
184
+ "{",
185
+ "|",
186
+ "}",
187
+ "^",
188
+ "~",
189
+ "©",
190
+ "®",
191
+ "℉",
192
+ "№",
193
+ "Ω",
194
+ "℮",
195
+ "™",
196
+ "∆",
197
+ "✓",
198
+ "✔",
199
+ "✗",
200
+ "✘",
201
+ "✕",
202
+ "☑",
203
+ "☒",
204
+ "●",
205
+ "▪",
206
+ "▫",
207
+ "◼",
208
+ "▶",
209
+ "◀",
210
+ "⬆",
211
+ "¤",
212
+ "¦",
213
+ "§",
214
+ "¨",
215
+ "ª",
216
+ "«",
217
+ "¬",
218
+ "¯",
219
+ "°",
220
+ "²",
221
+ "³",
222
+ "´",
223
+ "µ",
224
+ "¶",
225
+ "¸",
226
+ "¹",
227
+ "º",
228
+ "»",
229
+ "¼",
230
+ "½",
231
+ "¾",
232
+ "¿",
233
+ "×",
234
+ "‐",
235
+ "‑",
236
+ "‒",
237
+ "—",
238
+ "―",
239
+ "‖",
240
+ "‗",
241
+ "‘",
242
+ "’",
243
+ "‚",
244
+ "‛",
245
+ "“",
246
+ "”",
247
+ "„",
248
+ "‟",
249
+ "†",
250
+ "‡",
251
+ "‣",
252
+ "․",
253
+ "…",
254
+ "‧",
255
+ "‰",
256
+ "‴",
257
+ "‵",
258
+ "‶",
259
+ "‷",
260
+ "‸",
261
+ "‹",
262
+ "›",
263
+ "※",
264
+ "‼",
265
+ "‽",
266
+ "‾",
267
+ "−",
268
+ "₤",
269
+ "₡",
270
+ "₹",
271
+ "₽",
272
+ "₴",
273
+ "₿",
274
+ "¢",
275
+ "€",
276
+ "£",
277
+ "¥",
278
+ "Ⅰ",
279
+ "Ⅱ",
280
+ "Ⅲ",
281
+ "Ⅳ",
282
+ "Ⅴ",
283
+ "Ⅵ",
284
+ "Ⅶ",
285
+ "Ⅷ",
286
+ "Ⅸ",
287
+ "Ⅹ",
288
+ "Ⅺ",
289
+ "Ⅻ",
290
+ "ⅰ",
291
+ "ⅱ",
292
+ "ⅲ",
293
+ "ⅳ",
294
+ "ⅴ",
295
+ "ⅵ",
296
+ "ⅶ",
297
+ "ⅷ",
298
+ "ⅸ",
299
+ "ⅹ",
300
+ "ⅺ",
301
+ "ⅻ",
302
+ "➀",
303
+ "➁",
304
+ "➂",
305
+ "➃",
306
+ "➄",
307
+ "➅",
308
+ "➆",
309
+ "➇",
310
+ "➈",
311
+ "➉",
312
+ "➊",
313
+ "➋",
314
+ "➌",
315
+ "➍",
316
+ "➎",
317
+ "➏",
318
+ "➐",
319
+ "➑",
320
+ "➒",
321
+ "➓",
322
+ "❶",
323
+ "❷",
324
+ "❸",
325
+ "❹",
326
+ "❺",
327
+ "❻",
328
+ "❼",
329
+ "❽",
330
+ "❾",
331
+ "❿",
332
+ "①",
333
+ "②",
334
+ "③",
335
+ "④",
336
+ "⑤",
337
+ "⑥",
338
+ "⑦",
339
+ "⑧",
340
+ "⑨",
341
+ "⑩",
342
+ "↑",
343
+ "→",
344
+ "↓",
345
+ "↕",
346
+ "←",
347
+ "↔",
348
+ "⇒",
349
+ "⇐",
350
+ "⇔",
351
+ "∀",
352
+ "∃",
353
+ "∄",
354
+ "∴",
355
+ "∵",
356
+ "∝",
357
+ "∞",
358
+ "∩",
359
+ "∪",
360
+ "∂",
361
+ "∫",
362
+ "∬",
363
+ "∭",
364
+ "∮",
365
+ "∯",
366
+ "∰",
367
+ "∑",
368
+ "∏",
369
+ "√",
370
+ "∛",
371
+ "∜",
372
+ "∱",
373
+ "∲",
374
+ "∳",
375
+ "∶",
376
+ "∷",
377
+ "∼",
378
+ "∖",
379
+ "∗",
380
+ "≈",
381
+ "≠",
382
+ "≡",
383
+ "≤",
384
+ "≥",
385
+ "⊂",
386
+ "⊃",
387
+ "⊥",
388
+ "⊾",
389
+ "⊿",
390
+ "□",
391
+ "∥",
392
+ "∋",
393
+ "ƒ",
394
+ "′",
395
+ "″",
396
+ "À",
397
+ "Á",
398
+ "Â",
399
+ "Ã",
400
+ "Ä",
401
+ "Å",
402
+ "Æ",
403
+ "Ç",
404
+ "È",
405
+ "É",
406
+ "Ê",
407
+ "Ë",
408
+ "Ì",
409
+ "Í",
410
+ "Î",
411
+ "Ï",
412
+ "Ð",
413
+ "Ñ",
414
+ "Ò",
415
+ "Ó",
416
+ "Ô",
417
+ "Õ",
418
+ "Ö",
419
+ "Ø",
420
+ "Ù",
421
+ "Ú",
422
+ "Û",
423
+ "Ü",
424
+ "Ý",
425
+ "Þ",
426
+ "à",
427
+ "á",
428
+ "â",
429
+ "ã",
430
+ "ä",
431
+ "å",
432
+ "æ",
433
+ "ç",
434
+ "è",
435
+ "é",
436
+ "ê",
437
+ "ë",
438
+ "ì",
439
+ "í",
440
+ "î",
441
+ "ï",
442
+ "ð",
443
+ "ñ",
444
+ "ò",
445
+ "ó",
446
+ "ô",
447
+ "õ",
448
+ "ö",
449
+ "ø",
450
+ "ù",
451
+ "ú",
452
+ "û",
453
+ "ü",
454
+ "ý",
455
+ "þ",
456
+ "ÿ",
457
+ "Α",
458
+ "Β",
459
+ "Γ",
460
+ "Δ",
461
+ "Ε",
462
+ "Ζ",
463
+ "Η",
464
+ "Θ",
465
+ "Ι",
466
+ "Κ",
467
+ "Λ",
468
+ "Μ",
469
+ "Ν",
470
+ "Ξ",
471
+ "Ο",
472
+ "Π",
473
+ "Ρ",
474
+ "Σ",
475
+ "Τ",
476
+ "Υ",
477
+ "Φ",
478
+ "Χ",
479
+ "Ψ",
480
+ "Ω",
481
+ "α",
482
+ "β",
483
+ "γ",
484
+ "δ",
485
+ "ε",
486
+ "ζ",
487
+ "η",
488
+ "θ",
489
+ "ι",
490
+ "κ",
491
+ "λ",
492
+ "μ",
493
+ "ν",
494
+ "ξ",
495
+ "ο",
496
+ "π",
497
+ "ρ",
498
+ "σ",
499
+ "ς",
500
+ "τ",
501
+ "υ",
502
+ "φ",
503
+ "χ",
504
+ "ψ",
505
+ "ω",
506
+ "Å",
507
+ "ℏ",
508
+ "⌀",
509
+ "⍺",
510
+ "⍵",
511
+ "𝑢",
512
+ "𝜓",
513
+ "०",
514
+ "‥",
515
+ "︽",
516
+ "﹥",
517
+ "•",
518
+ "÷",
519
+ "∕",
520
+ "∙",
521
+ "⋅",
522
+ "·",
523
+ "±",
524
+ "∓",
525
+ "∟",
526
+ "∠",
527
+ "∡",
528
+ "∢",
529
+ "℧",
530
+ "☺"
531
+ ]
532
+ }
533
+ }
models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/inference.json ADDED
The diff for this file is too large to render. See raw diff
 
models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/inference.pdiparams ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3ec8a97ed6cefe8568d3e2ee90bb193299b566a7661aa4fd52d224b96b59f66b
3
+ size 7772315
models/paddleocr/official_models/en_PP-OCRv5_mobile_rec/inference.yml ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Global:
2
+ model_name: en_PP-OCRv5_mobile_rec
3
+ Hpi:
4
+ backend_configs:
5
+ paddle_infer:
6
+ trt_dynamic_shapes: &id001
7
+ x:
8
+ - - 1
9
+ - 3
10
+ - 48
11
+ - 160
12
+ - - 1
13
+ - 3
14
+ - 48
15
+ - 320
16
+ - - 8
17
+ - 3
18
+ - 48
19
+ - 3200
20
+ tensorrt:
21
+ dynamic_shapes: *id001
22
+ PreProcess:
23
+ transform_ops:
24
+ - DecodeImage:
25
+ channel_first: false
26
+ img_mode: BGR
27
+ - MultiLabelEncode:
28
+ gtc_encode: NRTRLabelEncode
29
+ - RecResizeImg:
30
+ image_shape:
31
+ - 3
32
+ - 48
33
+ - 320
34
+ - KeepKeys:
35
+ keep_keys:
36
+ - image
37
+ - label_ctc
38
+ - label_gtc
39
+ - length
40
+ - valid_ratio
41
+ PostProcess:
42
+ name: CTCLabelDecode
43
+ character_dict:
44
+ - '0'
45
+ - '1'
46
+ - '2'
47
+ - '3'
48
+ - '4'
49
+ - '5'
50
+ - '6'
51
+ - '7'
52
+ - '8'
53
+ - '9'
54
+ - A
55
+ - B
56
+ - C
57
+ - D
58
+ - E
59
+ - F
60
+ - G
61
+ - H
62
+ - I
63
+ - J
64
+ - K
65
+ - L
66
+ - M
67
+ - N
68
+ - O
69
+ - P
70
+ - Q
71
+ - R
72
+ - S
73
+ - T
74
+ - U
75
+ - V
76
+ - W
77
+ - X
78
+ - Y
79
+ - Z
80
+ - a
81
+ - b
82
+ - c
83
+ - d
84
+ - e
85
+ - f
86
+ - g
87
+ - h
88
+ - i
89
+ - j
90
+ - k
91
+ - l
92
+ - m
93
+ - n
94
+ - o
95
+ - p
96
+ - q
97
+ - r
98
+ - s
99
+ - t
100
+ - u
101
+ - v
102
+ - w
103
+ - x
104
+ - y
105
+ - z
106
+ - '!'
107
+ - '"'
108
+ - '#'
109
+ - $
110
+ - '%'
111
+ - '&'
112
+ - ''''
113
+ - (
114
+ - )
115
+ - '*'
116
+ - +
117
+ - ','
118
+ - '-'
119
+ - .
120
+ - /
121
+ - ':'
122
+ - ;
123
+ - <
124
+ - '='
125
+ - '>'
126
+ - '?'
127
+ - '@'
128
+ - '['
129
+ - \
130
+ - ']'
131
+ - _
132
+ - '`'
133
+ - '{'
134
+ - '|'
135
+ - '}'
136
+ - ^
137
+ - '~'
138
+ - ©
139
+ - ®
140
+ - ℉
141
+ - №
142
+ - Ω
143
+ - ℮
144
+ - ™
145
+ - ∆
146
+ - ✓
147
+ - ✔
148
+ - ✗
149
+ - ✘
150
+ - ✕
151
+ - ☑
152
+ - ☒
153
+ - ●
154
+ - ▪
155
+ - ▫
156
+ - ◼
157
+ - ▶
158
+ - ◀
159
+ - ⬆
160
+ - ¤
161
+ - ¦
162
+ - §
163
+ - ¨
164
+ - ª
165
+ - «
166
+ - ¬
167
+ - ¯
168
+ - °
169
+ - ²
170
+ - ³
171
+ - ´
172
+ - µ
173
+ - ¶
174
+ - ¸
175
+ - ¹
176
+ - º
177
+ - »
178
+ - ¼
179
+ - ½
180
+ - ¾
181
+ - ¿
182
+ - ×
183
+ - ‐
184
+ - ‑
185
+ - ‒
186
+ - —
187
+ - ―
188
+ - ‖
189
+ - ‗
190
+ - ‘
191
+ - ’
192
+ - ‚
193
+ - ‛
194
+ - “
195
+ - ”
196
+ - „
197
+ - ‟
198
+ - †
199
+ - ‡
200
+ - ‣
201
+ - ․
202
+ - …
203
+ - ‧
204
+ - ‰
205
+ - ‴
206
+ - ‵
207
+ - ‶
208
+ - ‷
209
+ - ‸
210
+ - ‹
211
+ - ›
212
+ - ※
213
+ - ‼
214
+ - ‽
215
+ - ‾
216
+ - −
217
+ - ₤
218
+ - ₡
219
+ - ₹
220
+ - ₽
221
+ - ₴
222
+ - ₿
223
+ - ¢
224
+ - €
225
+ - £
226
+ - ¥
227
+ - Ⅰ
228
+ - Ⅱ
229
+ - Ⅲ
230
+ - Ⅳ
231
+ - Ⅴ
232
+ - Ⅵ
233
+ - Ⅶ
234
+ - Ⅷ
235
+ - Ⅸ
236
+ - Ⅹ
237
+ - Ⅺ
238
+ - Ⅻ
239
+ - ⅰ
240
+ - ⅱ
241
+ - ⅲ
242
+ - ⅳ
243
+ - ⅴ
244
+ - ⅵ
245
+ - ⅶ
246
+ - ⅷ
247
+ - ⅸ
248
+ - ⅹ
249
+ - ⅺ
250
+ - ⅻ
251
+ - ➀
252
+ - ➁
253
+ - ➂
254
+ - ➃
255
+ - ➄
256
+ - ➅
257
+ - ➆
258
+ - ➇
259
+ - ➈
260
+ - ➉
261
+ - ➊
262
+ - ➋
263
+ - ➌
264
+ - ➍
265
+ - ➎
266
+ - ➏
267
+ - ➐
268
+ - ➑
269
+ - ➒
270
+ - ➓
271
+ - ❶
272
+ - ❷
273
+ - ❸
274
+ - ❹
275
+ - ❺
276
+ - ❻
277
+ - ❼
278
+ - ❽
279
+ - ❾
280
+ - ❿
281
+ - ①
282
+ - ②
283
+ - ③
284
+ - ④
285
+ - ⑤
286
+ - ⑥
287
+ - ⑦
288
+ - ⑧
289
+ - ⑨
290
+ - ⑩
291
+ - ↑
292
+ - →
293
+ - ↓
294
+ - ↕
295
+ - ←
296
+ - ↔
297
+ - ⇒
298
+ - ⇐
299
+ - ⇔
300
+ - ∀
301
+ - ∃
302
+ - ∄
303
+ - ∴
304
+ - ∵
305
+ - ∝
306
+ - ∞
307
+ - ∩
308
+ - ∪
309
+ - ∂
310
+ - ∫
311
+ - ∬
312
+ - ∭
313
+ - ∮
314
+ - ∯
315
+ - ∰
316
+ - ∑
317
+ - ∏
318
+ - √
319
+ - ∛
320
+ - ∜
321
+ - ∱
322
+ - ∲
323
+ - ∳
324
+ - ∶
325
+ - ∷
326
+ - ∼
327
+ - ∖
328
+ - ∗
329
+ - ≈
330
+ - ≠
331
+ - ≡
332
+ - ≤
333
+ - ≥
334
+ - ⊂
335
+ - ⊃
336
+ - ⊥
337
+ - ⊾
338
+ - ⊿
339
+ - □
340
+ - ∥
341
+ - ∋
342
+ - ƒ
343
+ - ′
344
+ - ″
345
+ - À
346
+ - Á
347
+ - Â
348
+ - Ã
349
+ - Ä
350
+ - Å
351
+ - Æ
352
+ - Ç
353
+ - È
354
+ - É
355
+ - Ê
356
+ - Ë
357
+ - Ì
358
+ - Í
359
+ - Î
360
+ - Ï
361
+ - Ð
362
+ - Ñ
363
+ - Ò
364
+ - Ó
365
+ - Ô
366
+ - Õ
367
+ - Ö
368
+ - Ø
369
+ - Ù
370
+ - Ú
371
+ - Û
372
+ - Ü
373
+ - Ý
374
+ - Þ
375
+ - à
376
+ - á
377
+ - â
378
+ - ã
379
+ - ä
380
+ - å
381
+ - æ
382
+ - ç
383
+ - è
384
+ - é
385
+ - ê
386
+ - ë
387
+ - ì
388
+ - í
389
+ - î
390
+ - ï
391
+ - ð
392
+ - ñ
393
+ - ò
394
+ - ó
395
+ - ô
396
+ - õ
397
+ - ö
398
+ - ø
399
+ - ù
400
+ - ú
401
+ - û
402
+ - ü
403
+ - ý
404
+ - þ
405
+ - ÿ
406
+ - Α
407
+ - Β
408
+ - Γ
409
+ - Δ
410
+ - Ε
411
+ - Ζ
412
+ - Η
413
+ - Θ
414
+ - Ι
415
+ - Κ
416
+ - Λ
417
+ - Μ
418
+ - Ν
419
+ - Ξ
420
+ - Ο
421
+ - Π
422
+ - Ρ
423
+ - Σ
424
+ - Τ
425
+ - Υ
426
+ - Φ
427
+ - Χ
428
+ - Ψ
429
+ - Ω
430
+ - α
431
+ - β
432
+ - γ
433
+ - δ
434
+ - ε
435
+ - ζ
436
+ - η
437
+ - θ
438
+ - ι
439
+ - κ
440
+ - λ
441
+ - μ
442
+ - ν
443
+ - ξ
444
+ - ο
445
+ - π
446
+ - ρ
447
+ - σ
448
+ - ς
449
+ - τ
450
+ - υ
451
+ - φ
452
+ - χ
453
+ - ψ
454
+ - ω
455
+ - Å
456
+ - ℏ
457
+ - ⌀
458
+ - ⍺
459
+ - ⍵
460
+ - 𝑢
461
+ - 𝜓
462
+ - ०
463
+ - ‥
464
+ - ︽
465
+ - ﹥
466
+ - •
467
+ - ÷
468
+ - ∕
469
+ - ∙
470
+ - ⋅
471
+ - ·
472
+ - ±
473
+ - ∓
474
+ - ∟
475
+ - ∠
476
+ - ∡
477
+ - ∢
478
+ - ℧
479
+ - ☺
models/stage1_best.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5bdfda1f591c8a33c1b60d0b4d013116b3dde30c2735f1a5ea6420c4d62bada8
3
+ size 22532266
models/yolov8s.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1f47a78bf100391c2a140b7ac73a1caae18c32779be7d310658112f7ac9aa78a
3
+ size 22588772
patch_safetensors.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Re-save the bundled Depth-Anything weights with ``{"format": "pt"}`` metadata.

The tensors are loaded from ``model.safetensors``, written to a temporary
file next to it with the metadata attached, and the temporary file is then
swapped into place.
"""
import safetensors.torch
import os

path = "models/depth_anything_v2/model.safetensors"
temp_path = "models/depth_anything_v2/model_temp.safetensors"
try:
    tensors = safetensors.torch.load_file(path)
    safetensors.torch.save_file(tensors, temp_path, metadata={"format": "pt"})
    # os.replace overwrites the destination in a single step on every
    # platform, so the original file is never deleted before its replacement
    # is in place (a separate remove + rename could lose the model if the
    # rename failed).
    os.replace(temp_path, path)
    print("Successfully patched model.safetensors")
except Exception as e:
    print("Error:", e)
    # The original file is untouched at this point; only discard the
    # partially written temporary copy, if any.
    if os.path.exists(temp_path):
        os.remove(temp_path)
requirements.txt ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # requirements.txt — AID 728 Traffic Rule Violation Detection
2
+ # Install with: pip install -r requirements.txt
3
+
4
+ # ── Core ML / Vision ─────────────────────────────────────────────────────────
5
+ torch==2.12.0
6
+ torchvision==0.27.0
7
+ numpy==1.26.4
8
+ Pillow==12.2.0
9
+
10
+ # ── OpenCV ────────────────────────────────────────────────────────────────────
11
+ opencv-python==4.11.0.86
12
+
13
+ # ── Object Detection ──────────────────────────────────────────────────────────
14
+ ultralytics==8.4.51
15
+ dill==0.4.1
16
+
17
+ # ── Depth Estimation ──────────────────────────────────────────────────────────
18
+ transformers==5.8.1
19
+ huggingface_hub==1.15.0
20
+ safetensors==0.7.0
21
+ tokenizers==0.22.2
22
+
23
+ # ── OCR (PaddleOCR 3.5.0 + PaddlePaddle) ─────────────────────────────────────
24
+ paddlepaddle==3.3.1
25
+ paddleocr==3.5.0
26
+
27
+ # ── PaddleOCR transitive deps ─────────────────────────────────────────────────
28
+ pyclipper==1.4.0
29
+ shapely==2.1.2
30
+ lmdb==2.2.0
31
+ imgaug==0.4.0
32
+ scikit-image==0.25.2
33
+ python-docx==1.2.0
34
+ fire==0.7.1
35
+ beautifulsoup4==4.14.3
36
+ lxml==6.1.0
37
+ RapidFuzz==3.14.5
38
+
39
+ # ── Utilities ─────────────────────────────────────────────────────────────────
40
+ requests==2.34.2
41
+ tqdm==4.67.3
42
+ PyYAML==6.0.2
43
+ regex==2026.5.9
44
+ scipy==1.15.3
45
+ packaging==26.2
46
+ filelock==3.29.0
47
+ gradio
48
+ inference-sdk
run_inference.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from solution import TrafficViolationDetector
2
+ from pathlib import Path
3
+ import json
4
+
5
+
6
+
7
def run(images=None, model_dir="models"):
    """Run the detector over a list of image paths and print the results as JSON.

    Args:
        images: Iterable of image paths to process. Defaults to the three
            sample files under ``testimages/``.
        model_dir: Directory holding the model files; it is resolved to an
            absolute path before being handed to the detector.

    Paths that do not exist are reported as ``"File not found"`` instead of
    being passed to the detector.
    """
    if images is None:
        images = ["testimages/1.jpg", "testimages/2.webp", "testimages/images.jpg"]

    print("Loading models...")
    # Path.resolve() already returns an absolute path.
    detector = TrafficViolationDetector(model_dir=str(Path(model_dir).resolve()))

    results = {}
    print("Running inference...")
    for img in images:
        if not Path(img).exists():
            results[img] = "File not found"
            continue
        print(f"Processing {img}...")
        results[img] = detector.predict(img)

    print("\n--- RESULTS ---")
    print(json.dumps(results, indent=2))
23
+
24
+ if __name__ == "__main__":
25
+ run()
solution.py ADDED
@@ -0,0 +1,405 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ solution.py — AID 728 Traffic Rule Violation Detection
3
+ =======================================================
4
+ Pipeline:
5
+ 1. YOLOv8s (COCO) + custom bike detector → bike boxes + person boxes + car boxes
6
+ 2. Depth-Anything V2 (fp32) → depth map for person→bike association
7
+ 3. Helmet classifier (YOLO) → helmet / no-helmet per rider
8
+ 4. license.pt (YOLO) → license plate bounding box
9
+ 5. PaddleOCR 3.5.0 (mobile det+rec) → plate text via legacy ocr() API
10
+ 6. Roboflow inference_sdk → wrong-way vehicle classification
11
+ 7. Roboflow inference_sdk → seatbelt classification for cars
12
+ """
13
+
14
+ import os
15
+ import re
16
+ from pathlib import Path
17
+
18
# Point paddlex to bundled offline models BEFORE any paddle import: the
# PADDLE_PDX_CACHE_HOME variable is set here, ahead of the `paddleocr` import
# further down, to the `models/paddleocr` directory next to this file.
_MODEL_DIR = Path(__file__).parent / "models"
os.environ["PADDLE_PDX_CACHE_HOME"] = str(_MODEL_DIR / "paddleocr")
21
+
22
+ import cv2
23
+ import numpy as np
24
+ import torch
25
+ from PIL import Image
26
+ from transformers import pipeline as hf_pipeline
27
+ from ultralytics import YOLO
28
+ from paddleocr import PaddleOCR
29
+
30
# Roboflow client used for the wrong-way and seatbelt checks. inference_sdk is
# optional: when it is not installed CLIENT is None and those checks are
# skipped by predict().
#
# NOTE(security): an API key was committed in plain text here. Set the
# ROBOFLOW_API_KEY environment variable to override it; the literal is kept
# only as a backward-compatible fallback and should be rotated and removed.
try:
    from inference_sdk import InferenceHTTPClient
    CLIENT = InferenceHTTPClient(
        api_url="https://serverless.roboflow.com",
        api_key=os.environ.get("ROBOFLOW_API_KEY", "SEsiEStxDAHdOx2SCo3k"),
    )
except ImportError:
    CLIENT = None
38
+
39
+ # ── CONSTANTS ─────────────────────────────────────────────────────────────────
40
# Class ids read from the COCO-trained YOLOv8s detector output.
COCO_PERSON = 0
COCO_CAR = 2
COCO_MOTO = 3
COCO_BUS = 5
COCO_TRUCK = 7
FOUR_WHEELERS = {COCO_CAR, COCO_BUS, COCO_TRUCK}

# Confidence / NMS-IoU settings passed to each YOLO model's predict().
COCO_CONF = 0.30   # yolov8s.pt
COCO_IOU = 0.45
S1_CONF = 0.344    # stage1_best.pt (supplemental bike detector)
S1_IOU = 0.45
S3_CONF = 0.25     # helmet_v11.pt
S3_IOU = 0.60
S4_CONF = 0.20     # license.pt

# Person -> bike association.
PERSON_BIKE_IOU_THRESH = 0.10   # minimum IoU unless the person sits in the bike's column
PERSON_BIKE_COL_MARGIN = 0.35   # column widening, as a fraction of the bike width
DEPTH_THRESHOLD = 0.35          # maximum relative depth difference person vs. bike

# Head crop handed to the helmet model.
HEAD_CROP_FRACTION = 0.45       # fraction of the person box height, from the top
HEAD_CROP_MIN_PX = 40           # lower bound on the crop height in pixels

# Plate reads whose mean OCR score is below this are reported as "UNKNOWN".
OCR_MIN_CONF = 0.25
57
+
58
+
59
class TrafficViolationDetector:
    """
    Detects traffic violations on vehicles in a single image.

    All models are loaded once in __init__; predict() only reads them and
    stores nothing on the instance between calls.
    """

    def __init__(self, model_dir: str = "./models"):
        """Load the depth, detection, super-resolution and OCR models.

        Args:
            model_dir: Directory containing ``depth_anything_v2/``,
                ``yolov8s.pt``, ``stage1_best.pt``, ``helmet_v11.pt``,
                ``license.pt``, ``FSRCNN_x3.pb`` and ``paddleocr/``.
        """
        md = Path(model_dir)

        # Ensure paddlex finds bundled offline models
        os.environ["PADDLE_PDX_CACHE_HOME"] = str(md / "paddleocr")

        # 1. Depth estimation
        self.depth_estimator = hf_pipeline(
            "depth-estimation",
            model=(md / "depth_anything_v2").as_posix(),
            device=0 if torch.cuda.is_available() else -1,
            dtype=torch.float32,
        )

        # 2. YOLO models
        self.s_coco = YOLO(str(md / "yolov8s.pt"))
        self.s1 = YOLO(str(md / "stage1_best.pt"))
        self.s3 = YOLO(str(md / "helmet_v11.pt"))
        self.s4 = YOLO(str(md / "license.pt"))

        # 3. Super-resolution
        self.sr_engine, self.has_sr = self._init_sr(md / "FSRCNN_x3.pb")

        # 4. PaddleOCR
        self.ocr_engine = PaddleOCR(
            lang="en",
            device="cpu",
            enable_mkldnn=False,
            text_detection_model_name="PP-OCRv5_mobile_det",
            text_recognition_model_name="en_PP-OCRv5_mobile_rec",
        )

    # ── helpers ───────────────────────────────────────────────────────────────

    @staticmethod
    def _init_sr(sr_path):
        """Create the OpenCV super-resolution engine and load the FSRCNN x3 model.

        Returns:
            ``(engine, loaded)``. ``(None, False)`` when this cv2 build has no
            ``dnn_superres`` module; ``(engine, False)`` when the model file
            is missing or cannot be loaded; ``(engine, True)`` otherwise.
        """
        try:
            sr = cv2.dnn_superres.DnnSuperResImpl_create()
        except AttributeError:
            return None, False
        if Path(sr_path).exists():
            try:
                sr.readModel(str(sr_path))
                sr.setModel("fsrcnn", 3)
                return sr, True
            except Exception as e:
                # Best effort: callers fall back to plain resizing.
                print("[Warning] Super-resolution model load failed:", e)
        return sr, False

    @staticmethod
    def _box_iou(a, b):
        """Return the intersection-over-union of two ``(x1, y1, x2, y2)`` boxes."""
        ax1, ay1, ax2, ay2 = a
        bx1, by1, bx2, by2 = b
        ix1 = max(ax1, bx1)
        iy1 = max(ay1, by1)
        ix2 = min(ax2, bx2)
        iy2 = min(ay2, by2)
        inter = max(0.0, ix2 - ix1) * max(0.0, iy2 - iy1)
        if inter == 0:
            return 0.0
        area_a = (ax2 - ax1) * (ay2 - ay1)
        area_b = (bx2 - bx1) * (by2 - by1)
        # The small epsilon keeps the division defined.
        return inter / (area_a + area_b - inter + 1e-6)

    @staticmethod
    def _region_depth(depth_map, x1, y1, x2, y2):
        """Return the median of *depth_map* inside the box, clipped to the map.

        Returns 0.5 when the clipped region is empty.
        """
        h, w = depth_map.shape
        x1, y1 = max(0, int(x1)), max(0, int(y1))
        x2, y2 = min(w, int(x2)), min(h, int(y2))
        patch = depth_map[y1:y2, x1:x2]
        return float(np.median(patch)) if patch.size > 0 else 0.5

    def _is_depth_ok(self, pd, bd):
        """Return True when person depth *pd* is close enough to bike depth *bd*.

        For very small bike depths (< 0.05) an absolute difference is compared
        against half of DEPTH_THRESHOLD; otherwise the difference relative to
        *bd* is compared against DEPTH_THRESHOLD.
        """
        if bd < 0.05:
            return abs(pd - bd) <= DEPTH_THRESHOLD * 0.5
        return abs(pd - bd) / (bd + 1e-6) <= DEPTH_THRESHOLD

    def _merge_bike_boxes(self, coco, custom, iou_thresh=0.45):
        """Combine bike boxes from both detectors into one float32 ``(N, 4)`` array.

        All *coco* boxes are kept; a *custom* box is added only when its IoU
        with every box kept so far is at most *iou_thresh*.
        """
        if not coco and not custom:
            return np.zeros((0, 4), dtype=np.float32)
        if not coco:
            return np.array(custom, dtype=np.float32)
        if not custom:
            return np.array(coco, dtype=np.float32)
        merged = list(coco)
        for cb in custom:
            if not any(self._box_iou(cb, mb) > iou_thresh for mb in merged):
                merged.append(cb)
        return np.array(merged, dtype=np.float32)

    def _associate_persons_to_bikes(self, person_boxes, bike_boxes, depth_map, h, w):
        """Assign each person box to at most one bike.

        A bike is a candidate when the person overlaps it (IoU at least
        PERSON_BIKE_IOU_THRESH) or stands in its widened column, and the two
        regions have compatible depth. Among candidates the highest score
        (IoU plus a horizontal-closeness term) wins.

        Args:
            person_boxes: Person boxes as ``(x1, y1, x2, y2)``.
            bike_boxes: Bike boxes as ``(x1, y1, x2, y2)``.
            depth_map: 2-D depth array aligned with the image.
            h: Image height (currently unused).
            w: Image width, used to normalise horizontal distance.

        Returns:
            A list with one entry per bike, each a list of person boxes.
        """
        bike_persons = [[] for _ in range(len(bike_boxes))]
        # Region medians depend only on the box, so compute each at most once
        # instead of once per (person, bike) pair.
        bike_depths = {}
        for p_box in person_boxes:
            px1, py1, px2, py2 = p_box
            p_cx = (px1 + px2) / 2
            p_bottom = py2
            person_depth = None
            best_bike, best_score = -1, -1.0
            for b_idx, b_box in enumerate(bike_boxes):
                bx1, by1, bx2, by2 = b_box
                bw = bx2 - bx1
                iou = self._box_iou(p_box, b_box)
                in_col = (
                    bx1 - PERSON_BIKE_COL_MARGIN * bw <= p_cx <= bx2 + PERSON_BIKE_COL_MARGIN * bw
                    and p_bottom <= by2 + 0.3 * (by2 - by1)
                )
                if iou < PERSON_BIKE_IOU_THRESH and not in_col:
                    continue
                if person_depth is None:
                    person_depth = self._region_depth(depth_map, px1, py1, px2, py2)
                if b_idx not in bike_depths:
                    bike_depths[b_idx] = self._region_depth(depth_map, bx1, by1, bx2, by2)
                if not self._is_depth_ok(person_depth, bike_depths[b_idx]):
                    continue
                score = iou + 0.5 * (1.0 - abs(p_cx - (bx1 + bx2) / 2) / (w + 1e-6))
                if score > best_score:
                    best_score, best_bike = score, b_idx
            if best_bike >= 0:
                bike_persons[best_bike].append(p_box)
        return bike_persons

    def _get_depth_map(self, image_cv):
        """Return a depth map for a BGR image, min-max scaled to [0, 1].

        The map is resized to the image's height and width when the estimator
        returns a different shape.
        """
        img_rgb = cv2.cvtColor(image_cv, cv2.COLOR_BGR2RGB)
        result = self.depth_estimator(Image.fromarray(img_rgb))
        depth = np.array(result["depth"]).astype(np.float32)
        lo, hi = depth.min(), depth.max()
        depth = (depth - lo) / (hi - lo + 1e-8)
        if depth.shape != image_cv.shape[:2]:
            depth = cv2.resize(depth, (image_cv.shape[1], image_cv.shape[0]))
        return depth

    def _classify_helmets(self, full_image, person_boxes):
        """Run the helmet model on the head region of every person box.

        The head region is the top HEAD_CROP_FRACTION of the box (at least
        HEAD_CROP_MIN_PX tall) with a small horizontal padding. A person
        counts as wearing a helmet only when the most confident detection has
        class 0; empty crops and crops without detections count as no helmet.

        Returns:
            ``(total, with_helmet, without_helmet)``.
        """
        if not person_boxes:
            return 0, 0, 0
        h_img, w_img = full_image.shape[:2]
        with_h = without_h = 0
        for p_box in person_boxes:
            px1, py1, px2, py2 = map(int, p_box)
            head_h = max(int((py2 - py1) * HEAD_CROP_FRACTION), HEAD_CROP_MIN_PX)
            pad_x = max(4, int((px2 - px1) * 0.05))
            crop = full_image[max(0, py1):min(h_img, py1 + head_h),
                              max(0, px1 - pad_x):min(w_img, px2 + pad_x)]
            if crop.size == 0:
                without_h += 1
                continue
            res = self.s3.predict(crop, conf=S3_CONF, iou=S3_IOU, verbose=False)[0]
            if len(res.boxes) == 0:
                without_h += 1
            elif int(res.boxes[res.boxes.conf.argmax()].cls) == 0:
                with_h += 1
            else:
                without_h += 1
        return with_h + without_h, with_h, without_h

    def _preprocess_plate(self, plate_img):
        """Upscale, contrast-enhance and sharpen a plate crop before OCR.

        Uses the super-resolution engine when it is loaded (falling back to a
        3x bicubic resize if upsampling fails); otherwise crops shorter than
        100 px are resized to 100 px height. CLAHE is then applied to the L
        channel in LAB space, followed by a 3x3 sharpening kernel.
        """
        h, w = plate_img.shape[:2]
        if self.has_sr and self.sr_engine is not None:
            try:
                plate_img = self.sr_engine.upsample(plate_img)
            except Exception:
                plate_img = cv2.resize(plate_img, (0, 0), fx=3, fy=3,
                                       interpolation=cv2.INTER_CUBIC)
        else:
            if h < 100:
                scale = 100 / h
                plate_img = cv2.resize(plate_img,
                                       (int(w * scale), int(h * scale)),
                                       interpolation=cv2.INTER_CUBIC)
        lab = cv2.cvtColor(plate_img, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)
        l = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(4, 4)).apply(l)
        plate_img = cv2.cvtColor(cv2.merge([l, a, b]), cv2.COLOR_LAB2BGR)
        return cv2.filter2D(plate_img, -1, np.array([[0, -1, 0], [-1, 5, -1], [0, -1, 0]]))

    def _run_ocr(self, plate_img):
        """Run OCR on a plate crop.

        Two result layouts are accepted: a list of dicts carrying
        ``rec_texts`` / ``rec_scores``, and a list of pages whose lines are
        ``(box, (text, score))`` pairs.

        Returns:
            ``(text, mean_score)`` with the recognised fragments joined by
            spaces, or ``("UNKNOWN", 0.0)`` when nothing was recognised.
        """
        processed = self._preprocess_plate(plate_img)
        texts, scores = [], []
        try:
            result = self.ocr_engine.ocr(processed)
            if result and isinstance(result, list):
                for page in result:
                    if isinstance(page, dict):
                        page_texts = page.get("rec_texts", [])
                        page_scores = page.get("rec_scores", [])
                        for t, s in zip(page_texts, page_scores):
                            if str(t).strip():
                                texts.append(str(t).strip())
                                scores.append(float(s))
                    elif isinstance(page, list):
                        for line in page:
                            if isinstance(line, (list, tuple)) and len(line) == 2:
                                try:
                                    txt = str(line[1][0])
                                    score = float(line[1][1])
                                    if txt.strip():
                                        texts.append(txt.strip())
                                        scores.append(score)
                                except (TypeError, ValueError, IndexError):
                                    pass
        except Exception as e:
            # OCR stays best effort, but the failure is reported instead of
            # being dropped silently; anything collected so far is kept.
            print("[Warning] OCR error:", e)
        if not texts:
            return "UNKNOWN", 0.0
        return " ".join(texts), (sum(scores) / len(scores) if scores else 0.0)

    def _extract_plate(self, vehicle_crop, plate_box):
        """OCR the plate at *plate_box* inside *vehicle_crop* and normalise the text.

        The box is padded by 4 px and clipped to the crop. The text is
        upper-cased, stripped of everything except ``A-Z``, ``0-9``, space and
        ``-``, and single-character tokens are dropped.

        Returns:
            The cleaned plate text, or ``"UNKNOWN"`` when the crop is empty,
            the OCR score is below OCR_MIN_CONF, or no token survives.
        """
        h, w = vehicle_crop.shape[:2]
        pad = 4
        x1 = max(0, int(plate_box[0]) - pad)
        y1 = max(0, int(plate_box[1]) - pad)
        x2 = min(w, int(plate_box[2]) + pad)
        y2 = min(h, int(plate_box[3]) + pad)
        crop = vehicle_crop[y1:y2, x1:x2]
        if crop.size == 0:
            return "UNKNOWN"
        raw, conf = self._run_ocr(crop)
        if conf < OCR_MIN_CONF:
            return "UNKNOWN"
        text = re.sub(r"[^A-Z0-9 \-]", "", raw.upper())
        text = re.sub(r"\s+", " ", text).strip()
        tokens = [t for t in text.split() if len(t) > 1]
        return " ".join(tokens) if tokens else "UNKNOWN"

    def _get_plate(self, img, h_img, w_img, vehicle_box):
        """Locate and read the license plate of the vehicle at *vehicle_box*.

        The vehicle box is expanded (20% up, 10% down, 15% left and right),
        clipped to the image, and passed to the plate detector; the most
        confident plate box is then read with OCR.

        Returns:
            The plate text, or ``"UNKNOWN"`` when no plate is found or any
            step fails.
        """
        x1, y1, x2, y2 = map(int, vehicle_box)
        bw, bh = x2 - x1, y2 - y1
        vcrop = img[
            max(0, int(y1 - 0.20 * bh)): min(h_img, int(y2 + 0.10 * bh)),
            max(0, int(x1 - 0.15 * bw)): min(w_img, int(x2 + 0.15 * bw))
        ]
        plate_text = "UNKNOWN"
        try:
            if vcrop.size > 0:
                p_res = self.s4.predict(vcrop, conf=S4_CONF, verbose=False)[0]
                if len(p_res.boxes) > 0:
                    best_pb = p_res.boxes.xyxy.cpu().numpy()[p_res.boxes.conf.argmax()]
                    plate_text = self._extract_plate(vcrop, best_pb)
        except Exception as e:
            # A failed plate read must not discard the violation itself.
            print("[Warning] Plate detection error:", e)
        return plate_text

    def _wrong_way_boxes(self, img):
        """Return ``[x1, y1, x2, y2]`` boxes of vehicles the Roboflow model flags as wrong-way.

        Returns an empty list when the client is unavailable; on an API error
        a warning is printed and the boxes collected so far are returned.
        """
        ww_boxes = []
        if CLIENT is None:
            return ww_boxes
        try:
            result = CLIENT.infer(img, model_id="wrong-way-driving-detection-gqdmg/1")
            for pred in result.get('predictions', []):
                if "wrong" in pred.get('class', '').lower():
                    # Predictions are read as centre x/y plus width/height.
                    px, py, pw, ph = pred['x'], pred['y'], pred['width'], pred['height']
                    wx1, wy1 = px - pw/2, py - ph/2
                    wx2, wy2 = px + pw/2, py + ph/2
                    ww_boxes.append([wx1, wy1, wx2, wy2])
        except Exception as e:
            print("[Warning] Wrong-way API error:", e)
        return ww_boxes

    def _is_wrong_way(self, v_box, ww_boxes):
        """Return True when *v_box* overlaps any wrong-way box with IoU above 0.4."""
        return any(self._box_iou(v_box, wb) > 0.4 for wb in ww_boxes)

    def _count_seatbelt_violations(self, img, car_box):
        """Count "no seatbelt" predictions for the crop of *img* at *car_box*.

        Returns 0 when the client is unavailable or the crop is empty; on an
        API error a warning is printed and the count so far is returned.
        """
        if CLIENT is None:
            return 0
        h_img, w_img = img.shape[:2]
        x1, y1, x2, y2 = map(int, car_box)
        ccrop = img[max(0, y1):min(h_img, y2), max(0, x1):min(w_img, x2)]
        if ccrop.size == 0:
            return 0
        sb_viols = 0
        try:
            res = CLIENT.infer(ccrop, model_id="seat-belt-detection-udcfg/5")
            for pred in res.get('predictions', []):
                cls_name = pred.get('class', '').lower()
                if "no" in cls_name and "seatbelt" in cls_name:
                    sb_viols += 1
        except Exception as e:
            print("[Warning] Seatbelt API error:", e)
        return sb_viols

    # ── predict ───────────────────────────────────────────────────────────────

    def predict(self, image_path: str) -> dict:
        """Detect violations in the image at *image_path*.

        Returns:
            ``{"violations": [...]}``. Two-wheeler entries carry
            ``vehicle_type``, ``num_riders``, ``helmet_violations``,
            ``wrong_way`` and ``license_plate``; four-wheeler entries carry
            ``vehicle_type``, ``seatbelt_violations``, ``wrong_way`` and
            ``license_plate``. The list is empty when the image cannot be
            read or any stage raises.
        """
        try:
            img = cv2.imread(str(image_path))
            if img is None:
                return {"violations": []}
            h_img, w_img = img.shape[:2]

            # Stage 1: COCO primary detection
            coco_res = self.s_coco.predict(img, conf=COCO_CONF, iou=COCO_IOU,
                                           verbose=False)[0]
            coco_boxes = coco_res.boxes.xyxy.cpu().numpy()
            coco_cls = coco_res.boxes.cls.cpu().numpy().astype(int)
            person_boxes = coco_boxes[coco_cls == COCO_PERSON].tolist()
            coco_motos = coco_boxes[coco_cls == COCO_MOTO].tolist()
            coco_cars = coco_boxes[np.isin(coco_cls, list(FOUR_WHEELERS))].tolist()

            # Stage 2: Supplemental bike detector
            s1_res = self.s1.predict(img, conf=S1_CONF, iou=S1_IOU,
                                     augment=True, verbose=False)[0]
            custom_bikes = s1_res.boxes.xyxy.cpu().numpy().tolist()
            bike_boxes = self._merge_bike_boxes(coco_motos, custom_bikes)

            # Stage 3: Depth map for spatial person→bike association
            depth_map = self._get_depth_map(img)

            # Stage 4: Associate persons to bikes
            bike_persons = self._associate_persons_to_bikes(
                person_boxes, bike_boxes, depth_map, h_img, w_img)

            # Detect Wrong Way using Roboflow API
            ww_boxes = self._wrong_way_boxes(img)

            violations = []

            # Process Two-wheelers
            for i, bike_box in enumerate(bike_boxes):
                num_riders, with_h, without_h = self._classify_helmets(
                    img, bike_persons[i])

                # NOTE(review): a bike with no associated person is treated as
                # one rider without a helmet, so every such bike is reported —
                # confirm this is intended for parked bikes.
                if num_riders == 0:
                    num_riders, with_h, without_h = 1, 0, 1

                ww = self._is_wrong_way(bike_box, ww_boxes)

                # Check for violation first, then do plate OCR if violation exists
                if (num_riders >= 3) or (without_h > 0) or ww:
                    plate_text = self._get_plate(img, h_img, w_img, bike_box)
                    violations.append({
                        "vehicle_type": "two_wheeler",
                        "num_riders": num_riders,
                        "helmet_violations": without_h,
                        "wrong_way": ww,
                        "license_plate": plate_text,
                    })

            # Process Four-wheelers (Cars/Trucks/Buses)
            for car_box in coco_cars:
                ww = self._is_wrong_way(car_box, ww_boxes)
                sb_viols = self._count_seatbelt_violations(img, car_box)

                # Check for violation first, then do plate OCR if violation exists
                if sb_viols > 0 or ww:
                    plate_text = self._get_plate(img, h_img, w_img, car_box)
                    violations.append({
                        "vehicle_type": "four_wheeler",
                        "seatbelt_violations": sb_viols,
                        "wrong_way": ww,
                        "license_plate": plate_text
                    })

            return {"violations": violations}

        except Exception as e:
            print(f"[ERROR] predict() failed for {image_path}: {e}")
            return {"violations": []}
testimages/1.jpg ADDED
testimages/2.webp ADDED