Safetensors
d_fine

train-DLNv1_DLNv2_WS2013_NoTables

#1
by nlivathinos - opened
Files changed (4)
  1. README.md +3 -109
  2. config.json +0 -224
  3. model.safetensors +0 -3
  4. preprocessor_config.json +0 -26
README.md CHANGED
@@ -1,109 +1,3 @@
- ---
- license: apache-2.0
- ---
-
- # Document Layout Analysis "egret-large"
-
- 🚀 **`egret-large`** is a document layout analysis model used in the [Docling project](https://github.com/docling-project/docling).
-
- 📄 For an in-depth description of the model architecture, training datasets, and evaluation methodology, please refer to our technical report: **"Advanced Layout Analysis Models for Docling"**, Nikolaos Livathinos *et al.*, [🔗 https://arxiv.org/abs/2509.11720](https://arxiv.org/abs/2509.11720)
-
- ## Inference code example
-
- Prerequisites:
-
- ```bash
- pip install transformers Pillow torch requests
- ```
-
- Prediction:
-
- ```python
- import requests
- from transformers import (
-     DFineForObjectDetection,
-     RTDetrImageProcessor,
- )
- import torch
- from PIL import Image
-
- classes_map = {
-     0: "Caption",
-     1: "Footnote",
-     2: "Formula",
-     3: "List-item",
-     4: "Page-footer",
-     5: "Page-header",
-     6: "Picture",
-     7: "Section-header",
-     8: "Table",
-     9: "Text",
-     10: "Title",
-     11: "Document Index",
-     12: "Code",
-     13: "Checkbox-Selected",
-     14: "Checkbox-Unselected",
-     15: "Form",
-     16: "Key-Value Region",
- }
- image_url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"
- model_name = "ds4sd/docling-layout-egret-large"
- threshold = 0.6
-
- # Download the image
- image = Image.open(requests.get(image_url, stream=True).raw)
- image = image.convert("RGB")
-
- # Initialize the model
- image_processor = RTDetrImageProcessor.from_pretrained(model_name)
- model = DFineForObjectDetection.from_pretrained(model_name)
-
- # Run the prediction pipeline
- inputs = image_processor(images=[image], return_tensors="pt")
- with torch.no_grad():
-     outputs = model(**inputs)
- results = image_processor.post_process_object_detection(
-     outputs,
-     target_sizes=torch.tensor([image.size[::-1]]),
-     threshold=threshold,
- )
-
- # Print the results
- for result in results:
-     for score, label_id, box in zip(
-         result["scores"], result["labels"], result["boxes"]
-     ):
-         score = round(score.item(), 2)
-         label = classes_map[label_id.item()]
-         box = [round(i, 2) for i in box.tolist()]
-         print(f"{label}: {score} {box}")
- ```
-
- ## References
-
- ```
- @misc{livathinos2025advancedlayoutanalysismodels,
-   title = {Advanced Layout Analysis Models for Docling},
-   author = {Nikolaos Livathinos and Christoph Auer and Ahmed Nassar and Rafael Teixeira de Lima and Maksym Lysak and Brown Ebouky and Cesar Berrospi and Michele Dolfi and Panagiotis Vagenas and Matteo Omenetti and Kasper Dinkla and Yusik Kim and Valery Weber and Lucas Morin and Ingmar Meijer and Viktor Kuropiatnyk and Tim Strohmeyer and A. Said Gurbuz and Peter W. J. Staar},
-   year = {2025},
-   eprint = {2509.11720},
-   archivePrefix = {arXiv},
-   primaryClass = {cs.CV},
-   url = {https://arxiv.org/abs/2509.11720}
- }
-
- @techreport{Docling,
-   author = {Deep Search Team},
-   month = {8},
-   title = {Docling Technical Report},
-   url = {https://arxiv.org/abs/2408.09869v4},
-   eprint = {2408.09869},
-   doi = {10.48550/arXiv.2408.09869},
-   version = {1.0.0},
-   year = {2024}
- }
- ```
+ ---
+ license: apache-2.0
+ ---
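
The removed card's example prints raw detections to stdout. For a quick visual sanity check, the same pipeline can also render the predicted layout onto the page with Pillow's `ImageDraw`. A minimal sketch, building on the deleted README above — the output filename, colors, and label placement are arbitrary choices, and `model.config.id2label` is assumed to carry the same mapping as the card's `classes_map` (the deleted config.json below suggests it does):

```python
import requests
import torch
from PIL import Image, ImageDraw
from transformers import DFineForObjectDetection, RTDetrImageProcessor

model_name = "ds4sd/docling-layout-egret-large"
image_url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"

image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")

image_processor = RTDetrImageProcessor.from_pretrained(model_name)
model = DFineForObjectDetection.from_pretrained(model_name)

inputs = image_processor(images=[image], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
results = image_processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.6
)

# Draw each detected layout element; id2label mirrors classes_map.
draw = ImageDraw.Draw(image)
for result in results:
    for score, label_id, box in zip(
        result["scores"], result["labels"], result["boxes"]
    ):
        x0, y0, x1, y1 = box.tolist()
        label = model.config.id2label[label_id.item()]
        draw.rectangle([x0, y0, x1, y1], outline="red", width=2)
        draw.text((x0, max(0, y0 - 12)), f"{label} {score.item():.2f}", fill="red")
image.save("layout_annotated.png")  # hypothetical output path
```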
config.json DELETED
@@ -1,224 +0,0 @@
- {
-   "activation_dropout": 0.0,
-   "activation_function": "silu",
-   "anchor_image_size": null,
-   "architectures": [
-     "DFineForObjectDetection"
-   ],
-   "attention_dropout": 0.0,
-   "auxiliary_loss": true,
-   "backbone": null,
-   "backbone_config": {
-     "depths": [
-       3,
-       4,
-       6,
-       3
-     ],
-     "downsample_in_bottleneck": false,
-     "downsample_in_first_stage": false,
-     "embedding_size": 32,
-     "hidden_act": "relu",
-     "hidden_sizes": [
-       256,
-       512,
-       1024,
-       2048
-     ],
-     "initializer_range": 0.02,
-     "layer_type": "basic",
-     "model_type": "hgnet_v2",
-     "num_channels": 3,
-     "out_features": [
-       "stage2",
-       "stage3",
-       "stage4"
-     ],
-     "out_indices": [
-       2,
-       3,
-       4
-     ],
-     "stage_downsample": [
-       false,
-       true,
-       true,
-       true
-     ],
-     "stage_in_channels": [
-       48,
-       128,
-       512,
-       1024
-     ],
-     "stage_kernel_size": [
-       3,
-       3,
-       5,
-       5
-     ],
-     "stage_light_block": [
-       false,
-       false,
-       true,
-       true
-     ],
-     "stage_mid_channels": [
-       48,
-       96,
-       192,
-       384
-     ],
-     "stage_names": [
-       "stem",
-       "stage1",
-       "stage2",
-       "stage3",
-       "stage4"
-     ],
-     "stage_num_blocks": [
-       1,
-       1,
-       3,
-       1
-     ],
-     "stage_numb_of_layers": [
-       6,
-       6,
-       6,
-       6
-     ],
-     "stage_out_channels": [
-       128,
-       512,
-       1024,
-       2048
-     ],
-     "stem_channels": [
-       3,
-       32,
-       48
-     ],
-     "use_learnable_affine_block": false
-   },
-   "backbone_kwargs": null,
-   "batch_norm_eps": 1e-05,
-   "box_noise_scale": 1.0,
-   "d_model": 256,
-   "decoder_activation_function": "relu",
-   "decoder_attention_heads": 8,
-   "decoder_ffn_dim": 1024,
-   "decoder_in_channels": [
-     256,
-     256,
-     256
-   ],
-   "decoder_layers": 6,
-   "decoder_method": "default",
-   "decoder_n_points": [
-     3,
-     6,
-     3
-   ],
-   "decoder_offset_scale": 0.5,
-   "depth_mult": 1.0,
-   "dropout": 0.0,
-   "encode_proj_layers": [
-     2
-   ],
-   "encoder_activation_function": "gelu",
-   "encoder_attention_heads": 8,
-   "encoder_ffn_dim": 1024,
-   "encoder_hidden_dim": 256,
-   "encoder_in_channels": [
-     512,
-     1024,
-     2048
-   ],
-   "encoder_layers": 1,
-   "eos_coefficient": 0.0001,
-   "eval_idx": -1,
-   "eval_size": null,
-   "feat_strides": [
-     8,
-     16,
-     32
-   ],
-   "focal_loss_alpha": 0.75,
-   "focal_loss_gamma": 2.0,
-   "freeze_backbone_batch_norms": true,
-   "hidden_expansion": 1.0,
-   "id2label": {
-     "0": "Caption",
-     "1": "Footnote",
-     "2": "Formula",
-     "3": "List-item",
-     "4": "Page-footer",
-     "5": "Page-header",
-     "6": "Picture",
-     "7": "Section-header",
-     "8": "Table",
-     "9": "Text",
-     "10": "Title",
-     "11": "Document Index",
-     "12": "Code",
-     "13": "Checkbox-Selected",
-     "14": "Checkbox-Unselected",
-     "15": "Form",
-     "16": "Key-Value Region"
-   },
-   "initializer_bias_prior_prob": null,
-   "initializer_range": 0.01,
-   "is_encoder_decoder": true,
-   "label2id": {
-     "Caption": 0,
-     "Checkbox-Selected": 13,
-     "Checkbox-Unselected": 14,
-     "Code": 12,
-     "Document Index": 11,
-     "Footnote": 1,
-     "Form": 15,
-     "Formula": 2,
-     "Key-Value Region": 16,
-     "List-item": 3,
-     "Page-footer": 4,
-     "Page-header": 5,
-     "Picture": 6,
-     "Section-header": 7,
-     "Table": 8,
-     "Text": 9,
-     "Title": 10
-   },
-   "label_noise_ratio": 0.5,
-   "layer_norm_eps": 1e-05,
-   "layer_scale": 1,
-   "learn_initial_query": false,
-   "lqe_hidden_dim": 64,
-   "lqe_layers": 2,
-   "matcher_alpha": 0.25,
-   "matcher_bbox_cost": 5.0,
-   "matcher_class_cost": 2.0,
-   "matcher_gamma": 2.0,
-   "matcher_giou_cost": 2.0,
-   "max_num_bins": 32,
-   "model_type": "d_fine",
-   "normalize_before": false,
-   "num_denoising": 100,
-   "num_feature_levels": 3,
-   "num_queries": 300,
-   "positional_encoding_temperature": 10000,
-   "reg_scale": 4.0,
-   "top_prob_values": 4,
-   "torch_dtype": "float32",
-   "transformers_version": "4.53.0.dev0",
-   "up": 0.5,
-   "use_focal_loss": true,
-   "use_pretrained_backbone": false,
-   "use_timm_backbone": false,
-   "weight_loss_bbox": 5.0,
-   "weight_loss_ddf": 1.5,
-   "weight_loss_fgl": 0.15,
-   "weight_loss_giou": 2.0,
-   "weight_loss_vfl": 1.0,
-   "with_box_refine": true
- }
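
The deleted config pins the full D-FINE setup: an `hgnet_v2` backbone, a one-layer hybrid encoder, six decoder layers with 300 queries, and the 17-class `id2label`/`label2id` maps. A minimal sketch for inspecting those fields, assuming a local copy of the deleted file (e.g., from a checkout of the parent revision):

```python
import json

# Load the deleted config.json from a local checkout (assumed path).
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])                      # "d_fine"
print(cfg["backbone_config"]["model_type"])   # "hgnet_v2"
print(cfg["num_queries"], cfg["decoder_layers"])  # 300 queries, 6 decoder layers

# Sanity check: id2label and label2id must be inverses of each other.
assert all(
    cfg["label2id"][name] == int(idx) for idx, name in cfg["id2label"].items()
)
```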
model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f79def9d4a0d4e6e62cab25ec7846d1579ef1ef657c39554363813f7d1a14f1b
- size 125100636
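
The Git LFS pointer above identifies the weights by content hash rather than by value. A minimal sketch for checking a locally downloaded `model.safetensors` against the recorded `oid` and `size`:

```python
import hashlib
import os

# Expected values taken from the LFS pointer in this diff.
EXPECTED_SHA256 = "f79def9d4a0d4e6e62cab25ec7846d1579ef1ef657c39554363813f7d1a14f1b"
EXPECTED_SIZE = 125100636

path = "model.safetensors"  # assumed local download location
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"

# Hash the file in 1 MiB chunks to avoid loading 125 MB at once.
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == EXPECTED_SHA256, "content hash mismatch"
```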
preprocessor_config.json DELETED
@@ -1,26 +0,0 @@
- {
-   "do_convert_annotations": true,
-   "do_normalize": false,
-   "do_pad": false,
-   "do_rescale": true,
-   "do_resize": true,
-   "format": "coco_detection",
-   "image_mean": [
-     0.485,
-     0.456,
-     0.406
-   ],
-   "image_processor_type": "RTDetrImageProcessor",
-   "image_std": [
-     0.229,
-     0.224,
-     0.225
-   ],
-   "pad_size": null,
-   "resample": 2,
-   "rescale_factor": 0.00392156862745098,
-   "size": {
-     "height": 640,
-     "width": 640
-   }
- }
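
The deleted preprocessor config fully determines the input tensor: resize to 640×640 with bilinear resampling (`resample: 2`), rescale by 1/255 (`rescale_factor: 0.00392156862745098`), and skip normalization and padding (`do_normalize` and `do_pad` are false, so `image_mean`/`image_std` go unused). A minimal sketch reproducing that tensor by hand, reusing the example image URL from the removed README:

```python
import numpy as np
import requests
import torch
from PIL import Image

url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# resample=2 is PIL's bilinear filter; target size is 640x640.
resized = image.resize((640, 640), resample=Image.BILINEAR)

# do_rescale: multiply by 1/255; do_normalize is false, so no mean/std step.
array = np.asarray(resized).astype(np.float32) * (1.0 / 255.0)

# HWC -> CHW, plus a batch dimension: (1, 3, 640, 640).
pixel_values = torch.from_numpy(array).permute(2, 0, 1).unsqueeze(0)
print(pixel_values.shape, pixel_values.min().item(), pixel_values.max().item())
```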