Safetensors
d_fine

train-DLNv1_DLNv2_WS2013_NoTables

#1
by nlivathinos - opened
Files changed (4)
  1. README.md +3 -109
  2. config.json +0 -224
  3. model.safetensors +0 -3
  4. preprocessor_config.json +0 -26
README.md CHANGED
@@ -1,109 +1,3 @@
- ---
- license: apache-2.0
- ---
-
- # Document Layout Analysis "egret-large"
-
- 🚀 **`egret-large`** is a document layout analysis model used in the [Docling project](https://github.com/docling-project/docling).
-
- 📄 For an in-depth description of the model architecture, training datasets, and evaluation methodology, please refer to our technical report: **"Advanced Layout Analysis Models for Docling"**, Nikolaos Livathinos *et al.*, [🔗 https://arxiv.org/abs/2509.11720](https://arxiv.org/abs/2509.11720)
-
- ## Inference code example
-
- Prerequisites:
-
- ```bash
- pip install transformers Pillow torch requests
- ```
-
- Prediction:
-
- ```python
- import requests
- from transformers import (
-     DFineForObjectDetection,
-     RTDetrImageProcessor,
- )
- import torch
- from PIL import Image
-
- classes_map = {
-     0: "Caption",
-     1: "Footnote",
-     2: "Formula",
-     3: "List-item",
-     4: "Page-footer",
-     5: "Page-header",
-     6: "Picture",
-     7: "Section-header",
-     8: "Table",
-     9: "Text",
-     10: "Title",
-     11: "Document Index",
-     12: "Code",
-     13: "Checkbox-Selected",
-     14: "Checkbox-Unselected",
-     15: "Form",
-     16: "Key-Value Region",
- }
- image_url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"
- model_name = "ds4sd/docling-layout-egret-large"
- threshold = 0.6
-
- # Download the image
- image = Image.open(requests.get(image_url, stream=True).raw)
- image = image.convert("RGB")
-
- # Initialize the model
- image_processor = RTDetrImageProcessor.from_pretrained(model_name)
- model = DFineForObjectDetection.from_pretrained(model_name)
-
- # Run the prediction pipeline
- inputs = image_processor(images=[image], return_tensors="pt")
- with torch.no_grad():
-     outputs = model(**inputs)
- results = image_processor.post_process_object_detection(
-     outputs,
-     target_sizes=torch.tensor([image.size[::-1]]),
-     threshold=threshold,
- )
-
- # Print the results
- for result in results:
-     for score, label_id, box in zip(
-         result["scores"], result["labels"], result["boxes"]
-     ):
-         score = round(score.item(), 2)
-         label = classes_map[label_id.item()]
-         box = [round(i, 2) for i in box.tolist()]
-         print(f"{label}: {score} {box}")
- ```
-
- ## References
-
- ```
- @misc{livathinos2025advancedlayoutanalysismodels,
-   title = {Advanced Layout Analysis Models for Docling},
-   author = {Nikolaos Livathinos and Christoph Auer and Ahmed Nassar and Rafael Teixeira de Lima and Maksym Lysak and Brown Ebouky and Cesar Berrospi and Michele Dolfi and Panagiotis Vagenas and Matteo Omenetti and Kasper Dinkla and Yusik Kim and Valery Weber and Lucas Morin and Ingmar Meijer and Viktor Kuropiatnyk and Tim Strohmeyer and A. Said Gurbuz and Peter W. J. Staar},
-   year = {2025},
-   eprint = {2509.11720},
-   archivePrefix = {arXiv},
-   primaryClass = {cs.CV},
-   url = {https://arxiv.org/abs/2509.11720}
- }
-
- @techreport{Docling,
-   author = {Deep Search Team},
-   month = {8},
-   title = {Docling Technical Report},
-   url = {https://arxiv.org/abs/2408.09869v4},
-   eprint = {2408.09869},
-   doi = {10.48550/arXiv.2408.09869},
-   version = {1.0.0},
-   year = {2024}
- }
- ```
+ ---
+ license: apache-2.0
+ ---
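
The removed card's example prints raw detections to stdout. For a quick visual sanity check, the same pipeline can also render the predicted layout onto the page with Pillow's `ImageDraw`. A minimal sketch, building on the deleted README above — the output filename, colors, and label placement are arbitrary choices, and `model.config.id2label` is assumed to carry the same mapping as the card's `classes_map` (the deleted config.json below suggests it does):

```python
import requests
import torch
from PIL import Image, ImageDraw
from transformers import DFineForObjectDetection, RTDetrImageProcessor

model_name = "ds4sd/docling-layout-egret-large"
image_url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"

image = Image.open(requests.get(image_url, stream=True).raw).convert("RGB")

image_processor = RTDetrImageProcessor.from_pretrained(model_name)
model = DFineForObjectDetection.from_pretrained(model_name)

inputs = image_processor(images=[image], return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
results = image_processor.post_process_object_detection(
    outputs, target_sizes=torch.tensor([image.size[::-1]]), threshold=0.6
)

# Draw each detected layout element; id2label mirrors classes_map.
draw = ImageDraw.Draw(image)
for result in results:
    for score, label_id, box in zip(
        result["scores"], result["labels"], result["boxes"]
    ):
        x0, y0, x1, y1 = box.tolist()
        label = model.config.id2label[label_id.item()]
        draw.rectangle([x0, y0, x1, y1], outline="red", width=2)
        draw.text((x0, max(0, y0 - 12)), f"{label} {score.item():.2f}", fill="red")
image.save("layout_annotated.png")  # hypothetical output path
```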
config.json DELETED
@@ -1,224 +0,0 @@
- {
-   "activation_dropout": 0.0,
-   "activation_function": "silu",
-   "anchor_image_size": null,
-   "architectures": [
-     "DFineForObjectDetection"
-   ],
-   "attention_dropout": 0.0,
-   "auxiliary_loss": true,
-   "backbone": null,
-   "backbone_config": {
-     "depths": [
-       3,
-       4,
-       6,
-       3
-     ],
-     "downsample_in_bottleneck": false,
-     "downsample_in_first_stage": false,
-     "embedding_size": 32,
-     "hidden_act": "relu",
-     "hidden_sizes": [
-       256,
-       512,
-       1024,
-       2048
-     ],
-     "initializer_range": 0.02,
-     "layer_type": "basic",
-     "model_type": "hgnet_v2",
-     "num_channels": 3,
-     "out_features": [
-       "stage2",
-       "stage3",
-       "stage4"
-     ],
-     "out_indices": [
-       2,
-       3,
-       4
-     ],
-     "stage_downsample": [
-       false,
-       true,
-       true,
-       true
-     ],
-     "stage_in_channels": [
-       48,
-       128,
-       512,
-       1024
-     ],
-     "stage_kernel_size": [
-       3,
-       3,
-       5,
-       5
-     ],
-     "stage_light_block": [
-       false,
-       false,
-       true,
-       true
-     ],
-     "stage_mid_channels": [
-       48,
-       96,
-       192,
-       384
-     ],
-     "stage_names": [
-       "stem",
-       "stage1",
-       "stage2",
-       "stage3",
-       "stage4"
-     ],
-     "stage_num_blocks": [
-       1,
-       1,
-       3,
-       1
-     ],
-     "stage_numb_of_layers": [
-       6,
-       6,
-       6,
-       6
-     ],
-     "stage_out_channels": [
-       128,
-       512,
-       1024,
-       2048
-     ],
-     "stem_channels": [
-       3,
-       32,
-       48
-     ],
-     "use_learnable_affine_block": false
-   },
-   "backbone_kwargs": null,
-   "batch_norm_eps": 1e-05,
-   "box_noise_scale": 1.0,
-   "d_model": 256,
-   "decoder_activation_function": "relu",
-   "decoder_attention_heads": 8,
-   "decoder_ffn_dim": 1024,
-   "decoder_in_channels": [
-     256,
-     256,
-     256
-   ],
-   "decoder_layers": 6,
-   "decoder_method": "default",
-   "decoder_n_points": [
-     3,
-     6,
-     3
-   ],
-   "decoder_offset_scale": 0.5,
-   "depth_mult": 1.0,
-   "dropout": 0.0,
-   "encode_proj_layers": [
-     2
-   ],
-   "encoder_activation_function": "gelu",
-   "encoder_attention_heads": 8,
-   "encoder_ffn_dim": 1024,
-   "encoder_hidden_dim": 256,
-   "encoder_in_channels": [
-     512,
-     1024,
-     2048
-   ],
-   "encoder_layers": 1,
-   "eos_coefficient": 0.0001,
-   "eval_idx": -1,
-   "eval_size": null,
-   "feat_strides": [
-     8,
-     16,
-     32
-   ],
-   "focal_loss_alpha": 0.75,
-   "focal_loss_gamma": 2.0,
-   "freeze_backbone_batch_norms": true,
-   "hidden_expansion": 1.0,
-   "id2label": {
-     "0": "Caption",
-     "1": "Footnote",
-     "2": "Formula",
-     "3": "List-item",
-     "4": "Page-footer",
-     "5": "Page-header",
-     "6": "Picture",
-     "7": "Section-header",
-     "8": "Table",
-     "9": "Text",
-     "10": "Title",
-     "11": "Document Index",
-     "12": "Code",
-     "13": "Checkbox-Selected",
-     "14": "Checkbox-Unselected",
-     "15": "Form",
-     "16": "Key-Value Region"
-   },
-   "initializer_bias_prior_prob": null,
-   "initializer_range": 0.01,
-   "is_encoder_decoder": true,
-   "label2id": {
-     "Caption": 0,
-     "Checkbox-Selected": 13,
-     "Checkbox-Unselected": 14,
-     "Code": 12,
-     "Document Index": 11,
-     "Footnote": 1,
-     "Form": 15,
-     "Formula": 2,
-     "Key-Value Region": 16,
-     "List-item": 3,
-     "Page-footer": 4,
-     "Page-header": 5,
-     "Picture": 6,
-     "Section-header": 7,
-     "Table": 8,
-     "Text": 9,
-     "Title": 10
-   },
-   "label_noise_ratio": 0.5,
-   "layer_norm_eps": 1e-05,
-   "layer_scale": 1,
-   "learn_initial_query": false,
-   "lqe_hidden_dim": 64,
-   "lqe_layers": 2,
-   "matcher_alpha": 0.25,
-   "matcher_bbox_cost": 5.0,
-   "matcher_class_cost": 2.0,
-   "matcher_gamma": 2.0,
-   "matcher_giou_cost": 2.0,
-   "max_num_bins": 32,
-   "model_type": "d_fine",
-   "normalize_before": false,
-   "num_denoising": 100,
-   "num_feature_levels": 3,
-   "num_queries": 300,
-   "positional_encoding_temperature": 10000,
-   "reg_scale": 4.0,
-   "top_prob_values": 4,
-   "torch_dtype": "float32",
-   "transformers_version": "4.53.0.dev0",
-   "up": 0.5,
-   "use_focal_loss": true,
-   "use_pretrained_backbone": false,
-   "use_timm_backbone": false,
-   "weight_loss_bbox": 5.0,
-   "weight_loss_ddf": 1.5,
-   "weight_loss_fgl": 0.15,
-   "weight_loss_giou": 2.0,
-   "weight_loss_vfl": 1.0,
-   "with_box_refine": true
- }
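
The deleted config pins the full D-FINE setup: an `hgnet_v2` backbone, a one-layer hybrid encoder, six decoder layers with 300 queries, and the 17-class `id2label`/`label2id` maps. A minimal sketch for inspecting those fields, assuming a local copy of the deleted file (e.g., from a checkout of the parent revision):

```python
import json

# Load the deleted config.json from a local checkout (assumed path).
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["model_type"])                      # "d_fine"
print(cfg["backbone_config"]["model_type"])   # "hgnet_v2"
print(cfg["num_queries"], cfg["decoder_layers"])  # 300 queries, 6 decoder layers

# Sanity check: id2label and label2id must be inverses of each other.
assert all(
    cfg["label2id"][name] == int(idx) for idx, name in cfg["id2label"].items()
)
```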
model.safetensors DELETED
@@ -1,3 +0,0 @@
- version https://git-lfs.github.com/spec/v1
- oid sha256:f79def9d4a0d4e6e62cab25ec7846d1579ef1ef657c39554363813f7d1a14f1b
- size 125100636
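
The Git LFS pointer above identifies the weights by content hash rather than by value. A minimal sketch for checking a locally downloaded `model.safetensors` against the recorded `oid` and `size`:

```python
import hashlib
import os

# Expected values taken from the LFS pointer in this diff.
EXPECTED_SHA256 = "f79def9d4a0d4e6e62cab25ec7846d1579ef1ef657c39554363813f7d1a14f1b"
EXPECTED_SIZE = 125100636

path = "model.safetensors"  # assumed local download location
assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"

# Hash the file in 1 MiB chunks to avoid loading 125 MB at once.
h = hashlib.sha256()
with open(path, "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == EXPECTED_SHA256, "content hash mismatch"
```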
preprocessor_config.json DELETED
@@ -1,26 +0,0 @@
- {
-   "do_convert_annotations": true,
-   "do_normalize": false,
-   "do_pad": false,
-   "do_rescale": true,
-   "do_resize": true,
-   "format": "coco_detection",
-   "image_mean": [
-     0.485,
-     0.456,
-     0.406
-   ],
-   "image_processor_type": "RTDetrImageProcessor",
-   "image_std": [
-     0.229,
-     0.224,
-     0.225
-   ],
-   "pad_size": null,
-   "resample": 2,
-   "rescale_factor": 0.00392156862745098,
-   "size": {
-     "height": 640,
-     "width": 640
-   }
- }
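
The deleted preprocessor config fully determines the input tensor: resize to 640×640 with bilinear resampling (`resample: 2`), rescale by 1/255 (`rescale_factor: 0.00392156862745098`), and skip normalization and padding (`do_normalize` and `do_pad` are false, so `image_mean`/`image_std` go unused). A minimal sketch reproducing that tensor by hand, reusing the example image URL from the removed README:

```python
import numpy as np
import requests
import torch
from PIL import Image

url = "https://huggingface.co/spaces/ds4sd/SmolDocling-256M-Demo/resolve/main/example_images/annual_rep_14.png"
image = Image.open(requests.get(url, stream=True).raw).convert("RGB")

# resample=2 is PIL's bilinear filter; target size is 640x640.
resized = image.resize((640, 640), resample=Image.BILINEAR)

# do_rescale: multiply by 1/255; do_normalize is false, so no mean/std step.
array = np.asarray(resized).astype(np.float32) * (1.0 / 255.0)

# HWC -> CHW, plus a batch dimension: (1, 3, 640, 640).
pixel_values = torch.from_numpy(array).permute(2, 0, 1).unsqueeze(0)
print(pixel_values.shape, pixel_values.min().item(), pixel_values.max().item())
```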