Spaces:
Running
Running
update
Browse files
app/src/content/article.mdx
CHANGED
|
@@ -4,11 +4,12 @@ subtitle: "A new open dataset for data-centric training of Vision Language Model
|
|
| 4 |
description: "A new open dataset for data-centric training of Vision Language Models"
|
| 5 |
authors:
|
| 6 |
- "Luis Wiedmann"
|
| 7 |
-
- "Andi Marafioti"
|
| 8 |
- "Orr Zohar"
|
|
|
|
|
|
|
| 9 |
- "Thibaud Frere"
|
| 10 |
affiliation: "Hugging Face"
|
| 11 |
-
published: "Sep
|
| 12 |
tags:
|
| 13 |
- research
|
| 14 |
- vision-language models
|
|
@@ -52,12 +53,12 @@ We manually collect **over 180** image-text datasets from the recent literature
|
|
| 52 |
|
| 53 |
<Wide>
|
| 54 |
<Accordion title="FineVision Subsets">
|
| 55 |
-
|Subset Name |Total Images|Total Samples|Total Turns|Total Question Tokens|Total Answer Tokens|Category |
|
| 56 |
|--------------------------------------|------------|-------------|-----------|---------------------|-------------------|----------------------|-------------------------------------------------------------------|
|
| 57 |
|coco_colors |118,287 |118,287 |118,287 |1,301,157 |6,376,672 |Captioning & Knowledge|[@noauthor_hazal-karakusmscoco-controlnet-canny-less-colors_nodate]|
|
| 58 |
|densefusion_1m |1,058,751 |1,058,751 |1,058,751 |10,692,478 |263,718,217 |Captioning & Knowledge|[@li_densefusion-1m_2024] |
|
| 59 |
|face_emotion |797 |797 |797 |8,767 |8,066 |Captioning & Knowledge|[@mollahosseini_affectnet_2017] |
|
| 60 |
-
|google_landmarks |299,993 |299,993 |842,127 |6,194,978 |10,202,980 |Captioning & Knowledge|
|
| 61 |
|image_textualization(filtered) |99,573 |99,573 |99,573 |917,577 |19,374,090 |Captioning & Knowledge|[@pi_image_2024] |
|
| 62 |
|laion_gpt4v |9,301 |9,301 |9,301 |93,950 |1,875,283 |Captioning & Knowledge|[@noauthor_laiongpt4v-dataset_2023] |
|
| 63 |
|localized_narratives |199,998 |199,998 |199,998 |2,167,179 |8,021,473 |Captioning & Knowledge|[@vedaldi_connecting_2020] |
|
|
|
|
| 4 |
description: "A new open dataset for data-centric training of Vision Language Models"
|
| 5 |
authors:
|
| 6 |
- "Luis Wiedmann"
|
|
|
|
| 7 |
- "Orr Zohar"
|
| 8 |
+
- "Andi Marafioti"
|
| 9 |
+
- "Amir Mahla"
|
| 10 |
- "Thibaud Frere"
|
| 11 |
affiliation: "Hugging Face"
|
| 12 |
+
published: "Sep 4, 2025"
|
| 13 |
tags:
|
| 14 |
- research
|
| 15 |
- vision-language models
|
|
|
|
| 53 |
|
| 54 |
<Wide>
|
| 55 |
<Accordion title="FineVision Subsets">
|
| 56 |
+
|Subset Name |Total Images|Total Samples|Total Turns|Total Question Tokens|Total Answer Tokens|Category |Source |
|
| 57 |
|--------------------------------------|------------|-------------|-----------|---------------------|-------------------|----------------------|-------------------------------------------------------------------|
|
| 58 |
|coco_colors |118,287 |118,287 |118,287 |1,301,157 |6,376,672 |Captioning & Knowledge|[@noauthor_hazal-karakusmscoco-controlnet-canny-less-colors_nodate]|
|
| 59 |
|densefusion_1m |1,058,751 |1,058,751 |1,058,751 |10,692,478 |263,718,217 |Captioning & Knowledge|[@li_densefusion-1m_2024] |
|
| 60 |
|face_emotion |797 |797 |797 |8,767 |8,066 |Captioning & Knowledge|[@mollahosseini_affectnet_2017] |
|
| 61 |
+
|google_landmarks |299,993 |299,993 |842,127 |6,194,978 |10,202,980 |Captioning & Knowledge|Ours |
|
| 62 |
|image_textualization(filtered) |99,573 |99,573 |99,573 |917,577 |19,374,090 |Captioning & Knowledge|[@pi_image_2024] |
|
| 63 |
|laion_gpt4v |9,301 |9,301 |9,301 |93,950 |1,875,283 |Captioning & Knowledge|[@noauthor_laiongpt4v-dataset_2023] |
|
| 64 |
|localized_narratives |199,998 |199,998 |199,998 |2,167,179 |8,021,473 |Captioning & Knowledge|[@vedaldi_connecting_2020] |
|