init
Browse files- README.md +5 -4
- demo.py +1 -1
- modeling_eagle_chat.py +3 -0
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
license:
|
| 3 |
pipeline_tag: image-text-to-text
|
| 4 |
library_name: transformers
|
| 5 |
base_model:
|
|
@@ -16,7 +16,8 @@ tags:
|
|
| 16 |
|
| 17 |
# Eagle-2
|
| 18 |
|
| 19 |
-
|
|
|
|
| 20 |
[\[🗨️ Chat Demo\]](http://eagle-vlm.xyz/) [\[🤗 HF Demo\]](TODO)
|
| 21 |
## Introduction
|
| 22 |
|
|
@@ -57,7 +58,7 @@ We provide the following models:
|
|
| 57 |
| AI2D<sub>test</sub> | 57.1 | 64.1 | 69.3 | 74.7 |70.9|
|
| 58 |
| MMMU<sub>val</sub> | 31.4 | 36.7 | 40.9 |41.1|38.8|
|
| 59 |
| MMVet<sub>GPT-4-Turbo</sub> | 32.2 | 32.7 | 48.8 | 49.5|40.9|
| HallBench<sub>avg</sub> | 27.9 | 34.0 | 39.0 |**41.7**|35.3|
|
| 60 |
-
| MathVista<sub>testmini</sub> |
|
| 61 |
| MMstar | 37.7 | 45.7 | 50.1|48.0|48.5|
|
| 62 |
|
| 63 |
|
|
@@ -66,7 +67,7 @@ We provide the following models:
|
|
| 66 |
|
| 67 |
|
| 68 |
|
| 69 |
-
We provide a [
|
| 70 |
- pure text input
|
| 71 |
- single image input
|
| 72 |
- multiple image input
|
|
|
|
| 1 |
---
|
| 2 |
+
license: cc-by-nc-4.0
|
| 3 |
pipeline_tag: image-text-to-text
|
| 4 |
library_name: transformers
|
| 5 |
base_model:
|
|
|
|
| 16 |
|
| 17 |
# Eagle-2
|
| 18 |
|
| 19 |
+
|
| 20 |
+
[\[📂 GitHub\]](https://github.com/NVlabs/EAGLE) [\[📜 Eagle2 Tech Report\]](TODO)
|
| 21 |
[\[🗨️ Chat Demo\]](http://eagle-vlm.xyz/) [\[🤗 HF Demo\]](TODO)
|
| 22 |
## Introduction
|
| 23 |
|
|
|
|
| 58 |
| AI2D<sub>test</sub> | 57.1 | 64.1 | 69.3 | 74.7 |70.9|
|
| 59 |
| MMMU<sub>val</sub> | 31.4 | 36.7 | 40.9 |41.1|38.8|
|
| 60 |
| MMVet<sub>GPT-4-Turbo</sub> | 32.2 | 32.7 | 48.8 | 49.5|40.9|
| HallBench<sub>avg</sub> | 27.9 | 34.0 | 39.0 |**41.7**|35.3|
|
| 61 |
+
| MathVista<sub>testmini</sub> | 33.8 | 37.7 | 43.2 |43.0|45.3|
|
| 62 |
| MMstar | 37.7 | 45.7 | 50.1|48.0|48.5|
|
| 63 |
|
| 64 |
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
|
| 70 |
+
We provide an [inference script](./demo.py) to help you quickly start using the model. We support different input types:
|
| 71 |
- pure text input
|
| 72 |
- single image input
|
| 73 |
- multiple image input
|
demo.py
CHANGED
|
@@ -390,7 +390,7 @@ class ModelWorker:
|
|
| 390 |
|
| 391 |
if __name__ == '__main__':
|
| 392 |
parser = argparse.ArgumentParser()
|
| 393 |
-
parser.add_argument('--model-path', type=str, default='/
|
| 394 |
parser.add_argument('--model-name', type=str, default='Eagle2-1B')
|
| 395 |
parser.add_argument('--device', type=str, default='cuda')
|
| 396 |
parser.add_argument('--load-8bit', action='store_true')
|
|
|
|
| 390 |
|
| 391 |
if __name__ == '__main__':
|
| 392 |
parser = argparse.ArgumentParser()
|
| 393 |
+
parser.add_argument('--model-path', type=str, default='nvidia/Eagle2-1B')
|
| 394 |
parser.add_argument('--model-name', type=str, default='Eagle2-1B')
|
| 395 |
parser.add_argument('--device', type=str, default='cuda')
|
| 396 |
parser.add_argument('--load-8bit', action='store_true')
|
modeling_eagle_chat.py
CHANGED
|
@@ -25,6 +25,9 @@ from .flash_attention import *
|
|
| 25 |
from .multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModel
|
| 26 |
from .multi_backbone_channel_concatenation_encoder import MultiBackboneChannelConcatenationVisionTower
|
| 27 |
from .configuration_multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModelConfig
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
logger = logging.get_logger(__name__)
|
| 30 |
|
|
|
|
| 25 |
from .multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModel
|
| 26 |
from .multi_backbone_channel_concatenation_encoder import MultiBackboneChannelConcatenationVisionTower
|
| 27 |
from .configuration_multi_backbone_channel_concatentation_model import MultiBackboneChannelConcatenationVisionModelConfig
|
| 28 |
+
from .siglip_vision_tower import SiglipVisionTower
|
| 29 |
+
from .convnext_encoder import ConvNextVisionTower
|
| 30 |
+
from .convnext import ConvNeXt
|
| 31 |
|
| 32 |
logger = logging.get_logger(__name__)
|
| 33 |
|