Nightwalkx committed on
Commit
66ad6c1
·
1 Parent(s): a2f0f33

update model

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .dockerignore +21 -0
  2. .editorconfig +18 -0
  3. .gitignore +35 -0
  4. LICENSE +201 -0
  5. app.py +597 -60
  6. cog.yaml +37 -0
  7. llava/__init__.py +1 -0
  8. llava/constants.py +13 -0
  9. llava/conversation.py +396 -0
  10. llava/eval/eval_gpt_review.py +113 -0
  11. llava/eval/eval_gpt_review_bench.py +121 -0
  12. llava/eval/eval_gpt_review_visual.py +118 -0
  13. llava/eval/eval_pope.py +81 -0
  14. llava/eval/eval_science_qa.py +114 -0
  15. llava/eval/eval_science_qa_gpt4.py +104 -0
  16. llava/eval/eval_science_qa_gpt4_requery.py +149 -0
  17. llava/eval/eval_textvqa.py +65 -0
  18. llava/eval/generate_webpage_data_from_table.py +111 -0
  19. llava/eval/m4c_evaluator.py +334 -0
  20. llava/eval/model_qa.py +64 -0
  21. llava/eval/model_vqa.py +101 -0
  22. llava/eval/model_vqa_loader.py +144 -0
  23. llava/eval/model_vqa_mmbench.py +160 -0
  24. llava/eval/model_vqa_science.py +111 -0
  25. llava/eval/qa_baseline_gpt35.py +74 -0
  26. llava/eval/run_llava.py +145 -0
  27. llava/eval/summarize_gpt_review.py +60 -0
  28. llava/eval/webpage/figures/alpaca.png +0 -0
  29. llava/eval/webpage/figures/bard.jpg +0 -0
  30. llava/eval/webpage/figures/chatgpt.svg +1 -0
  31. llava/eval/webpage/figures/llama.jpg +0 -0
  32. llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg +1 -0
  33. llava/eval/webpage/figures/vicuna.jpeg +0 -0
  34. llava/eval/webpage/index.html +162 -0
  35. llava/eval/webpage/script.js +245 -0
  36. llava/eval/webpage/styles.css +105 -0
  37. llava/mm_utils.py +247 -0
  38. llava/model/__init__.py +6 -0
  39. llava/model/apply_delta.py +48 -0
  40. llava/model/builder.py +167 -0
  41. llava/model/consolidate.py +29 -0
  42. llava/model/language_model/llava_llama.py +158 -0
  43. llava/model/language_model/llava_mistral.py +158 -0
  44. llava/model/language_model/llava_mpt.py +97 -0
  45. llava/model/llava_arch.py +368 -0
  46. llava/model/make_delta.py +52 -0
  47. llava/model/multimodal_encoder/builder.py +15 -0
  48. llava/model/multimodal_encoder/clip_encoder.py +147 -0
  49. llava/model/multimodal_projector/builder.py +51 -0
  50. llava/model/utils.py +20 -0
.dockerignore ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # The .dockerignore file excludes files from the container build process.
2
+ #
3
+ # https://docs.docker.com/engine/reference/builder/#dockerignore-file
4
+
5
+ # Exclude Git files
6
+ .git
7
+ .github
8
+ .gitignore
9
+
10
+ # Exclude Python cache files
11
+ __pycache__
12
+ .mypy_cache
13
+ .pytest_cache
14
+ .ruff_cache
15
+
16
+ # Exclude Python virtual environment
17
+ /venv
18
+
19
+ # Exclude some weights
20
+ /openai
21
+ /liuhaotian
.editorconfig ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ root = true
2
+
3
+ # Unix-style newlines with a newline ending every file
4
+ [*]
5
+ end_of_line = lf
6
+ insert_final_newline = true
7
+ trim_trailing_whitespace = true
8
+ charset = utf-8
9
+
10
+ # 4 space indentation
11
+ [*.{py,json}]
12
+ indent_style = space
13
+ indent_size = 4
14
+
15
+ # 2 space indentation
16
+ [*.{md,sh,yaml,yml}]
17
+ indent_style = space
18
+ indent_size = 2
.gitignore ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__
3
+ *.pyc
4
+ *.egg-info
5
+ dist
6
+
7
+ # Log
8
+ *.log
9
+ *.log.*
10
+ *.json
11
+ *.jsonl
12
+
13
+ # Data
14
+ !**/alpaca-data-conversation.json
15
+
16
+ # Editor
17
+ .idea
18
+ *.swp
19
+
20
+ # Other
21
+ .DS_Store
22
+ wandb
23
+ output
24
+
25
+ checkpoints
26
+ ckpts*
27
+
28
+ .ipynb_checkpoints
29
+ *.ipynb
30
+
31
+ # DevContainer
32
+ !.devcontainer/*
33
+
34
+ # Demo
35
+ serve_images/
LICENSE ADDED
@@ -0,0 +1,201 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
app.py CHANGED
@@ -1,83 +1,620 @@
1
- import gradio as gr
2
- from transformers import LlavaProcessor, LlavaForConditionalGeneration, TextIteratorStreamer
3
- from threading import Thread
 
 
 
 
4
  import time
5
- from PIL import Image
6
- import torch
7
- import spaces
8
 
9
- # model_id = "llava-hf/llava-interleave-qwen-0.5b-hf"
10
- model_id = "rogerxi/llava-finetune-test"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
- processor = LlavaProcessor.from_pretrained(model_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- model = LlavaForConditionalGeneration.from_pretrained(model_id, torch_dtype=torch.float16)
15
- model.to("cuda")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
- @spaces.GPU
19
- def bot_streaming(message, history):
20
 
21
- # print(message)
22
- txt = message['text']
23
 
24
- ext_buffer = f"USER: \n{txt}\nASSISTANT:"
 
 
 
 
 
 
 
 
 
 
 
25
 
26
- if message['files']:
27
- if len(message['files']) == 1:
28
- image = [message['files'][0]]
29
- elif len(message['files']) > 1:
30
- image = [msg for msg in message['files']]
31
- else:
32
- # if there's no image uploaded for this turn, look for images in the past turns
33
- # kept inside tuples, take the last one
34
- for hist in history:
35
- if type(hist[0])==tuple:
36
- image = hist[0][0]
37
 
38
- if message['files'] is None:
39
- gr.Error("You need to upload an image or video for LLaVA to work.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
40
 
41
- image_extensions = Image.registered_extensions()
42
- image_extensions = tuple([ex for ex, f in image_extensions.items()])
43
- if len(image) == 1:
44
- image = Image.open(image[0]).convert("RGB")
45
- prompt = f"USER: <image>\n{message['text']}\nASSISTANT:"
46
 
47
- elif len(image) > 1:
48
- image_list = []
49
- user_prompt = message['text']
 
 
 
 
 
 
 
 
 
50
 
51
- for img in image:
52
- img = Image.open(img).convert("RGB")
53
- image_list.append(img)
54
 
55
- toks = "<image>" * len(image_list)
56
- prompt = "USER: "+ toks + f"\n{user_prompt}\nASSISTANT:"
57
 
58
- image = image_list
59
 
 
 
 
 
 
60
 
61
- inputs = processor(image, prompt, return_tensors="pt").to("cuda", torch.float16)
62
- streamer = TextIteratorStreamer(processor, **{"max_new_tokens": 200, "skip_special_tokens": True})
63
- generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=100)
64
- generated_text = ""
65
 
66
- thread = Thread(target=model.generate, kwargs=generation_kwargs)
67
- thread.start()
 
68
 
 
 
69
 
 
 
70
 
71
- buffer = ""
72
- for new_text in streamer:
73
- buffer += new_text
74
- generated_text_without_prompt = buffer[len(ext_buffer) + 2:]
75
- time.sleep(0.01)
76
- yield generated_text_without_prompt
77
 
 
 
 
 
 
 
 
 
 
78
 
79
- demo = gr.ChatInterface(fn=bot_streaming, title="LLaVA ",
80
- textbox=gr.MultimodalTextbox(file_count="multiple"),
81
- description="Try EgoLlava. If you don't upload an image, you will receive an error. ",
82
- stop_btn="Stop Generation", multimodal=True)
83
- demo.launch(debug=True)
 
1
+ import argparse
2
+ import datetime
3
+ import hashlib
4
+ import json
5
+ import os
6
+ import subprocess
7
+ import sys
8
  import time
 
 
 
9
 
10
+ import gradio as gr
11
+ import requests
12
+
13
+ from llava.constants import LOGDIR
14
+ from llava.conversation import SeparatorStyle, conv_templates, default_conversation
15
+ from llava.utils import (
16
+ build_logger,
17
+ moderation_msg,
18
+ server_error_msg,
19
+ violates_moderation,
20
+ )
21
+
22
+ logger = build_logger("gradio_web_server", "gradio_web_server.log")
23
+
24
+ headers = {"User-Agent": "LLaVA Client"}
25
+
26
+ no_change_btn = gr.Button.update()
27
+ enable_btn = gr.Button.update(interactive=True)
28
+ disable_btn = gr.Button.update(interactive=False)
29
+
30
+ priority = {
31
+ "vicuna-13b": "aaaaaaa",
32
+ "koala-13b": "aaaaaab",
33
+ }
34
+
35
+
36
def get_conv_log_filename():
    """Return the path of today's conversation log file under LOGDIR.

    The file name is the current local date (year, zero-padded month and
    day) followed by "-conv.json".
    """
    today = datetime.datetime.now()
    date_stamp = f"{today.year}-{today.month:02d}-{today.day:02d}"
    return os.path.join(LOGDIR, f"{date_stamp}-conv.json")
40
+
41
+
42
def get_model_list():
    """Refresh the controller's workers and return its model names, sorted.

    Posts to "/refresh_all_workers" (asserting a 200 response) and then to
    "/list_models" on ``args.controller_url``. Names are ordered by their
    entry in ``priority`` when present, otherwise by the name itself.
    """
    base_url = args.controller_url

    refresh_resp = requests.post(base_url + "/refresh_all_workers")
    assert refresh_resp.status_code == 200

    listing_resp = requests.post(base_url + "/list_models")
    models = listing_resp.json()["models"]

    def sort_key(name):
        # Models listed in `priority` sort by their alias, others by name.
        return priority.get(name, name)

    models.sort(key=sort_key)
    logger.info(f"Models: {models}")
    return models
50
+
51
+
52
+ get_window_url_params = """
53
+ function() {
54
+ const params = new URLSearchParams(window.location.search);
55
+ url_params = Object.fromEntries(params);
56
+ console.log(url_params);
57
+ return url_params;
58
+ }
59
+ """
60
+
61
+
62
def load_demo(url_params, request: gr.Request):
    """Build the initial conversation state and model-dropdown update.

    If the page URL carries a "model" parameter naming a model present in
    the module-level ``models`` collection, that model is preselected in
    the dropdown; otherwise the dropdown is only made visible.

    Returns a ``(state, dropdown_update)`` pair.
    """
    logger.info(f"load_demo. ip: {request.client.host}. params: {url_params}")

    selector_kwargs = {"visible": True}
    if "model" in url_params and url_params["model"] in models:
        selector_kwargs["value"] = url_params["model"]
    dropdown_update = gr.Dropdown.update(**selector_kwargs)

    return default_conversation.copy(), dropdown_update
73
+
74
+
75
def load_demo_refresh_model_list(request: gr.Request):
    """Re-query the model list and build the UI updates that depend on it.

    Returns ``(state, dropdown_update, send_button_update)``. While no
    models are available the dropdown shows a "Downloading the models..."
    placeholder and both the dropdown and the send button are
    non-interactive.
    """
    logger.info(f"load_demo. ip: {request.client.host}")
    models = get_model_list()
    state = default_conversation.copy()

    has_models = bool(models)
    if has_models:
        choices, selected = models, models[0]
    else:
        choices, selected = [], "Downloading the models..."

    models_dropdown_update = gr.Dropdown.update(
        choices=choices,
        value=selected,
        interactive=has_models,
    )
    send_button_update = gr.Button.update(interactive=has_models)

    return state, models_dropdown_update, send_button_update
99
+
100
+
101
def vote_last_response(state, vote_type, model_selector, request: gr.Request):
    """Append one JSON line describing a vote to today's conversation log.

    The record holds the current timestamp, the vote type, the selected
    model, the serialized conversation state and the client's host.
    """
    with open(get_conv_log_filename(), "a") as fout:
        record = dict(
            tstamp=round(time.time(), 4),
            type=vote_type,
            model=model_selector,
            state=state.dict(),
            ip=request.client.host,
        )
        fout.write(f"{json.dumps(record)}\n")
111
+
112
+
113
def upvote_last_response(state, model_selector, request: gr.Request):
    """Log an "upvote" for the last response.

    Returns an empty string followed by three disabled-button updates.
    """
    logger.info(f"upvote. ip: {request.client.host}")
    vote_last_response(state, "upvote", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)
117
+
118
+
119
def downvote_last_response(state, model_selector, request: gr.Request):
    """Log a "downvote" for the last response.

    Returns an empty string followed by three disabled-button updates.
    """
    logger.info(f"downvote. ip: {request.client.host}")
    vote_last_response(state, "downvote", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)
123
+
124
+
125
def flag_last_response(state, model_selector, request: gr.Request):
    """Log a "flag" for the last response.

    Returns an empty string followed by three disabled-button updates.
    """
    logger.info(f"flag. ip: {request.client.host}")
    vote_last_response(state, "flag", model_selector, request)
    return ("", disable_btn, disable_btn, disable_btn)
129
+
130
+
131
def regenerate(state, image_process_mode, request: gr.Request):
    """Clear the last reply so it can be generated again.

    The final message's content is reset to None. If the preceding message
    carries a tuple/list payload, its first two items are kept and the
    third is replaced with the current ``image_process_mode``.

    Returns ``(state, chatbot, "", None)`` plus five disabled-button updates.
    """
    logger.info(f"regenerate. ip: {request.client.host}")

    state.messages[-1][-1] = None

    last_user_turn = state.messages[-2]
    payload = last_user_turn[1]
    if type(payload) in (tuple, list):
        kept = payload[:2]
        last_user_turn[1] = (*kept, image_process_mode)

    state.skip_next = False
    return (state, state.to_gradio_chatbot(), "", None, *([disable_btn] * 5))
139
+
140
+
141
def clear_history(request: gr.Request):
    """Start over with a fresh copy of the default conversation.

    Returns ``(state, chatbot, "", None)`` plus five disabled-button updates.
    """
    logger.info(f"clear_history. ip: {request.client.host}")
    fresh_state = default_conversation.copy()
    return (
        fresh_state,
        fresh_state.to_gradio_chatbot(),
        "",
        None,
        *([disable_btn] * 5),
    )
145
+
146
+
147
def add_text(state, text, image, image_process_mode, request: gr.Request):
    """Append the user's turn (text and optional image) to the conversation.

    Empty submissions and, when ``args.moderate`` is set, text flagged by
    ``violates_moderation`` mark the state with ``skip_next = True`` and
    leave the buttons unchanged. Otherwise the text is truncated, bundled
    with the image when one is given, and appended together with an empty
    slot for the reply.

    Returns ``(state, chatbot, textbox_value, None)`` plus five button updates.
    """
    logger.info(f"add_text. ip: {request.client.host}. len: {len(text)}")

    unchanged_buttons = (no_change_btn,) * 5

    # Nothing to send: neither text nor image.
    if image is None and len(text) == 0:
        state.skip_next = True
        return (state, state.to_gradio_chatbot(), "", None) + unchanged_buttons

    if args.moderate and violates_moderation(text):
        state.skip_next = True
        return (
            state,
            state.to_gradio_chatbot(),
            moderation_msg,
            None,
        ) + unchanged_buttons

    message = text[:1536]  # Hard cut-off
    if image is not None:
        message = message[:1200]  # Hard cut-off for images
        if "<image>" not in message:
            # text = '<Image><image></Image>' + text
            message = message + "\n<image>"
        message = (message, image, image_process_mode)
        # A conversation that already holds an image is restarted.
        if len(state.get_images(return_pil=True)) > 0:
            state = default_conversation.copy()

    user_role, assistant_role = state.roles[0], state.roles[1]
    state.append_message(user_role, message)
    state.append_message(assistant_role, None)
    state.skip_next = False
    return (state, state.to_gradio_chatbot(), "", None) + (disable_btn,) * 5
173
+
174
+
175
def _select_template_name(model_name):
    """Pick the conversation-template key from substrings of the model name."""
    lowered = model_name.lower()
    if "llava" in lowered:
        if "llama-2" in lowered:
            return "llava_llama_2"
        # "plain" checkpoints that are not finetuned share the mmtag template.
        uses_mmtag = "mmtag" in lowered or (
            "plain" in lowered and "finetune" not in lowered
        )
        if "v1" in lowered:
            return "v1_mmtag" if uses_mmtag else "llava_v1"
        if "mpt" in lowered:
            return "mpt"
        return "v0_mmtag" if uses_mmtag else "llava_v0"
    # The non-LLaVA checks are case-sensitive, exactly as before.
    if "mpt" in model_name:
        return "mpt_text"
    if "llama-2" in model_name:
        return "llama_2"
    return "vicuna_v1"


def http_bot(
    state, model_selector, temperature, top_p, max_new_tokens, request: gr.Request
):
    """Stream a model reply for the current conversation from a worker.

    This is a generator. Each yielded value is a tuple of
    ``(state, chatbot, btn, btn, btn, btn, btn)`` where the last five items
    are button updates.

    Steps:
      1. If ``state.skip_next`` is set, yield the unchanged UI and stop.
      2. On the first round, rebuild the state from the conversation
         template that matches the model name.
      3. Ask the controller for a worker address; yield ``server_error_msg``
         if none is available.
      4. Save every conversation image under LOGDIR/serve_images/<date>/.
      5. Stream the worker's output into the last message, with a trailing
         cursor character while generation is in progress.
      6. Append a JSON record of the finished chat to today's log file.

    Args:
        state: conversation object (project type).
        model_selector: name of the model chosen in the UI.
        temperature: sampling temperature; converted with ``float``.
        top_p: nucleus-sampling value; converted with ``float``.
        max_new_tokens: requested token budget; converted with ``int`` and
            capped at 1536.
        request: Gradio request, used for the client's host.
    """
    logger.info(f"http_bot. ip: {request.client.host}")
    start_tstamp = time.time()
    model_name = model_selector

    # Button states shown after a failure: the last two buttons are re-enabled.
    error_btns = (disable_btn,) * 3 + (enable_btn,) * 2

    if state.skip_next:
        # This generate call is skipped due to invalid inputs
        yield (state, state.to_gradio_chatbot()) + (no_change_btn,) * 5
        return

    if len(state.messages) == state.offset + 2:
        # First round of conversation
        template_name = _select_template_name(model_name)
        new_state = conv_templates[template_name].copy()
        new_state.append_message(new_state.roles[0], state.messages[-2][1])
        new_state.append_message(new_state.roles[1], None)
        state = new_state

    # Query worker address
    controller_url = args.controller_url
    ret = requests.post(
        controller_url + "/get_worker_address", json={"model": model_name}
    )
    worker_addr = ret.json()["address"]
    logger.info(f"model_name: {model_name}, worker_addr: {worker_addr}")

    # No available worker
    if worker_addr == "":
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot()) + error_btns
        return

    # Construct prompt
    prompt = state.get_prompt()

    # Save each image once per day, named by the MD5 of its raw bytes.
    all_images = state.get_images(return_pil=True)
    all_image_hash = [hashlib.md5(image.tobytes()).hexdigest() for image in all_images]
    for image, image_hash in zip(all_images, all_image_hash):
        t = datetime.datetime.now()
        filename = os.path.join(
            LOGDIR,
            "serve_images",
            f"{t.year}-{t.month:02d}-{t.day:02d}",
            f"{image_hash}.jpg",
        )
        if not os.path.isfile(filename):
            os.makedirs(os.path.dirname(filename), exist_ok=True)
            image.save(filename)

    # Make requests
    pload = {
        "model": model_name,
        "prompt": prompt,
        "temperature": float(temperature),
        "top_p": float(top_p),
        "max_new_tokens": min(int(max_new_tokens), 1536),
        "stop": state.sep
        if state.sep_style in [SeparatorStyle.SINGLE, SeparatorStyle.MPT]
        else state.sep2,
        # Only a summary is logged; the real image payload is attached below.
        "images": f"List of {len(state.get_images())} images: {all_image_hash}",
    }
    logger.info(f"==== request ====\n{pload}")

    pload["images"] = state.get_images()

    state.messages[-1][-1] = "▌"
    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5

    # Defined up front so the final log line works even if the worker
    # stream ends without delivering a single chunk.
    output = ""
    try:
        # Stream output
        response = requests.post(
            worker_addr + "/worker_generate_stream",
            headers=headers,
            json=pload,
            stream=True,
            timeout=10,
        )
        for chunk in response.iter_lines(decode_unicode=False, delimiter=b"\0"):
            if chunk:
                data = json.loads(chunk.decode())
                if data["error_code"] == 0:
                    # The worker's text includes the prompt; strip it off.
                    output = data["text"][len(prompt) :].strip()
                    state.messages[-1][-1] = output + "▌"
                    yield (state, state.to_gradio_chatbot()) + (disable_btn,) * 5
                else:
                    output = data["text"] + f" (error_code: {data['error_code']})"
                    state.messages[-1][-1] = output
                    yield (state, state.to_gradio_chatbot()) + error_btns
                    return
                time.sleep(0.03)
    except requests.exceptions.RequestException as e:
        # Record why the request failed instead of dropping the exception.
        logger.info(f"http_bot request failed: {e}")
        state.messages[-1][-1] = server_error_msg
        yield (state, state.to_gradio_chatbot()) + error_btns
        return

    # Drop the trailing cursor character.
    state.messages[-1][-1] = state.messages[-1][-1][:-1]
    yield (state, state.to_gradio_chatbot()) + (enable_btn,) * 5

    finish_tstamp = time.time()
    logger.info(f"{output}")

    with open(get_conv_log_filename(), "a") as fout:
        data = {
            "tstamp": round(finish_tstamp, 4),
            "type": "chat",
            "model": model_name,
            "start": round(start_tstamp, 4),
            # Previously this logged start_tstamp, so duration was always 0.
            "finish": round(finish_tstamp, 4),
            "state": state.dict(),
            "images": all_image_hash,
            "ip": request.client.host,
        }
        fout.write(json.dumps(data) + "\n")
337
+
338
+
339
# Header shown at the top of the demo page (rendered by build_demo unless embed mode is on).
# NOTE(review): the table below says 4-bit is the default, while the launcher at the
# bottom of this file reads `bits` with a fallback of 8 — confirm which is intended.
title_markdown = """
# 🌋 LLaVA: Large Language and Vision Assistant
[[Project Page]](https://llava-vl.github.io) [[Paper]](https://arxiv.org/abs/2304.08485) [[Code]](https://github.com/haotian-liu/LLaVA) [[Model]](https://github.com/haotian-liu/LLaVA/blob/main/docs/MODEL_ZOO.md)
ONLY WORKS WITH GPU!
You can load the model with 4-bit or 8-bit quantization to make it fit in smaller hardwares. Setting the environment variable `bits` to control the quantization.
*Note: 8-bit seems to be slower than both 4-bit/16-bit. Although it has enough VRAM to support 8-bit, until we figure out the inference speed issue, we recommend 4-bit for A10G for the best efficiency.*
Recommended configurations:
| Hardware | T4-Small (16G) | A10G-Small (24G) | A100-Large (40G) |
|-------------------|-----------------|------------------|------------------|
| **Bits** | 4 (default) | 4 | 16 |
"""

# Terms-of-use notice rendered below the chat area (skipped in embed mode).
tos_markdown = """
### Terms of use
By using this service, users are required to agree to the following terms:
The service is a research preview intended for non-commercial use only. It only provides limited safety measures and may generate offensive content. It must not be used for any illegal, harmful, violent, racist, or sexual purposes. The service may collect user dialogue data for future research.
Please click the "Flag" button if you get any inappropriate answer! We will collect those to keep improving our moderator.
For an optimal experience, please use desktop computers for this demo, as mobile devices may compromise its quality.
"""


# License notice rendered after the terms of use (skipped in embed mode).
learn_more_markdown = """
### License
The service is a research preview intended for non-commercial use only, subject to the model [License](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md) of LLaMA, [Terms of Use](https://openai.com/policies/terms-of-use) of the data generated by OpenAI, and [Privacy Practices](https://chrome.google.com/webstore/detail/sharegpt-share-your-chatg/daiacboceoaocpibfodeljbdfacokfjb) of ShareGPT. Please contact us if you find any potential violation.
"""

# Extra CSS passed to gr.Blocks; targets the row created with elem_id="buttons".
block_css = """
#buttons button {
min-width: min(120px,100%);
}
"""
370
+
371
+
372
def build_demo(embed_mode):
    """Build the Gradio Blocks UI for the LLaVA chat demo and wire its events.

    Args:
        embed_mode: When truthy, the title, terms-of-use and license markdown
            blocks are not rendered.

    Returns:
        The constructed ``gr.Blocks`` object (not yet launched).

    Raises:
        ValueError: If the module-level ``args.model_list_mode`` is neither
            ``"once"`` nor ``"reload"``.
    """
    models = get_model_list()

    # The textbox is created before the Blocks context and placed later with
    # textbox.render(), so gr.Examples can reference it before its layout slot.
    textbox = gr.Textbox(
        show_label=False, placeholder="Enter text and press ENTER", container=False
    )
    with gr.Blocks(title="LLaVA", theme=gr.themes.Default(), css=block_css) as demo:
        # Per-session conversation state, seeded from a copy of the default template.
        state = gr.State(default_conversation.copy())

        if not embed_mode:
            gr.Markdown(title_markdown)

        with gr.Row():
            # Left column: model selector, image input, examples and sampling parameters.
            with gr.Column(scale=3):
                with gr.Row(elem_id="model_selector_row"):
                    model_selector = gr.Dropdown(
                        choices=models,
                        # Placeholder text and a disabled dropdown when no model is listed.
                        value=models[0] if models else "Downloading the models...",
                        interactive=True if models else False,
                        show_label=False,
                        container=False,
                    )

                imagebox = gr.Image(type="pil")
                # Created hidden; its value ("Default") is still passed to add_text/regenerate.
                image_process_mode = gr.Radio(
                    ["Crop", "Resize", "Pad", "Default"],
                    value="Default",
                    label="Preprocess for non-square image",
                    visible=False,
                )

                # Example images are resolved relative to this file's directory.
                cur_dir = os.path.dirname(os.path.abspath(__file__))
                gr.Examples(
                    examples=[
                        [
                            f"{cur_dir}/examples/extreme_ironing.jpg",
                            "What is unusual about this image?",
                        ],
                        [
                            f"{cur_dir}/examples/waterview.jpg",
                            "What are the things I should be cautious about when I visit here?",
                        ],
                    ],
                    inputs=[imagebox, textbox],
                )

                with gr.Accordion("Parameters", open=False) as parameter_row:
                    temperature = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.2,
                        step=0.1,
                        interactive=True,
                        label="Temperature",
                    )
                    top_p = gr.Slider(
                        minimum=0.0,
                        maximum=1.0,
                        value=0.7,
                        step=0.1,
                        interactive=True,
                        label="Top P",
                    )
                    max_output_tokens = gr.Slider(
                        minimum=0,
                        maximum=1024,
                        value=512,
                        step=64,
                        interactive=True,
                        label="Max output tokens",
                    )

            # Right column: chat transcript, text input and action buttons.
            with gr.Column(scale=8):
                chatbot = gr.Chatbot(
                    elem_id="chatbot", label="LLaVA Chatbot", height=550
                )
                with gr.Row():
                    with gr.Column(scale=8):
                        textbox.render()
                    with gr.Column(scale=1, min_width=50):
                        submit_btn = gr.Button(
                            value="Send", variant="primary", interactive=False
                        )
                # All action buttons start disabled; the handlers' outputs update them.
                with gr.Row(elem_id="buttons") as button_row:
                    upvote_btn = gr.Button(value="👍 Upvote", interactive=False)
                    downvote_btn = gr.Button(value="👎 Downvote", interactive=False)
                    flag_btn = gr.Button(value="⚠️ Flag", interactive=False)
                    # stop_btn = gr.Button(value="⏹️ Stop Generation", interactive=False)
                    regenerate_btn = gr.Button(value="🔄 Regenerate", interactive=False)
                    clear_btn = gr.Button(value="🗑️ Clear history", interactive=False)

        if not embed_mode:
            gr.Markdown(tos_markdown)
            gr.Markdown(learn_more_markdown)
        # Hidden JSON component used as the input of load_demo in "once" mode.
        url_params = gr.JSON(visible=False)

        # Register listeners
        # btn_list order must match the outputs produced by the handlers below.
        btn_list = [upvote_btn, downvote_btn, flag_btn, regenerate_btn, clear_btn]
        upvote_btn.click(
            upvote_last_response,
            [state, model_selector],
            [textbox, upvote_btn, downvote_btn, flag_btn],
        )
        downvote_btn.click(
            downvote_last_response,
            [state, model_selector],
            [textbox, upvote_btn, downvote_btn, flag_btn],
        )
        flag_btn.click(
            flag_last_response,
            [state, model_selector],
            [textbox, upvote_btn, downvote_btn, flag_btn],
        )
        # Regenerate, Enter in the textbox and the Send button all run a state
        # update first and then chain into http_bot via .then().
        regenerate_btn.click(
            regenerate,
            [state, image_process_mode],
            [state, chatbot, textbox, imagebox] + btn_list,
        ).then(
            http_bot,
            [state, model_selector, temperature, top_p, max_output_tokens],
            [state, chatbot] + btn_list,
        )
        clear_btn.click(
            clear_history, None, [state, chatbot, textbox, imagebox] + btn_list
        )

        textbox.submit(
            add_text,
            [state, textbox, imagebox, image_process_mode],
            [state, chatbot, textbox, imagebox] + btn_list,
        ).then(
            http_bot,
            [state, model_selector, temperature, top_p, max_output_tokens],
            [state, chatbot] + btn_list,
        )
        submit_btn.click(
            add_text,
            [state, textbox, imagebox, image_process_mode],
            [state, chatbot, textbox, imagebox] + btn_list,
        ).then(
            http_bot,
            [state, model_selector, temperature, top_p, max_output_tokens],
            [state, chatbot] + btn_list,
        )

        # `args` is the module-level namespace set in the __main__ block, not a parameter.
        if args.model_list_mode == "once":
            demo.load(
                load_demo,
                [url_params],
                [state, model_selector],
                _js=get_window_url_params,
            )
        elif args.model_list_mode == "reload":
            demo.load(
                load_demo_refresh_model_list, None, [state, model_selector, submit_btn]
            )
        else:
            raise ValueError(f"Unknown model list mode: {args.model_list_mode}")

    return demo
 
532
 
 
 
533
 
534
def start_controller():
    """Spawn the ``llava.serve.controller`` module as a child process.

    The controller is bound to host 0.0.0.0 and port 10000.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    logger.info("Starting the controller")
    argv = ["python", "-m", "llava.serve.controller"]
    argv += ["--host", "0.0.0.0"]
    argv += ["--port", "10000"]
    return subprocess.Popen(argv)
546
 
 
 
 
 
 
 
 
 
 
 
 
547
 
548
def start_worker(model_path: str, bits=16):
    """Spawn a ``llava.serve.model_worker`` child process for ``model_path``.

    The worker registers with the controller at http://localhost:10000. Its
    model name is the last path component of ``model_path``; for 4- or 8-bit
    loading a ``-{bits}bit`` suffix is appended and the matching
    ``--load-{bits}bit`` flag is passed.

    Args:
        model_path: Model path or hub identifier handed to the worker.
        bits: Load precision; must be 4, 8 or 16.

    Returns:
        The ``subprocess.Popen`` handle of the started process.
    """
    logger.info(f"Starting the model worker for the model {model_path}")
    name = model_path.strip("/").split("/")[-1]
    assert bits in [4, 8, 16], "It can be only loaded with 16-bit, 8-bit, and 4-bit."

    quantized = bits != 16
    if quantized:
        name = f"{name}-{bits}bit"

    argv = ["python", "-m", "llava.serve.model_worker"]
    argv += ["--host", "0.0.0.0"]
    argv += ["--controller", "http://localhost:10000"]
    argv += ["--model-path", model_path]
    argv += ["--model-name", name]
    if quantized:
        argv.append(f"--load-{bits}bit")
    return subprocess.Popen(argv)
570
 
 
 
 
 
 
571
 
572
def get_args():
    """Parse the demo's command-line options from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port``, ``controller_url``,
        ``concurrency_count``, ``model_list_mode``, ``share``, ``moderate``
        and ``embed``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--host", default="0.0.0.0", type=str)
    cli.add_argument("--port", type=int)
    cli.add_argument("--controller-url", default="http://localhost:10000", type=str)
    cli.add_argument("--concurrency-count", default=8, type=int)
    cli.add_argument(
        "--model-list-mode", default="reload", type=str, choices=["once", "reload"]
    )
    # The three boolean switches share the same definition.
    for switch in ("--share", "--moderate", "--embed"):
        cli.add_argument(switch, action="store_true")

    return cli.parse_args()
 
588
 
 
589
 
590
def start_demo(args):
    """Build the UI, enable its request queue and launch the web server.

    Args:
        args: Namespace providing ``embed``, ``concurrency_count``, ``host``,
            ``port`` and ``share``.
    """
    ui = build_demo(args.embed)
    queued = ui.queue(
        concurrency_count=args.concurrency_count,
        status_update_rate=10,
        api_open=False,
    )
    queued.launch(server_name=args.host, server_port=args.port, share=args.share)
595
 
 
 
 
 
596
 
597
if __name__ == "__main__":
    # `args` must remain a module-level name: build_demo() reads args.model_list_mode.
    args = get_args()
    logger.info(f"args: {args}")

    # Quantization width comes from the `bits` environment variable (8 when unset).
    bits = int(os.getenv("bits", 8))
    model_path = "rogerxi/Spatial-LLaVA-7B"

    controller_proc = start_controller()
    worker_proc = start_worker(model_path, bits=bits)

    # Wait for worker and controller to start
    time.sleep(10)

    try:
        start_demo(args)
    except Exception as e:
        print(e)
        exit_status = 1
    else:
        exit_status = 0
    finally:
        # Always tear down both child processes, whether or not the demo failed.
        worker_proc.kill()
        controller_proc.kill()

    sys.exit(exit_status)
 
 
 
 
cog.yaml ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration for Cog ⚙️
2
+ # Reference: https://github.com/replicate/cog/blob/main/docs/yaml.md
3
+
4
+ build:
5
+ gpu: true
6
+
7
+ python_version: "3.11"
8
+
9
+ python_packages:
10
+ - "torch==2.0.1"
11
+ - "accelerate==0.21.0"
12
+ - "bitsandbytes==0.41.0"
13
+ - "deepspeed==0.9.5"
14
+ - "einops-exts==0.0.4"
15
+ - "einops==0.6.1"
16
+ - "gradio==3.35.2"
17
+ - "gradio_client==0.2.9"
18
+ - "httpx==0.24.0"
19
+ - "markdown2==2.4.10"
20
+ - "numpy==1.26.0"
21
+ - "peft==0.4.0"
22
+ - "scikit-learn==1.2.2"
23
+ - "sentencepiece==0.1.99"
24
+ - "shortuuid==1.0.11"
25
+ - "timm==0.6.13"
26
+ - "tokenizers==0.13.3"
27
+ - "torch==2.0.1"
28
+ - "torchvision==0.15.2"
29
+ - "transformers==4.31.0"
30
+ - "wandb==0.15.12"
31
+ - "wavedrom==2.0.3.post3"
32
+ - "Pygments==2.16.1"
33
+ run:
34
+ - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.0.3/pget" && chmod +x /usr/local/bin/pget
35
+
36
+ # predict.py defines how predictions are run on your model
37
+ predict: "predict.py:Predictor"
llava/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .model import LlavaLlamaForCausalLM
llava/constants.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Heartbeat timing for the controller/worker processes.
# NOTE(review): presumably seconds — confirm against the serve modules.
CONTROLLER_HEART_BEAT_EXPIRATION = 30
WORKER_HEART_BEAT_INTERVAL = 15

# Directory used for log output ("." = current working directory).
LOGDIR = "."

# Model Constants
# Sentinel indices; negative, so presumably outside the real vocabulary range — TODO confirm.
IGNORE_INDEX = -100
IMAGE_TOKEN_INDEX = -200
# Textual markers for image content inside prompts.
DEFAULT_IMAGE_TOKEN = "<image>"
DEFAULT_IMAGE_PATCH_TOKEN = "<im_patch>"
DEFAULT_IM_START_TOKEN = "<im_start>"
DEFAULT_IM_END_TOKEN = "<im_end>"
IMAGE_PLACEHOLDER = "<image-placeholder>"
llava/conversation.py ADDED
@@ -0,0 +1,396 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ from enum import auto, Enum
3
+ from typing import List, Tuple
4
+ import base64
5
+ from io import BytesIO
6
+ from PIL import Image
7
+
8
+
9
class SeparatorStyle(Enum):
    """Different separator style."""
    # Explicit values equal to what auto() assigns in declaration order.
    SINGLE = 1
    TWO = 2
    MPT = 3
    PLAIN = 4
    LLAMA_2 = 5
16
+
17
+
18
@dataclasses.dataclass
class Conversation:
    """A class that keeps all conversation history.

    Each entry of ``messages`` is a ``[role, message]`` pair. ``message`` is
    a string, a falsy value (a turn still to be generated), or a 3-tuple
    ``(text, image, image_process_mode)`` for a turn carrying an image.
    """
    # System prompt placed at the start of the rendered prompt.
    system: str
    # Two role names; index 0 is the user side, index 1 the assistant side.
    roles: List[str]
    messages: List[List[str]]
    # Number of leading messages skipped by get_images() and to_gradio_chatbot().
    offset: int
    sep_style: SeparatorStyle = SeparatorStyle.SINGLE
    sep: str = "###"
    # Second separator, used by the TWO, LLAMA_2 and PLAIN styles.
    sep2: str = None
    version: str = "Unknown"

    # Not carried over by copy(); not read anywhere inside this class.
    skip_next: bool = False

    def get_prompt(self):
        """Render the conversation as one prompt string according to ``sep_style``.

        Returns:
            The prompt text.

        Raises:
            ValueError: If ``sep_style`` is not a handled style.
        """
        messages = self.messages
        if len(messages) > 0 and type(messages[0][1]) is tuple:
            # First turn carries an image: work on a shallow copy so the stored
            # history keeps the tuple, and normalise the "<image>" placement.
            messages = self.messages.copy()
            init_role, init_msg = messages[0].copy()
            init_msg = init_msg[0].replace("<image>", "").strip()
            if 'mmtag' in self.version:
                # mmtag versions put the image in a separate leading exchange.
                messages[0] = (init_role, init_msg)
                messages.insert(0, (self.roles[0], "<Image><image></Image>"))
                messages.insert(1, (self.roles[1], "Received."))
            else:
                messages[0] = (init_role, "<image>\n" + init_msg)

        if self.sep_style == SeparatorStyle.SINGLE:
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + self.sep
                else:
                    # Empty message: leave the role open for generation.
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.TWO:
            # Even-indexed turns end with sep, odd-indexed turns with sep2.
            seps = [self.sep, self.sep2]
            ret = self.system + seps[0]
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + ": " + message + seps[i % 2]
                else:
                    ret += role + ":"
        elif self.sep_style == SeparatorStyle.MPT:
            # Roles are emitted as-is, with no ": " inserted after them.
            ret = self.system + self.sep
            for role, message in messages:
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += role + message + self.sep
                else:
                    ret += role
        elif self.sep_style == SeparatorStyle.LLAMA_2:
            wrap_sys = lambda msg: f"<<SYS>>\n{msg}\n<</SYS>>\n\n" if len(msg) > 0 else msg
            wrap_inst = lambda msg: f"[INST] {msg} [/INST]"
            ret = ""

            for i, (role, message) in enumerate(messages):
                if i == 0:
                    assert message, "first message should not be none"
                    assert role == self.roles[0], "first message should come from user"
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    # The system prompt is folded into the first user message.
                    if i == 0: message = wrap_sys(self.system) + message
                    if i % 2 == 0:
                        message = wrap_inst(message)
                        ret += self.sep + message
                    else:
                        ret += " " + message + " " + self.sep2
                else:
                    ret += ""
            # NOTE: str.lstrip removes any leading characters contained in sep
            # (a character set), not the literal prefix string.
            ret = ret.lstrip(self.sep)
        elif self.sep_style == SeparatorStyle.PLAIN:
            # No role names at all: only message text and alternating separators.
            seps = [self.sep, self.sep2]
            ret = self.system
            for i, (role, message) in enumerate(messages):
                if message:
                    if type(message) is tuple:
                        message, _, _ = message
                    ret += message + seps[i % 2]
                else:
                    ret += ""
        else:
            raise ValueError(f"Invalid style: {self.sep_style}")

        return ret

    def append_message(self, role, message):
        """Append one ``[role, message]`` turn to the history."""
        self.messages.append([role, message])

    def process_image(self, image, image_process_mode, return_pil=False, image_format='PNG', max_len=1344, min_len=672):
        """Preprocess an image and return it as a PIL image or a base64 string.

        Args:
            image: Image object exposing ``size``, ``mode``, ``resize`` and
                ``save`` (a PIL image in practice).
            image_process_mode: ``"Pad"`` (pad to a square), ``"Resize"``
                (force 336x336), or ``"Default"``/``"Crop"`` (left unchanged).
            return_pil: Return the image object instead of an encoded string.
            image_format: Format name passed to ``image.save`` when encoding.
            max_len: Images whose longer edge exceeds this are downscaled.
            min_len: Upper bound on the shorter edge after downscaling.

        Returns:
            The processed image, or its base64-encoded bytes as ``str``.

        Raises:
            ValueError: If ``image_process_mode`` is not recognised.
        """
        if image_process_mode == "Pad":
            def expand2square(pil_img, background_color=(122, 116, 104)):
                # Centre the image on a square canvas whose side is the longer edge.
                width, height = pil_img.size
                if width == height:
                    return pil_img
                elif width > height:
                    result = Image.new(pil_img.mode, (width, width), background_color)
                    result.paste(pil_img, (0, (width - height) // 2))
                    return result
                else:
                    result = Image.new(pil_img.mode, (height, height), background_color)
                    result.paste(pil_img, ((height - width) // 2, 0))
                    return result
            image = expand2square(image)
        elif image_process_mode in ["Default", "Crop"]:
            pass
        elif image_process_mode == "Resize":
            image = image.resize((336, 336))
        else:
            raise ValueError(f"Invalid image_process_mode: {image_process_mode}")
        if max(image.size) > max_len:
            # Downscale while keeping the aspect ratio: the shorter edge is capped
            # by min_len and by max_len / aspect_ratio, and is never enlarged.
            max_hw, min_hw = max(image.size), min(image.size)
            aspect_ratio = max_hw / min_hw
            shortest_edge = int(min(max_len / aspect_ratio, min_len, min_hw))
            longest_edge = int(shortest_edge * aspect_ratio)
            W, H = image.size
            if H > W:
                H, W = longest_edge, shortest_edge
            else:
                H, W = shortest_edge, longest_edge
            image = image.resize((W, H))
        if return_pil:
            return image
        else:
            buffered = BytesIO()
            image.save(buffered, format=image_format)
            img_b64_str = base64.b64encode(buffered.getvalue()).decode()
            return img_b64_str

    def get_images(self, return_pil=False):
        """Return the processed images of all user turns after ``offset``.

        Args:
            return_pil: Passed through to ``process_image``.

        Returns:
            A list with one processed image per image-carrying user turn.
        """
        images = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            # Even positions (relative to offset) are user turns.
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    image = self.process_image(image, image_process_mode, return_pil=return_pil)
                    images.append(image)
        return images

    def to_gradio_chatbot(self):
        """Convert the history after ``offset`` to ``[user, assistant]`` pairs.

        Image turns are rendered as an inline JPEG ``<img>`` data URI followed
        by the text with any ``<image>`` marker removed.
        """
        ret = []
        for i, (role, msg) in enumerate(self.messages[self.offset:]):
            if i % 2 == 0:
                if type(msg) is tuple:
                    msg, image, image_process_mode = msg
                    # Display always uses "Default" processing, whatever mode was stored.
                    img_b64_str = self.process_image(
                        image, "Default", return_pil=False,
                        image_format='JPEG')
                    img_str = f'<img src="data:image/jpeg;base64,{img_b64_str}" alt="user upload image" />'
                    msg = img_str + msg.replace('<image>', '').strip()
                    ret.append([msg, None])
                else:
                    ret.append([msg, None])
            else:
                # Assistant turn: fill the second slot of the latest pair.
                ret[-1][-1] = msg
        return ret

    def copy(self):
        """Return a new Conversation with a fresh list of ``[role, message]`` pairs.

        Message contents are shared with the original; ``skip_next`` is reset
        to its default.
        """
        return Conversation(
            system=self.system,
            roles=self.roles,
            messages=[[x, y] for x, y in self.messages],
            offset=self.offset,
            sep_style=self.sep_style,
            sep=self.sep,
            sep2=self.sep2,
            version=self.version)

    def dict(self):
        """Return a plain-dict snapshot of the conversation.

        When any user turn after ``offset`` carries an image, each image tuple
        is reduced to its text so the result holds no image objects.
        """
        # Detect image turns directly. This used to call get_images(), which
        # resized and base64-encoded every image only to test for presence.
        has_image = any(
            type(msg) is tuple
            for i, (_, msg) in enumerate(self.messages[self.offset:])
            if i % 2 == 0
        )
        if has_image:
            return {
                "system": self.system,
                "roles": self.roles,
                "messages": [[x, y[0] if type(y) is tuple else y] for x, y in self.messages],
                "offset": self.offset,
                "sep": self.sep,
                "sep2": self.sep2,
            }
        return {
            "system": self.system,
            "roles": self.roles,
            "messages": self.messages,
            "offset": self.offset,
            "sep": self.sep,
            "sep2": self.sep2,
        }
209
+
210
+
211
# Vicuna v0 template: SINGLE style with "###", seeded with one example exchange
# that offset=2 hides from get_images() / to_gradio_chatbot().
conv_vicuna_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
        ("Human", "What are the key differences between renewable and non-renewable energy sources?"),
        ("Assistant",
            "Renewable energy sources are those that can be replenished naturally in a relatively "
            "short amount of time, such as solar, wind, hydro, geothermal, and biomass. "
            "Non-renewable energy sources, on the other hand, are finite and will eventually be "
            "depleted, such as coal, oil, and natural gas. Here are some key differences between "
            "renewable and non-renewable energy sources:\n"
            "1. Availability: Renewable energy sources are virtually inexhaustible, while non-renewable "
            "energy sources are finite and will eventually run out.\n"
            "2. Environmental impact: Renewable energy sources have a much lower environmental impact "
            "than non-renewable sources, which can lead to air and water pollution, greenhouse gas emissions, "
            "and other negative effects.\n"
            "3. Cost: Renewable energy sources can be more expensive to initially set up, but they typically "
            "have lower operational costs than non-renewable sources.\n"
            "4. Reliability: Renewable energy sources are often more reliable and can be used in more remote "
            "locations than non-renewable sources.\n"
            "5. Flexibility: Renewable energy sources are often more flexible and can be adapted to different "
            "situations and needs, while non-renewable sources are more rigid and inflexible.\n"
            "6. Sustainability: Renewable energy sources are more sustainable over the long term, while "
            "non-renewable sources are not, and their depletion can lead to economic and social instability.\n")
    ),
    offset=2,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# Vicuna v1 template: TWO style, " " after user turns and "</s>" after assistant turns.
conv_vicuna_v1 = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
    "The assistant gives helpful, detailed, and polite answers to the user's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# Llama-2 chat template: LLAMA_2 style with "<s>" / "</s>" separators.
conv_llama_2 = Conversation(
    system="""You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.""",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

# Same formatting as conv_llama_2, with a vision-assistant system prompt.
conv_llava_llama_2 = Conversation(
    system="You are a helpful language and vision assistant. "
           "You are able to understand the visual content that the user provides, "
           "and assist the user with a variety of tasks using natural language.",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="<s>",
    sep2="</s>",
)

# MPT template: the role strings already contain the "<|im_start|>" markers.
conv_mpt = Conversation(
    system="""<|im_start|>system
A conversation between a user and an LLM-based AI assistant. The assistant gives helpful and honest answers.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

# PLAIN template: no system prompt and empty role names.
# NOTE(review): sep2 is left at None while the PLAIN branch of get_prompt() appends
# seps[i % 2]; an odd-indexed non-empty message would concatenate None — confirm usage.
conv_llava_plain = Conversation(
    system="",
    roles=("", ""),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.PLAIN,
    sep="\n",
)

# LLaVA v0 template: SINGLE style with "###", no seeded messages.
conv_llava_v0 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
)

# v0 variant whose version contains "mmtag", selecting the <Image>...</Image>
# branch in Conversation.get_prompt().
conv_llava_v0_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("Human", "Assistant"),
    messages=(
    ),
    offset=0,
    sep_style=SeparatorStyle.SINGLE,
    sep="###",
    version="v0_mmtag",
)

# LLaVA v1 template: TWO style with " " / "</s>".
conv_llava_v1 = Conversation(
    system="A chat between a curious human and an artificial intelligence assistant. "
           "The assistant gives helpful, detailed, and polite answers to the human's questions.",
    roles=("USER", "ASSISTANT"),
    version="v1",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
)

# v1 variant whose version contains "mmtag".
conv_llava_v1_mmtag = Conversation(
    system="A chat between a curious user and an artificial intelligence assistant. "
           "The assistant is able to understand the visual content that the user provides, and assist the user with a variety of tasks using natural language."
           "The visual content will be provided with the following format: <Image>visual content</Image>.",
    roles=("USER", "ASSISTANT"),
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.TWO,
    sep=" ",
    sep2="</s>",
    version="v1_mmtag",
)

# Mistral-instruct template: LLAMA_2 style with an empty system prompt and empty sep.
conv_mistral_instruct = Conversation(
    system="",
    roles=("USER", "ASSISTANT"),
    version="llama_v2",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.LLAMA_2,
    sep="",
    sep2="</s>",
)

# ChatML template formatted like conv_mpt, with a minimal system prompt.
conv_chatml_direct = Conversation(
    system="""<|im_start|>system
Answer the questions.""",
    roles=("<|im_start|>user\n", "<|im_start|>assistant\n"),
    version="mpt",
    messages=(),
    offset=0,
    sep_style=SeparatorStyle.MPT,
    sep="<|im_end|>",
)

# Module-wide default template. Note the registry's "default" key below maps to
# conv_vicuna_v0, not to this object.
default_conversation = conv_vicuna_v1
# Registry of templates by name; several names alias the same object.
conv_templates = {
    "default": conv_vicuna_v0,
    "v0": conv_vicuna_v0,
    "v1": conv_vicuna_v1,
    "vicuna_v1": conv_vicuna_v1,
    "llama_2": conv_llama_2,
    "mistral_instruct": conv_mistral_instruct,
    "chatml_direct": conv_chatml_direct,
    "mistral_direct": conv_chatml_direct,

    "plain": conv_llava_plain,
    "v0_plain": conv_llava_plain,
    "llava_v0": conv_llava_v0,
    "v0_mmtag": conv_llava_v0_mmtag,
    "llava_v1": conv_llava_v1,
    "v1_mmtag": conv_llava_v1_mmtag,
    "llava_llama_2": conv_llava_llama_2,

    "mpt": conv_mpt,
}


# Running the module directly prints the prompt of the default template.
if __name__ == "__main__":
    print(default_conversation.get_prompt())
llava/eval/eval_gpt_review.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import openai
6
+ import tqdm
7
+ import ray
8
+ import time
9
+
10
+ NUM_SECONDS_TO_SLEEP = 3
11
+
12
@ray.remote(num_cpus=4)
def get_eval(content: str, max_tokens: int):
    """Ask GPT-4 to review ``content``, retrying until a request succeeds.

    Rate-limit errors are retried silently; any other exception is printed
    and retried. Each failed attempt is followed by a pause of
    ``NUM_SECONDS_TO_SLEEP`` seconds.

    Args:
        content: User message holding the question, both answers and the rubric.
        max_tokens: Upper bound on the tokens of the completion.

    Returns:
        The text of the first choice in the response.
    """
    chat = [
        {
            'role': 'system',
            'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
        },
        {
            'role': 'user',
            'content': content,
        },
    ]
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4',
                messages=chat,
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
        except openai.error.RateLimitError:
            pass
        except Exception as err:
            print(err)
        else:
            break
        time.sleep(NUM_SECONDS_TO_SLEEP)

    print('success!')
    return response['choices'][0]['message']['content']
37
+
38
+
39
def parse_score(review):
    """Parse the score pair found on the first line of a review.

    The first line must hold exactly two numbers separated by commas and/or
    whitespace, e.g. ``"8 9"``, ``"8,9"`` or ``"8, 9"``.

    Args:
        review: Full review text returned by the evaluator.

    Returns:
        ``[score1, score2]`` as floats, or ``[-1, -1]`` when the first line
        cannot be parsed (the failure is printed).
    """
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        # split() collapses whitespace runs; split(' ') produced empty tokens
        # for inputs such as "8, 9" and rejected them.
        sp = score_pair.split()
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]
53
+
54
+
55
if __name__ == '__main__':
    # Script entry point: pair up questions with two answer files, ask GPT-4
    # (through ray tasks) to review each pair, and write one JSON line per review.
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    # Fail with a usage message instead of an IndexError further down.
    if len(args.answer_list) < 2:
        parser.error('--answer-list needs two answer files to compare')

    ray.init()

    with open(os.path.expanduser(args.rule), 'r') as f_rule:
        rule_dict = json.load(f_rule)

    js_list = []
    handles = []
    # Context managers guarantee the input files are closed even on errors.
    with open(os.path.expanduser(args.question)) as f_q, \
            open(os.path.expanduser(args.answer_list[0])) as f_ans1, \
            open(os.path.expanduser(args.answer_list[1])) as f_ans2:
        # The three files are read in lockstep; zip stops at the shortest one.
        for idx, (ques_js, ans1_js, ans2_js) in enumerate(zip(f_q, f_ans1, f_ans2)):
            ques = json.loads(ques_js)
            ans1 = json.loads(ans1_js)
            ans2 = json.loads(ans2_js)

            category = ques['category']
            # Categories without a dedicated rule fall back to the 'default' rule.
            if category in rule_dict:
                rule = rule_dict[category]
            else:
                rule = rule_dict['default']
            prompt = rule['prompt']
            role = rule['role']
            content = (f'[Question]\n{ques["text"]}\n\n'
                       f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                       f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                       f'[System]\n{prompt}\n\n')
            js_list.append({
                'id': idx + 1,
                'question_id': ques['question_id'],
                'answer1_id': ans1['answer_id'],
                'answer2_id': ans2['answer_id'],
                'category': category})
            handles.append(get_eval.remote(content, args.max_tokens))
            # To avoid the rate limit set by OpenAI
            time.sleep(NUM_SECONDS_TO_SLEEP)

    reviews = ray.get(handles)
    # The output path gets the same '~' expansion as the inputs, and the file is
    # only opened (and truncated) once all reviews are available.
    with open(os.path.expanduser(args.output), 'w') as review_file:
        for entry, review in zip(js_list, reviews):
            entry['content'] = review
            entry['tuple'] = parse_score(review)
            review_file.write(json.dumps(entry) + '\n')
llava/eval/eval_gpt_review_bench.py ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import openai
6
+ import time
7
+
8
+ NUM_SECONDS_TO_SLEEP = 0.5
9
+
10
+
11
def get_eval(content: str, max_tokens: int):
    """Ask ``gpt-4-0314`` to review ``content``, retrying until a request succeeds.

    Rate-limit errors are retried silently; any other exception is printed
    and retried. Each failed attempt is followed by a pause of
    ``NUM_SECONDS_TO_SLEEP`` seconds.

    Args:
        content: User message holding the context, question, answers and rubric.
        max_tokens: Upper bound on the tokens of the completion.

    Returns:
        The text of the first choice in the response.
    """
    chat = [
        {
            'role': 'system',
            'content': 'You are a helpful and precise assistant for checking the quality of the answer.'
        },
        {
            'role': 'user',
            'content': content,
        },
    ]
    while True:
        try:
            response = openai.ChatCompletion.create(
                model='gpt-4-0314',
                messages=chat,
                temperature=0.2,  # TODO: figure out which temperature is best for evaluation
                max_tokens=max_tokens,
            )
        except openai.error.RateLimitError:
            pass
        except Exception as err:
            print(err)
        else:
            break
        time.sleep(NUM_SECONDS_TO_SLEEP)

    return response['choices'][0]['message']['content']
34
+
35
+
36
def parse_score(review):
    """Parse the score pair found on the first line of a review.

    The first line must hold exactly two numbers separated by commas and/or
    whitespace, e.g. ``"8 9"``, ``"8,9"`` or ``"8, 9"``.

    Args:
        review: Full review text returned by the evaluator.

    Returns:
        ``[score1, score2]`` as floats, or ``[-1, -1]`` when the first line
        cannot be parsed (the failure is printed).
    """
    try:
        score_pair = review.split('\n')[0]
        score_pair = score_pair.replace(',', ' ')
        # split() collapses whitespace runs; split(' ') produced empty tokens
        # for inputs such as "8, 9" and rejected them.
        sp = score_pair.split()
        if len(sp) == 2:
            return [float(sp[0]), float(sp[1])]
        else:
            print('error', review)
            return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]
50
+
51
+
52
if __name__ == '__main__':
    # Script entry point: review answer pairs for image-grounded questions and
    # append one JSON line per review; reviews already in the output are skipped.
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    # Fail with a usage message instead of an IndexError further down.
    if len(args.answer_list) < 2:
        parser.error('--answer-list needs two answer files to compare')

    with open(os.path.expanduser(args.rule), 'r') as f_rule:
        rule_dict = json.load(f_rule)

    # One expanded path serves both the resume check and the append below;
    # previously the check used the expanded path but the write did not.
    output_path = os.path.expanduser(args.output)
    if os.path.isfile(output_path):
        with open(output_path) as f_prev:
            cur_reviews = [json.loads(line) for line in f_prev]
    else:
        cur_reviews = []

    with open(os.path.expanduser(args.context)) as f_ctx:
        context_list = [json.loads(line) for line in f_ctx]
    image_to_context = {context['image']: context for context in context_list}

    with open(output_path, 'a') as review_file, \
            open(os.path.expanduser(args.question)) as f_q, \
            open(os.path.expanduser(args.answer_list[0])) as f_ans1, \
            open(os.path.expanduser(args.answer_list[1])) as f_ans2:
        # The three files are read in lockstep; zip stops at the shortest one.
        for idx, (ques_js, ans1_js, ans2_js) in enumerate(zip(f_q, f_ans1, f_ans2)):
            ques = json.loads(ques_js)
            ans1 = json.loads(ans1_js)
            ans2 = json.loads(ans2_js)

            inst = image_to_context[ques['image']]

            # A caption may be one string or a list of strings.
            if isinstance(inst['caption'], list):
                cap_str = '\n'.join(inst['caption'])
            else:
                cap_str = inst['caption']

            category = 'llava_bench_' + ques['category']
            if category not in rule_dict:
                # Explicit raise: an assert would be stripped under `python -O`.
                raise ValueError(f"Visual QA category not found in rule file: {category}.")
            rule = rule_dict[category]
            prompt = rule['prompt']
            role = rule['role']
            content = (f'[Context]\n{cap_str}\n\n'
                       f'[Question]\n{ques["text"]}\n\n'
                       f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                       f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                       f'[System]\n{prompt}\n\n')
            cur_js = {
                'id': idx + 1,
                'question_id': ques['question_id'],
                # Fall back to question_id when an answer has no answer_id. The
                # second answer previously fell back to its own missing key.
                'answer1_id': ans1['answer_id'] if 'answer_id' in ans1 else ans1['question_id'],
                'answer2_id': ans2['answer_id'] if 'answer_id' in ans2 else ans2['question_id'],
                'category': category
            }
            if idx >= len(cur_reviews):
                review = get_eval(content, args.max_tokens)
                scores = parse_score(review)
                cur_js['content'] = review
                cur_js['tuple'] = scores
                review_file.write(json.dumps(cur_js) + '\n')
                # Flush per review so an interrupted run can resume from here.
                review_file.flush()
            else:
                print(f'Skipping {idx} as we already have it.')
            # Progress counter: number of questions handled so far.
            print(idx + 1)
llava/eval/eval_gpt_review_visual.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import openai
6
+ import time
7
+
8
+ NUM_SECONDS_TO_SLEEP = 0.5
9
+
10
+
11
def get_eval(content: str, max_tokens: int):
    """Request a GPT-4 review of ``content`` and return the reply text.

    The request is repeated until it succeeds. A rate-limit error is retried
    silently; any other exception is printed before retrying. The loop waits
    ``NUM_SECONDS_TO_SLEEP`` seconds after every failed attempt.

    Args:
        content: Text sent as the user message.
        max_tokens: Maximum number of tokens the model may produce.

    Returns:
        The message content of the first choice in the response.
    """
    request = {
        'model': 'gpt-4-0314',
        'messages': [
            {
                'role': 'system',
                'content': 'You are a helpful and precise assistant for checking the quality of the answer.',
            },
            {
                'role': 'user',
                'content': content,
            },
        ],
        # TODO: figure out which temperature is best for evaluation
        'temperature': 0.2,
        'max_tokens': max_tokens,
    }
    while True:
        try:
            response = openai.ChatCompletion.create(**request)
        except openai.error.RateLimitError:
            pass
        except Exception as err:
            print(err)
        else:
            # Success: the reply is extracted outside the guarded call so that
            # a malformed response still propagates to the caller.
            return response['choices'][0]['message']['content']
        time.sleep(NUM_SECONDS_TO_SLEEP)
34
+
35
+
36
def parse_score(review):
    """Extract the pair of scores from the first line of a review.

    The first line is expected to hold exactly two numbers separated by
    commas and/or whitespace (e.g. ``"8 9"``, ``"8,9"`` or ``"8, 9"``).

    Args:
        review: Full review text returned by the evaluator model.

    Returns:
        ``[score1, score2]`` as floats, or ``[-1, -1]`` when the first line
        does not contain exactly two parseable numbers (the review is
        printed in that case).
    """
    try:
        first_line = review.split('\n')[0]
        # Split on runs of whitespace so that "8, 9" or padded lines do not
        # produce empty fields (which the old `split(' ')` counted as items).
        fields = first_line.replace(',', ' ').split()
        if len(fields) == 2:
            return [float(fields[0]), float(fields[1])]
        print('error', review)
        return [-1, -1]
    except Exception as e:
        print(e)
        print('error', review)
        return [-1, -1]
50
+
51
+
52
if __name__ == '__main__':
    # Script entry point: pairs up questions with two answer files, asks the
    # evaluator model to review each pair and appends one JSON line per review
    # to the output file. Reviews already present in the output are skipped,
    # so an interrupted run can be resumed.
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-c', '--context')
    parser.add_argument('-a', '--answer-list', nargs='+', default=[])
    parser.add_argument('-r', '--rule')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    if len(args.answer_list) < 2:
        parser.error('--answer-list requires two answer files')

    # Expand "~" once so that the existence check and the append below always
    # refer to the same file.
    output_path = os.path.expanduser(args.output)

    with open(os.path.expanduser(args.rule), 'r') as f:
        rule_dict = json.load(f)

    if os.path.isfile(output_path):
        with open(output_path) as f:
            cur_reviews = [json.loads(line) for line in f]
    else:
        cur_reviews = []

    with open(os.path.expanduser(args.context)) as f:
        context_list = [json.loads(line) for line in f]
    image_to_context = {context['image']: context for context in context_list}

    with open(os.path.expanduser(args.question)) as f_q, \
            open(os.path.expanduser(args.answer_list[0])) as f_ans1, \
            open(os.path.expanduser(args.answer_list[1])) as f_ans2, \
            open(output_path, 'a') as review_file:
        for idx, (ques_js, ans1_js, ans2_js) in enumerate(zip(f_q, f_ans1, f_ans2)):
            ques = json.loads(ques_js)
            ans1 = json.loads(ans1_js)
            ans2 = json.loads(ans2_js)

            inst = image_to_context[ques['image']]
            cap_str = '\n'.join(inst['captions'])
            box_str = '\n'.join([f'{instance["category"]}: {instance["bbox"]}' for instance in inst['instances']])

            category = ques['category']
            if category not in rule_dict:
                raise ValueError(f"Visual QA category not found in rule file: {category}.")
            rule = rule_dict[category]
            prompt = rule['prompt']
            role = rule['role']
            content = (f'[Context]\n{cap_str}\n\n{box_str}\n\n'
                       f'[Question]\n{ques["text"]}\n\n'
                       f'[{role} 1]\n{ans1["text"]}\n\n[End of {role} 1]\n\n'
                       f'[{role} 2]\n{ans2["text"]}\n\n[End of {role} 2]\n\n'
                       f'[System]\n{prompt}\n\n')
            cur_js = {
                'id': idx + 1,
                'question_id': ques['question_id'],
                # Fall back to the question id when an answer carries no id of
                # its own (the second lookup used to fall back to the very key
                # that was missing).
                'answer1_id': ans1['answer_id'] if 'answer_id' in ans1 else ans1['question_id'],
                'answer2_id': ans2['answer_id'] if 'answer_id' in ans2 else ans2['question_id'],
                'category': category
            }
            if idx >= len(cur_reviews):
                review = get_eval(content, args.max_tokens)
                scores = parse_score(review)
                cur_js['content'] = review
                cur_js['tuple'] = scores
                review_file.write(json.dumps(cur_js) + '\n')
                # Flush after every review so progress survives a crash.
                review_file.flush()
            else:
                print(f'Skipping {idx} as we already have it.')
            print(idx + 1)
llava/eval/eval_pope.py ADDED
@@ -0,0 +1,81 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+
5
def eval_pope(answers, label_file):
    """Score yes/no answers against a POPE label file and print the metrics.

    Every answer's ``'text'`` is rewritten **in place** to ``'yes'`` or
    ``'no'``: only the part before the first ``'.'`` is considered, commas are
    removed, and the answer counts as ``'no'`` when the words ``No``, ``no``
    or ``not`` occur in it. Predictions and labels are paired positionally.

    Prints the confusion counts, accuracy, precision, recall, F1 and the
    ratio of "yes" predictions. A metric whose denominator is zero is
    reported as ``0.0`` instead of raising ``ZeroDivisionError``.

    Args:
        answers: List of dicts with a ``'text'`` entry (mutated).
        label_file: Path to a JSON-lines file whose records have a ``'label'``
            entry; ``'no'`` is the negative class, anything else is positive.
    """
    def _ratio(numerator, denominator):
        # Guarded division: degenerate inputs yield 0.0 rather than a crash.
        return numerator / denominator if denominator else 0.0

    with open(label_file, 'r') as f:
        label_list = [0 if json.loads(line)['label'] == 'no' else 1 for line in f]

    for answer in answers:
        # Only keep the first sentence
        text = answer['text'].split('.')[0]
        words = text.replace(',', '').split(' ')
        if 'No' in words or 'not' in words or 'no' in words:
            answer['text'] = 'no'
        else:
            answer['text'] = 'yes'

    pred_list = [0 if answer['text'] == 'no' else 1 for answer in answers]

    pos = 1
    neg = 0
    yes_ratio = _ratio(pred_list.count(1), len(pred_list))

    TP, TN, FP, FN = 0, 0, 0, 0
    for pred, label in zip(pred_list, label_list):
        if pred == pos and label == pos:
            TP += 1
        elif pred == pos and label == neg:
            FP += 1
        elif pred == neg and label == neg:
            TN += 1
        elif pred == neg and label == pos:
            FN += 1

    print('TP\tFP\tTN\tFN\t')
    print('{}\t{}\t{}\t{}'.format(TP, FP, TN, FN))

    precision = _ratio(TP, TP + FP)
    recall = _ratio(TP, TP + FN)
    f1 = _ratio(2 * precision * recall, precision + recall)
    acc = _ratio(TP + TN, TP + TN + FP + FN)
    print('Accuracy: {}'.format(acc))
    print('Precision: {}'.format(precision))
    print('Recall: {}'.format(recall))
    print('F1 score: {}'.format(f1))
    print('Yes ratio: {}'.format(yes_ratio))
    print('%.3f, %.3f, %.3f, %.3f, %.3f' % (f1, acc, precision, recall, yes_ratio) )
63
+
64
if __name__ == "__main__":
    # Script entry point: evaluates the result file once per annotation file
    # found in --annotation-dir, using each question's 'category' to select
    # the answers that belong to that annotation file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--annotation-dir", type=str)
    parser.add_argument("--question-file", type=str)
    parser.add_argument("--result-file", type=str)
    args = parser.parse_args()

    with open(args.question_file) as f:
        questions = [json.loads(line) for line in f]
    questions = {question['question_id']: question for question in questions}
    with open(args.result_file) as f:
        answers = [json.loads(q) for q in f]

    prefix, suffix = 'coco_pope_', '.json'
    # Sorted so that the report order does not depend on directory order.
    for file in sorted(os.listdir(args.annotation_dir)):
        if not (file.startswith(prefix) and file.endswith(suffix)):
            raise ValueError(f'Unexpected file in annotation dir: {file}')
        # The category is the part of the name between prefix and suffix.
        category = file[len(prefix):-len(suffix)]
        cur_answers = [x for x in answers if questions[x['question_id']]['category'] == category]
        print('Category: {}, # samples: {}'.format(category, len(cur_answers)))
        eval_pope(cur_answers, os.path.join(args.annotation_dir, file))
        print("====================================")
llava/eval/eval_science_qa.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ import random
6
+
7
+
8
def get_args():
    """Parse the command-line options of the ScienceQA evaluation script.

    Returns:
        The ``argparse.Namespace`` with ``base_dir``, ``result_file``,
        ``output_file``, ``output_result``, ``split`` (default ``'test'``)
        and ``options`` (default ``['A', 'B', 'C', 'D', 'E']``).
    """
    cli = argparse.ArgumentParser()
    for flag in ('--base-dir', '--result-file', '--output-file', '--output-result'):
        cli.add_argument(flag, type=str)
    cli.add_argument('--split', type=str, default='test')
    # type=list turns a value given on the command line into its characters.
    cli.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return cli.parse_args()
17
+
18
+
19
def convert_caps(results):
    """Convert result records into caption-style records.

    Args:
        results: Iterable of dicts with ``'question_id'`` and ``'text'``.

    Returns:
        A list of ``{"image_id": int, "caption": str}`` dicts, one per result,
        where ``image_id`` is the question id converted with ``int``.
    """
    return [
        {"image_id": int(entry['question_id']), "caption": entry['text']}
        for entry in results
    ]
26
+
27
+
28
def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')

    Args:
        prediction: The predicted option letter.
        choices: The answer choices of the problem; only the first
            ``len(choices)`` entries of ``options`` are valid predictions.
        options: The ordered option letters.

    Returns:
        The index of ``prediction`` in ``options``, or ``-1`` when the
        prediction is not a valid option for this problem.
    """
    if prediction in options[:len(choices)]:
        return options.index(prediction)
    # Invalid predictions are reported as -1 (the random-guess fallback that
    # used to follow this return could never execute and was removed).
    return -1
37
+
38
+
39
if __name__ == "__main__":
    # Script entry point: scores model predictions on one ScienceQA split and
    # writes a per-question analysis plus a summary to two JSON files.
    args = get_args()

    base_dir = args.base_dir
    with open(os.path.join(base_dir, "pid_splits.json")) as f:
        split_indices = json.load(f)[args.split]
    with open(os.path.join(base_dir, "problems.json")) as f:
        problems = json.load(f)
    with open(args.result_file) as f:
        predictions = [json.loads(line) for line in f]
    predictions = {pred['question_id']: pred for pred in predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    results = {'correct': [], 'incorrect': []}
    sqa_results = {
        'acc': None,
        'correct': None,
        'count': None,
        'results': {},
        'outputs': {},
    }

    # Compiled once instead of once per problem.
    pattern = re.compile(r'The answer is ([A-Z]).')

    for prob_id, prob in split_problems.items():
        if prob_id not in predictions:
            # A missing prediction is scored as a failed answer.
            pred = {'text': 'FAILED', 'prompt': 'Unknown'}
            pred_text = 'FAILED'
        else:
            pred = predictions[prob_id]
            pred_text = pred['text']

        if pred_text in args.options:
            # The whole output is a bare option letter.
            answer = pred_text
        elif len(pred_text) >= 3 and pred_text[0] in args.options and pred_text[1:3] == ". ":
            # Output of the form "B. some text".
            answer = pred_text[0]
        else:
            res = pattern.findall(pred_text)
            if len(res) == 1:
                answer = res[0]  # 'A', 'B', ...
            else:
                answer = "FAILED"

        pred_idx = get_pred_idx(answer, prob['choices'], args.options)

        analysis = {
            'question_id': prob_id,
            'parsed_ans': answer,
            'ground_truth': args.options[prob['answer']],
            'question': pred['prompt'],
            'pred': pred_text,
            'is_multimodal': '<image>' in pred['prompt'],
        }

        sqa_results['results'][prob_id] = pred_idx
        sqa_results['outputs'][prob_id] = pred_text

        if pred_idx == prob['answer']:
            results['correct'].append(analysis)
        else:
            results['incorrect'].append(analysis)

    correct = len(results['correct'])
    total = len(results['correct']) + len(results['incorrect'])

    ###### IMG ######
    multimodal_correct = len([x for x in results['correct'] if x['is_multimodal']])
    multimodal_incorrect = len([x for x in results['incorrect'] if x['is_multimodal']])
    multimodal_total = multimodal_correct + multimodal_incorrect
    ###### IMG ######

    # Report 0 instead of dividing by zero when a denominator is empty.
    accuracy = correct / total * 100 if total else 0.0
    img_accuracy = multimodal_correct / multimodal_total * 100 if multimodal_total else 0.0

    print(f'Total: {total}, Correct: {correct}, Accuracy: {accuracy:.2f}%, IMG-Accuracy: {img_accuracy:.2f}%')

    sqa_results['acc'] = accuracy
    sqa_results['correct'] = correct
    sqa_results['count'] = total

    with open(args.output_file, 'w') as f:
        json.dump(results, f, indent=2)
    with open(args.output_result, 'w') as f:
        json.dump(sqa_results, f, indent=2)
llava/eval/eval_science_qa_gpt4.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ import random
6
+ from collections import defaultdict
7
+
8
+
9
def get_args():
    """Parse the command-line options of the GPT-4 comparison script.

    Returns:
        The ``argparse.Namespace`` with ``base_dir``, ``gpt4_result``,
        ``our_result``, ``split`` (default ``'test'``) and ``options``
        (default ``['A', 'B', 'C', 'D', 'E']``).
    """
    cli = argparse.ArgumentParser()
    for flag in ('--base-dir', '--gpt4-result', '--our-result'):
        cli.add_argument(flag, type=str)
    cli.add_argument('--split', type=str, default='test')
    # type=list turns a value given on the command line into its characters.
    cli.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return cli.parse_args()
17
+
18
+
19
def convert_caps(results):
    """Convert result records into caption-style records.

    Args:
        results: Iterable of dicts with ``'question_id'`` and ``'text'``.

    Returns:
        A list of ``{"image_id": int, "caption": str}`` dicts, one per result,
        where ``image_id`` is the question id converted with ``int``.
    """
    return [
        {"image_id": int(entry['question_id']), "caption": entry['text']}
        for entry in results
    ]
26
+
27
+
28
def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')

    When the prediction is not one of the first ``len(choices)`` option
    letters, a random choice index is returned instead.
    """
    valid_letters = options[:len(choices)]
    if prediction not in valid_letters:
        # Unparseable / out-of-range prediction: guess uniformly.
        return random.choice(range(len(choices)))
    return options.index(prediction)
36
+
37
+
38
if __name__ == "__main__":
    # Script entry point: scores GPT-4 answers on one ScienceQA split, falling
    # back to our model's answer whenever GPT-4 gave no parseable answer.
    args = get_args()

    base_dir = args.base_dir
    with open(os.path.join(base_dir, "pid_splits.json")) as f:
        split_indices = json.load(f)[args.split]
    with open(os.path.join(base_dir, "problems.json")) as f:
        problems = json.load(f)
    with open(args.our_result) as f:
        our_predictions = [json.loads(line) for line in f]
    our_predictions = {pred['question_id']: pred for pred in our_predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    with open(args.gpt4_result) as f:
        gpt4_predictions = json.load(f)['outputs']

    results = defaultdict(int)

    # Compiled once instead of once per problem.
    pattern = re.compile(r'The answer is ([A-Z]).')

    for prob_id, prob in split_problems.items():
        # Only problems answered by both systems are scored.
        if prob_id not in our_predictions:
            continue
        if prob_id not in gpt4_predictions:
            continue
        our_pred = our_predictions[prob_id]['text']
        gpt4_pred = gpt4_predictions[prob_id]

        our_res = pattern.findall(our_pred)
        if len(our_res) == 1:
            our_answer = our_res[0]  # 'A', 'B', ...
        else:
            our_answer = "FAILED"
        gpt4_res = pattern.findall(gpt4_pred)
        if len(gpt4_res) == 1:
            gpt4_answer = gpt4_res[0]  # 'A', 'B', ...
        else:
            gpt4_answer = "FAILED"

        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)

        if gpt4_answer == 'FAILED':
            results['gpt4_failed'] += 1
            # GPT-4 gave no usable answer: use our model's answer instead.
            gpt4_pred_idx = our_pred_idx

        if gpt4_pred_idx == prob['answer']:
            results['correct'] += 1
        else:
            results['incorrect'] += 1

        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
            results['correct_upperbound'] += 1

    correct = results['correct']
    total = results['correct'] + results['incorrect']

    def percent(count):
        # Report 0 instead of dividing by zero when nothing was scored.
        return count / total * 100 if total else 0.0

    print(f'Total: {total}, Correct: {correct}, Accuracy: {percent(correct):.2f}%')
    print(f'Total: {total}, Correct (upper): {results["correct_upperbound"]}, Accuracy: {percent(results["correct_upperbound"]):.2f}%')
    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {percent(results["gpt4_failed"]):.2f}%')
104
+
llava/eval/eval_science_qa_gpt4_requery.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ import random
6
+ from collections import defaultdict
7
+
8
+
9
def get_args():
    """Parse the command-line options of the GPT-4 requery evaluation script.

    Returns:
        The ``argparse.Namespace`` with ``base_dir``, ``gpt4_result``,
        ``requery_result``, ``our_result``, ``output_result``, ``split``
        (default ``'test'``) and ``options`` (default
        ``['A', 'B', 'C', 'D', 'E']``).
    """
    cli = argparse.ArgumentParser()
    string_flags = (
        '--base-dir',
        '--gpt4-result',
        '--requery-result',
        '--our-result',
        '--output-result',
    )
    for flag in string_flags:
        cli.add_argument(flag, type=str)
    cli.add_argument('--split', type=str, default='test')
    # type=list turns a value given on the command line into its characters.
    cli.add_argument('--options', type=list, default=["A", "B", "C", "D", "E"])
    return cli.parse_args()
19
+
20
+
21
def convert_caps(results):
    """Convert result records into caption-style records.

    Args:
        results: Iterable of dicts with ``'question_id'`` and ``'text'``.

    Returns:
        A list of ``{"image_id": int, "caption": str}`` dicts, one per result,
        where ``image_id`` is the question id converted with ``int``.
    """
    return [
        {"image_id": int(entry['question_id']), "caption": entry['text']}
        for entry in results
    ]
28
+
29
+
30
def get_pred_idx(prediction, choices, options):
    """
    Get the index (e.g. 2) from the prediction (e.g. 'C')

    When the prediction is not one of the first ``len(choices)`` option
    letters, a random choice index is returned instead.
    """
    valid_letters = options[:len(choices)]
    if prediction not in valid_letters:
        # Unparseable / out-of-range prediction: guess uniformly.
        return random.choice(range(len(choices)))
    return options.index(prediction)
38
+
39
+
40
if __name__ == "__main__":
    # Script entry point: compares our model, GPT-4 and the GPT-4 "requery"
    # answers on one ScienceQA split, prints the accuracies and writes the
    # requery-based summary to --output-result.
    args = get_args()

    base_dir = args.base_dir
    with open(os.path.join(base_dir, "pid_splits.json")) as f:
        split_indices = json.load(f)[args.split]
    with open(os.path.join(base_dir, "problems.json")) as f:
        problems = json.load(f)
    with open(args.our_result) as f:
        our_predictions = [json.loads(line) for line in f]
    our_predictions = {pred['question_id']: pred for pred in our_predictions}
    split_problems = {idx: problems[idx] for idx in split_indices}

    with open(args.requery_result) as f:
        requery_predictions = [json.loads(line) for line in f]
    requery_predictions = {pred['question_id']: pred for pred in requery_predictions}

    with open(args.gpt4_result) as f:
        gpt4_predictions = json.load(f)['outputs']

    results = defaultdict(int)

    sqa_results = {
        'acc': None,
        'correct': None,
        'count': None,
        'results': {},
        'outputs': {},
    }

    # Compiled once instead of once per problem.
    pattern = re.compile(r'The answer is ([A-Z]).')

    def extract_answer(text):
        # Exactly one "The answer is X" match counts; anything else failed.
        found = pattern.findall(text)
        return found[0] if len(found) == 1 else "FAILED"

    for prob_id, prob in split_problems.items():
        # Both base prediction sets must cover the whole split.
        if prob_id not in our_predictions:
            raise ValueError(f'No prediction from our model for problem {prob_id}')
        if prob_id not in gpt4_predictions:
            raise ValueError(f'No GPT-4 prediction for problem {prob_id}')
        our_pred = our_predictions[prob_id]['text']
        gpt4_pred = gpt4_predictions[prob_id]
        if prob_id not in requery_predictions:
            results['missing_requery'] += 1
            requery_pred = "MISSING"
        else:
            requery_pred = requery_predictions[prob_id]['text']

        our_answer = extract_answer(our_pred)
        requery_answer = extract_answer(requery_pred)
        gpt4_answer = extract_answer(gpt4_pred)

        our_pred_idx = get_pred_idx(our_answer, prob['choices'], args.options)
        gpt4_pred_idx = get_pred_idx(gpt4_answer, prob['choices'], args.options)
        requery_pred_idx = get_pred_idx(requery_answer, prob['choices'], args.options)

        results['total'] += 1

        if gpt4_answer == 'FAILED':
            results['gpt4_failed'] += 1
            if gpt4_pred_idx == prob['answer']:
                results['gpt4_correct'] += 1
            if our_pred_idx == prob['answer']:
                results['gpt4_ourvisual_correct'] += 1
        elif gpt4_pred_idx == prob['answer']:
            results['gpt4_correct'] += 1
            results['gpt4_ourvisual_correct'] += 1

        if our_pred_idx == prob['answer']:
            results['our_correct'] += 1

        if requery_answer == 'FAILED':
            # No usable requery answer: fall back to our model's answer.
            sqa_results['results'][prob_id] = our_pred_idx
            if our_pred_idx == prob['answer']:
                results['requery_correct'] += 1
        else:
            sqa_results['results'][prob_id] = requery_pred_idx
            if requery_pred_idx == prob['answer']:
                results['requery_correct'] += 1
            else:
                # Dump the disagreement for inspection. The separator used to
                # be emitted as the literal text of a print(...) call that had
                # ended up inside the string.
                print(
                    f"\nQuestion ({args.options[prob['answer']]}): {our_predictions[prob_id]['prompt']}\n"
                    f"Our ({our_answer}): {our_pred}\n"
                    f"GPT-4 ({gpt4_answer}): {gpt4_pred}\n"
                    f"Requery ({requery_answer}): {requery_pred}\n"
                    "=====================================\n"
                )

        if gpt4_pred_idx == prob['answer'] or our_pred_idx == prob['answer']:
            results['correct_upperbound'] += 1

    total = results['total']

    def percent(count):
        # Report 0 instead of dividing by zero when the split is empty.
        return count / total * 100 if total else 0.0

    print(f'Total: {total}, Our-Correct: {results["our_correct"]}, Accuracy: {percent(results["our_correct"]):.2f}%')
    print(f'Total: {total}, GPT-4-Correct: {results["gpt4_correct"]}, Accuracy: {percent(results["gpt4_correct"]):.2f}%')
    print(f'Total: {total}, GPT-4 NO-ANS (RANDOM): {results["gpt4_failed"]}, Percentage: {percent(results["gpt4_failed"]):.2f}%')
    print(f'Total: {total}, GPT-4-OursVisual-Correct: {results["gpt4_ourvisual_correct"]}, Accuracy: {percent(results["gpt4_ourvisual_correct"]):.2f}%')
    print(f'Total: {total}, Requery-Correct: {results["requery_correct"]}, Accuracy: {percent(results["requery_correct"]):.2f}%')
    print(f'Total: {total}, Correct upper: {results["correct_upperbound"]}, Accuracy: {percent(results["correct_upperbound"]):.2f}%')

    sqa_results['acc'] = percent(results["requery_correct"])
    sqa_results['correct'] = results["requery_correct"]
    sqa_results['count'] = total

    with open(args.output_result, 'w') as f:
        json.dump(sqa_results, f, indent=2)
149
+
llava/eval/eval_textvqa.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ import json
4
+ import re
5
+
6
+ from llava.eval.m4c_evaluator import TextVQAAccuracyEvaluator
7
+
8
+
9
def get_args():
    """Parse the command-line options of the TextVQA evaluation script.

    Returns:
        The ``argparse.Namespace`` with ``annotation_file``, ``result_file``
        and ``result_dir`` (each ``None`` when not given).
    """
    cli = argparse.ArgumentParser()
    for flag in ('--annotation-file', '--result-file', '--result-dir'):
        cli.add_argument(flag, type=str)
    return cli.parse_args()
15
+
16
+
17
def prompt_processor(prompt):
    """Recover the lower-cased question text from a TextVQA prompt.

    Three prompt layouts are recognised:

    * ``"OCR tokens: ... Question: <q> Short answer:"`` - the text between
      ``"Question: "`` and ``" Short answer:"``;
    * a three-line prompt containing ``"Reference OCR token: "`` - the second
      line when the prompt starts with the OCR line, otherwise the first;
    * a two-line prompt - the first line.

    Args:
        prompt: The prompt string stored with a result.

    Returns:
        The question, lower-cased.

    Raises:
        ValueError: If the prompt matches none of the layouts above.
    """
    lines = prompt.split('\n')
    if prompt.startswith('OCR tokens: '):
        pattern = r"Question: (.*?) Short answer:"
        match = re.search(pattern, prompt, re.DOTALL)
        if match is None:
            # Previously surfaced as an AttributeError on None.group().
            raise ValueError(f'Cannot find the question in prompt: {prompt!r}')
        question = match.group(1)
    elif 'Reference OCR token: ' in prompt and len(lines) == 3:
        if prompt.startswith('Reference OCR token:'):
            question = lines[1]
        else:
            question = lines[0]
    elif len(lines) == 2:
        question = lines[0]
    else:
        # An explicit error survives `python -O`, unlike `assert False`.
        raise ValueError(f'Unrecognised prompt format: {prompt!r}')

    return question.lower()
33
+
34
+
35
def eval_single(annotation_file, result_file):
    """Evaluate one TextVQA result file and print its accuracy.

    Results are matched to annotations by ``(question_id, question)``, where
    the question is recovered from the result's prompt and compared
    lower-cased against the annotation's question.

    Args:
        annotation_file: JSON file whose ``'data'`` list holds annotations
            with ``'image_id'``, ``'question'`` and ``'answers'``.
        result_file: JSON-lines file of results with ``'question_id'``,
            ``'prompt'`` and ``'text'``.
    """
    experiment_name = os.path.splitext(os.path.basename(result_file))[0]
    print(experiment_name)
    # Files are closed as soon as they are parsed.
    with open(annotation_file) as f:
        annotations = json.load(f)['data']
    annotations = {(annotation['image_id'], annotation['question'].lower()): annotation for annotation in annotations}
    with open(result_file) as f:
        results = [json.loads(line) for line in f]

    pred_list = []
    for result in results:
        annotation = annotations[(result['question_id'], prompt_processor(result['prompt']))]
        pred_list.append({
            "pred_answer": result['text'],
            "gt_answers": annotation['answers'],
        })

    evaluator = TextVQAAccuracyEvaluator()
    print('Samples: {}\nAccuracy: {:.2f}%\n'.format(len(pred_list), 100. * evaluator.eval_pred_list(pred_list)))
52
+
53
+
54
if __name__ == "__main__":
    # Script entry point: evaluate a single result file and/or every
    # ``.jsonl`` file of a result directory (in sorted name order).
    args = get_args()

    single_file = args.result_file
    if single_file is not None:
        eval_single(args.annotation_file, single_file)

    directory = args.result_dir
    if directory is not None:
        for result_file in sorted(os.listdir(directory)):
            if result_file.endswith('.jsonl'):
                eval_single(args.annotation_file, os.path.join(directory, result_file))
            else:
                print(f'Skipping {result_file}')
llava/eval/generate_webpage_data_from_table.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate json file for webpage."""
2
+ import json
3
+ import os
4
+ import re
5
+
6
+ # models = ['llama', 'alpaca', 'gpt35', 'bard']
7
+ models = ['vicuna']
8
+
9
+
10
def read_jsonl(path: str, key: str=None):
    """Load a JSON-lines file.

    Args:
        path: File to read; a leading ``~`` is expanded.
        key: Optional record field. When given, the records are sorted by
            that field and returned as a dict keyed by it.

    Returns:
        A list of records in file order, or - when ``key`` is given - a dict
        mapping each record's ``key`` value to the record.
    """
    data = []
    with open(os.path.expanduser(path)) as f:
        for line in f:
            # Lines keep their trailing newline, so test the stripped text;
            # otherwise blank lines reach json.loads and raise.
            if not line.strip():
                continue
            data.append(json.loads(line))
    if key is not None:
        data.sort(key=lambda x: x[key])
        data = {item[key]: item for item in data}
    return data
21
+
22
+
23
def trim_hanging_lines(s: str, n: int) -> str:
    """Drop the first ``n`` lines of ``s``.

    The text is stripped of surrounding whitespace before the first line is
    removed and again after each removal.

    Args:
        s: Text to trim.
        n: Number of leading lines to remove.

    Returns:
        The remaining text, or ``''`` when ``s`` runs out of lines before
        ``n`` have been removed (this used to raise ``IndexError``).
    """
    s = s.strip()
    for _ in range(n):
        # partition yields an empty remainder when no newline is left.
        s = s.partition('\n')[2].strip()
    return s
28
+
29
+
30
if __name__ == '__main__':
    # Script entry point: merges questions, answers and reviews into the
    # record list consumed by the webpage and writes it to webpage/data.json.
    # Only the vicuna baseline is wired in; further baselines (alpaca, bard,
    # gpt35, llama) would be loaded and added to the dicts below the same way.
    questions = read_jsonl('table/question.jsonl', key='question_id')

    vicuna_answers = read_jsonl('table/answer/answer_vicuna-13b.jsonl', key='question_id')
    ours_answers = read_jsonl('table/results/llama-13b-hf-alpaca.jsonl', key='question_id')

    review_vicuna = read_jsonl('table/review/review_vicuna-13b_llama-13b-hf-alpaca.jsonl', key='question_id')

    records = []
    for qid, question in questions.items():
        raw_evals = {
            'vicuna': review_vicuna[qid]['content'],
        }

        # cleanup data
        cleaned_evals = {}
        for model_name, text in raw_evals.items():
            lines = text.strip().split('\n')
            # trim the first line if it's a pair of numbers
            if re.match(r'\d+[, ]+\d+', lines[0]):
                lines = lines[1:]
            body = '\n'.join(lines)
            cleaned_evals[model_name] = body.replace('Assistant 1', "**Assistant 1**").replace('Assistant 2', '**Assistant 2**')

        records.append({
            'id': qid,
            'category': question['category'],
            'question': question['text'],
            'answers': {
                'vicuna': vicuna_answers[qid]['text'],
                'ours': ours_answers[qid]['text'],
            },
            'evaluations': cleaned_evals,
            'scores': {
                'vicuna': review_vicuna[qid]['tuple'],
            },
        })

    # Reorder the records, this is optional
    # Pass 1: ids up to 20 move up by 60, all others move down by 20.
    for r in records:
        r['id'] += 60 if r['id'] <= 20 else -20
    # Pass 2: ids up to 50 move up by 10, ids 51..60 move down by 50.
    for r in records:
        if r['id'] <= 50:
            r['id'] += 10
        elif 50 < r['id'] <= 60:
            r['id'] -= 50
    # Pass 3: id 7 becomes 1 and ids below 7 shift up by one.
    for r in records:
        if r['id'] == 7:
            r['id'] = 1
        elif r['id'] < 7:
            r['id'] += 1

    records.sort(key=lambda x: x['id'])

    # Write to file
    with open('webpage/data.json', 'w') as f:
        json.dump({'questions': records, 'models': models}, f, indent=2)
llava/eval/m4c_evaluator.py ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Facebook, Inc. and its affiliates.
2
+ import re
3
+
4
+ from tqdm import tqdm
5
+
6
+
7
+ class EvalAIAnswerProcessor:
8
+ """
9
+ Processes an answer similar to Eval AI
10
+ copied from
11
+ https://github.com/facebookresearch/mmf/blob/c46b3b3391275b4181567db80943473a89ab98ab/pythia/tasks/processors.py#L897
12
+ """
13
+
14
+ CONTRACTIONS = {
15
+ "aint": "ain't",
16
+ "arent": "aren't",
17
+ "cant": "can't",
18
+ "couldve": "could've",
19
+ "couldnt": "couldn't",
20
+ "couldn'tve": "couldn't've",
21
+ "couldnt've": "couldn't've",
22
+ "didnt": "didn't",
23
+ "doesnt": "doesn't",
24
+ "dont": "don't",
25
+ "hadnt": "hadn't",
26
+ "hadnt've": "hadn't've",
27
+ "hadn'tve": "hadn't've",
28
+ "hasnt": "hasn't",
29
+ "havent": "haven't",
30
+ "hed": "he'd",
31
+ "hed've": "he'd've",
32
+ "he'dve": "he'd've",
33
+ "hes": "he's",
34
+ "howd": "how'd",
35
+ "howll": "how'll",
36
+ "hows": "how's",
37
+ "Id've": "I'd've",
38
+ "I'dve": "I'd've",
39
+ "Im": "I'm",
40
+ "Ive": "I've",
41
+ "isnt": "isn't",
42
+ "itd": "it'd",
43
+ "itd've": "it'd've",
44
+ "it'dve": "it'd've",
45
+ "itll": "it'll",
46
+ "let's": "let's",
47
+ "maam": "ma'am",
48
+ "mightnt": "mightn't",
49
+ "mightnt've": "mightn't've",
50
+ "mightn'tve": "mightn't've",
51
+ "mightve": "might've",
52
+ "mustnt": "mustn't",
53
+ "mustve": "must've",
54
+ "neednt": "needn't",
55
+ "notve": "not've",
56
+ "oclock": "o'clock",
57
+ "oughtnt": "oughtn't",
58
+ "ow's'at": "'ow's'at",
59
+ "'ows'at": "'ow's'at",
60
+ "'ow'sat": "'ow's'at",
61
+ "shant": "shan't",
62
+ "shed've": "she'd've",
63
+ "she'dve": "she'd've",
64
+ "she's": "she's",
65
+ "shouldve": "should've",
66
+ "shouldnt": "shouldn't",
67
+ "shouldnt've": "shouldn't've",
68
+ "shouldn'tve": "shouldn't've",
69
+ "somebody'd": "somebodyd",
70
+ "somebodyd've": "somebody'd've",
71
+ "somebody'dve": "somebody'd've",
72
+ "somebodyll": "somebody'll",
73
+ "somebodys": "somebody's",
74
+ "someoned": "someone'd",
75
+ "someoned've": "someone'd've",
76
+ "someone'dve": "someone'd've",
77
+ "someonell": "someone'll",
78
+ "someones": "someone's",
79
+ "somethingd": "something'd",
80
+ "somethingd've": "something'd've",
81
+ "something'dve": "something'd've",
82
+ "somethingll": "something'll",
83
+ "thats": "that's",
84
+ "thered": "there'd",
85
+ "thered've": "there'd've",
86
+ "there'dve": "there'd've",
87
+ "therere": "there're",
88
+ "theres": "there's",
89
+ "theyd": "they'd",
90
+ "theyd've": "they'd've",
91
+ "they'dve": "they'd've",
92
+ "theyll": "they'll",
93
+ "theyre": "they're",
94
+ "theyve": "they've",
95
+ "twas": "'twas",
96
+ "wasnt": "wasn't",
97
+ "wed've": "we'd've",
98
+ "we'dve": "we'd've",
99
+ "weve": "we've",
100
+ "werent": "weren't",
101
+ "whatll": "what'll",
102
+ "whatre": "what're",
103
+ "whats": "what's",
104
+ "whatve": "what've",
105
+ "whens": "when's",
106
+ "whered": "where'd",
107
+ "wheres": "where's",
108
+ "whereve": "where've",
109
+ "whod": "who'd",
110
+ "whod've": "who'd've",
111
+ "who'dve": "who'd've",
112
+ "wholl": "who'll",
113
+ "whos": "who's",
114
+ "whove": "who've",
115
+ "whyll": "why'll",
116
+ "whyre": "why're",
117
+ "whys": "why's",
118
+ "wont": "won't",
119
+ "wouldve": "would've",
120
+ "wouldnt": "wouldn't",
121
+ "wouldnt've": "wouldn't've",
122
+ "wouldn'tve": "wouldn't've",
123
+ "yall": "y'all",
124
+ "yall'll": "y'all'll",
125
+ "y'allll": "y'all'll",
126
+ "yall'd've": "y'all'd've",
127
+ "y'alld've": "y'all'd've",
128
+ "y'all'dve": "y'all'd've",
129
+ "youd": "you'd",
130
+ "youd've": "you'd've",
131
+ "you'dve": "you'd've",
132
+ "youll": "you'll",
133
+ "youre": "you're",
134
+ "youve": "you've",
135
+ }
136
+
137
# Spelled-out quantities are rewritten as digit strings; "none" is treated as zero.
NUMBER_MAP = {
    "none": "0",
    **{
        word: str(value)
        for value, word in enumerate(
            [
                "zero", "one", "two", "three", "four", "five",
                "six", "seven", "eight", "nine", "ten",
            ]
        )
    },
}
# Words dropped entirely from a normalized answer.
ARTICLES = "a an the".split()
# NOTE(review): "(?!<=\d)" is a negative lookahead for the literal text "<="
# followed by a digit, not a lookbehind ("(?<!\d)"). It is reproduced as-is;
# presumably this matches the reference VQA evaluation - confirm before changing.
PERIOD_STRIP = re.compile(r"(?!<=\d)(\.)(?!\d)")
# One or more commas sitting between two digits (e.g. "1,000").
COMMA_STRIP = re.compile(r"(?<=\d)(\,)+(?=\d)")
# Single-character punctuation marks handled by process_punctuation, in order.
PUNCTUATIONS = list(';/[]"{}()=+\\_-><@`,?!')
177
+
178
+ def __init__(self, *args, **kwargs):
179
+ pass
180
+
181
def word_tokenize(self, word):
    """Lower-case *word*, drop commas and question marks, and detach "'s".

    Returns the cleaned text with surrounding whitespace stripped.
    """
    lowered = word.lower()
    for unwanted in (",", "?"):
        lowered = lowered.replace(unwanted, "")
    # Put a space before the possessive/contracted "'s" so it becomes its own token.
    return lowered.replace("'s", " 's").strip()
185
+
186
def process_punctuation(self, in_text):
    """Remove or blank out punctuation in *in_text* and strip stray periods.

    Each mark in ``PUNCTUATIONS`` is deleted when it appears next to a space
    in the input, or when the input contains a comma between two digits.
    Otherwise it is replaced by a space. Afterwards every period matched by
    ``PERIOD_STRIP`` is removed.

    Args:
        in_text: The text to clean.

    Returns:
        The cleaned text.
    """
    # Loop-invariant: depends only on the input text, not on the current mark.
    has_digit_comma = self.COMMA_STRIP.search(in_text) is not None
    out_text = in_text
    for p in self.PUNCTUATIONS:
        if has_digit_comma or p + " " in in_text or " " + p in in_text:
            out_text = out_text.replace(p, "")
        else:
            out_text = out_text.replace(p, " ")
    # The third positional argument of Pattern.sub is ``count``, not ``flags``:
    # passing re.UNICODE (== 32) there capped the substitution at 32 periods.
    # Flags belong to re.compile, and UNICODE is already the default for str.
    return self.PERIOD_STRIP.sub("", out_text)
197
+
198
def process_digit_article(self, in_text):
    """Normalize number words, drop articles and restore contractions.

    The text is lower-cased and split on whitespace. Each word is mapped
    through ``NUMBER_MAP``, discarded if it is in ``ARTICLES``, and finally
    mapped through ``CONTRACTIONS``.

    Args:
        in_text: The text to normalize.

    Returns:
        The surviving words joined by single spaces.
    """
    words = []
    for word in in_text.lower().split():
        # .get() instead of .setdefault(): the lookup must not insert every
        # word it sees into the shared class-level NUMBER_MAP.
        word = self.NUMBER_MAP.get(word, word)
        if word not in self.ARTICLES:
            words.append(self.CONTRACTIONS.get(word, word))
    return " ".join(words)
212
+
213
+ def __call__(self, item):
214
+ item = self.word_tokenize(item)
215
+ item = item.replace("\n", " ").replace("\t", " ").strip()
216
+ item = self.process_punctuation(item)
217
+ item = self.process_digit_article(item)
218
+ return item
219
+
220
+
221
class TextVQAAccuracyEvaluator:
    """Soft-accuracy scorer that compares a prediction with ten human answers."""

    def __init__(self):
        self.answer_processor = EvalAIAnswerProcessor()

    def _compute_answer_scores(self, raw_answers):
        """Map every distinct normalized human answer to its soft score.

        For each answer, every annotator is held out in turn; the answer earns
        ``min(1, matches / 3)`` against the remaining annotators, and the
        results are averaged. Exactly ten answers are required (asserted).
        """
        normalized = [self.answer_processor(raw) for raw in raw_answers]
        assert len(normalized) == 10

        scores = {}
        for candidate in set(normalized):
            per_holdout = []
            for held_out in range(len(normalized)):
                # Count agreeing annotators, excluding the held-out position.
                agreeing = sum(
                    1
                    for position, answer in enumerate(normalized)
                    if position != held_out and answer == candidate
                )
                per_holdout.append(min(1, float(agreeing) / 3))
            scores[candidate] = sum(per_holdout) / len(per_holdout)
        return scores

    def eval_pred_list(self, pred_list):
        """Return the mean soft accuracy over *pred_list*.

        Each entry must provide ``pred_answer`` and ``gt_answers``; predictions
        that match no human answer score 0.0.
        """
        per_entry = []
        for entry in tqdm(pred_list):
            prediction = self.answer_processor(entry["pred_answer"])
            answer_scores = self._compute_answer_scores(entry["gt_answers"])
            per_entry.append(answer_scores.get(prediction, 0.0))

        return sum(per_entry) / len(per_entry)
258
+
259
+
260
class STVQAAccuracyEvaluator:
    """Exact-match accuracy after normalizing prediction and references."""

    def __init__(self):
        self.answer_processor = EvalAIAnswerProcessor()

    def eval_pred_list(self, pred_list):
        """Return the fraction of entries whose prediction equals any reference.

        Each entry must provide ``pred_answer`` and ``gt_answers``; both sides
        are passed through the answer processor before comparison.
        """
        normalize = self.answer_processor
        hits = [
            1.0
            if normalize(entry["pred_answer"])
            in [normalize(reference) for reference in entry["gt_answers"]]
            else 0.0
            for entry in pred_list
        ]
        return sum(hits) / len(hits)
274
+
275
+
276
class STVQAANLSEvaluator:
    """Scores predictions by thresholded normalized edit-distance similarity."""

    def __init__(self):
        import editdistance  # install with `pip install editdistance`

        self.get_edit_distance = editdistance.eval

    def get_anls(self, s1, s2):
        """Return the similarity of two strings, or 0.0 below the 0.5 threshold.

        Both strings are lower-cased and stripped. The similarity is
        ``1 - edit_distance / max(len(s1), len(s2))``.

        Args:
            s1: First string (e.g. the prediction).
            s2: Second string (e.g. a reference answer).

        Returns:
            The similarity if it is at least 0.5, otherwise 0.0.
        """
        s1 = s1.lower().strip()
        s2 = s2.lower().strip()
        longest = max(len(s1), len(s2))
        if longest == 0:
            # Both strings are empty, hence identical; the general formula
            # would divide by zero here.
            return 1.0
        iou = 1 - self.get_edit_distance(s1, s2) / longest
        anls = iou if iou >= 0.5 else 0.0
        return anls

    def eval_pred_list(self, pred_list):
        """Return the mean, over entries, of the best score against any reference.

        Each entry must provide ``pred_answer`` and ``gt_answers``.
        """
        pred_scores = []
        for entry in pred_list:
            anls = max(
                self.get_anls(entry["pred_answer"], gt) for gt in entry["gt_answers"]
            )
            pred_scores.append(anls)

        accuracy = sum(pred_scores) / len(pred_scores)
        return accuracy
299
+
300
+
301
class TextCapsBleu4Evaluator:
    """BLEU-4 caption scorer backed by the pycocoevalcap package."""

    def __init__(self):
        # pycocoevalcap needs Java 1.8.0 and pycocotools. A Python 3 capable
        # fork can be installed with
        #   pip install git+https://github.com/ronghanghu/coco-caption.git@python23
        # (the original code lives at https://github.com/tylin/coco-caption).
        try:
            from pycocoevalcap.bleu.bleu import Bleu
            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        except ModuleNotFoundError:
            install_hint = "Please install pycocoevalcap module using pip install git+https://github.com/ronghanghu/coco-caption.git@python23"  # noqa
            print(install_hint)
            raise

        self.tokenizer = PTBTokenizer()
        self.scorer = Bleu(4)

    def eval_pred_list(self, pred_list):
        """Return the BLEU-4 score of the predictions in *pred_list*.

        Each entry must provide ``pred_answer`` (the hypothesis caption) and
        ``gt_answers`` (the reference captions).
        """
        references, hypotheses = {}, {}
        for position, entry in enumerate(pred_list):
            references[position] = [
                {"caption": reference} for reference in entry["gt_answers"]
            ]
            hypotheses[position] = [{"caption": entry["pred_answer"]}]

        references = self.tokenizer.tokenize(references)
        hypotheses = self.tokenizer.tokenize(hypotheses)
        bleu_scores, _ = self.scorer.compute_score(references, hypotheses)

        # bleu_scores is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
        return bleu_scores[3]
llava/eval/model_qa.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ from transformers import AutoTokenizer, AutoModelForCausalLM, StoppingCriteria
3
+ import torch
4
+ import os
5
+ import json
6
+ from tqdm import tqdm
7
+ import shortuuid
8
+
9
+ from llava.conversation import default_conversation
10
+ from llava.utils import disable_torch_init
11
+
12
+
13
@torch.inference_mode()
def eval_model(model_name, questions_file, answers_file):
    """Answer every question in a JSONL file with a causal LM; write JSONL answers.

    Args:
        model_name: Model identifier or path given to ``from_pretrained``
            (``~`` is expanded).
        questions_file: JSONL input; each line must provide ``question_id``
            and ``text``.
        answers_file: JSONL output; one record per question with the keys
            ``question_id``, ``text``, ``answer_id``, ``model_id``, ``metadata``.
    """
    # Model
    disable_torch_init()
    model_name = os.path.expanduser(model_name)
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
    model = AutoModelForCausalLM.from_pretrained(
        model_name, torch_dtype=torch.float16
    ).cuda()

    # Context managers close both files even if generation raises midway.
    with open(os.path.expanduser(questions_file), "r") as ques_file, open(
        os.path.expanduser(answers_file), "w"
    ) as ans_file:
        for line in tqdm(ques_file):
            record = json.loads(line)  # parse each line once
            idx = record["question_id"]
            qs = record["text"]
            conv = default_conversation.copy()
            conv.append_message(conv.roles[0], qs)
            prompt = conv.get_prompt()
            inputs = tokenizer([prompt])
            input_ids = torch.as_tensor(inputs.input_ids).cuda()
            output_ids = model.generate(
                input_ids,
                do_sample=True,
                use_cache=True,
                temperature=0.7,
                max_new_tokens=1024,
            )
            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
            # The reply ends at the first separator after the prompt; when the
            # model emitted none, append one so the reply runs to the end.
            try:
                index = outputs.index(conv.sep, len(prompt))
            except ValueError:
                outputs += conv.sep
                index = outputs.index(conv.sep, len(prompt))

            # Skip the echoed prompt plus the assistant role tag (and 2 more chars).
            outputs = outputs[len(prompt) + len(conv.roles[1]) + 2:index].strip()
            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "text": outputs,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
            ans_file.flush()
56
+
57
if __name__ == "__main__":
    # Command-line entry point: parse the three paths and run the evaluation.
    cli = argparse.ArgumentParser()
    for flag, default in (
        ("--model-name", "facebook/opt-350m"),
        ("--question-file", "tables/question.jsonl"),
        ("--answers-file", "answer.jsonl"),
    ):
        cli.add_argument(flag, type=str, default=default)
    args = cli.parse_args()

    eval_model(args.model_name, args.question_file, args.answers_file)
llava/eval/model_vqa.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from llava.conversation import conv_templates, SeparatorStyle
10
+ from llava.model.builder import load_pretrained_model
11
+ from llava.utils import disable_torch_init
12
+ from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
13
+
14
+ from PIL import Image
15
+ import math
16
+
17
+
18
def split_list(lst, n):
    """Split a sequence into at most n contiguous chunks of roughly equal size.

    Args:
        lst: Any sized, sliceable sequence.
        n: Desired number of chunks (must be non-zero).

    Returns:
        A list of slices of ``lst``. Fewer than ``n`` chunks are returned when
        ``len(lst)`` is too small to fill them; an empty input yields ``[]``.
    """
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: last chunk may be shorter
    if chunk_size == 0:
        # Empty input: range() rejects a zero step, so there is nothing to slice.
        return []
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
22
+
23
+
24
def get_chunk(lst, n, k):
    """Return chunk number ``k`` of ``lst`` after splitting it with ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
27
+
28
+
29
def eval_model(args):
    """Generate an answer for every image question in this worker's chunk.

    Reads ``args.question_file`` (JSONL with ``question_id``, ``image`` and
    ``text``), keeps chunk ``args.chunk_idx`` of ``args.num_chunks``, and
    writes one JSON record per question to ``args.answers_file``.

    Args:
        args: Namespace providing model_path, model_base, question_file,
            answers_file, image_folder, conv_mode, num_chunks, chunk_idx,
            temperature, top_p and num_beams.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    with open(os.path.expanduser(args.question_file), "r") as question_file:
        questions = [json.loads(q) for q in question_file]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        os.makedirs(answers_dir, exist_ok=True)

    with open(answers_file, "w") as ans_file:
        for line in tqdm(questions):
            idx = line["question_id"]
            image_file = line["image"]
            qs = line["text"]
            cur_prompt = qs  # the question as recorded in the output, without image tokens
            if model.config.mm_use_im_start_end:
                qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
            else:
                qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            image = Image.open(os.path.join(args.image_folder, image_file)).convert('RGB')
            image_tensor = process_images([image], image_processor, model.config)[0]

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.unsqueeze(0).half().cuda(),
                    image_sizes=[image.size],
                    # Sample only for a positive temperature; otherwise decode greedily.
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    # no_repeat_ngram_size=3,
                    max_new_tokens=1024,
                    use_cache=True)

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "prompt": cur_prompt,
                                       "text": outputs,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
            ans_file.flush()
85
+
86
if __name__ == "__main__":
    # Command-line entry point: collect the run configuration and evaluate.
    cli = argparse.ArgumentParser()
    # Model and data locations.
    cli.add_argument("--model-path", type=str, default="facebook/opt-350m")
    cli.add_argument("--model-base", type=str, default=None)
    cli.add_argument("--image-folder", type=str, default="")
    cli.add_argument("--question-file", type=str, default="tables/question.jsonl")
    cli.add_argument("--answers-file", type=str, default="answer.jsonl")
    cli.add_argument("--conv-mode", type=str, default="llava_v1")
    # Which chunk of the question list this process handles.
    cli.add_argument("--num-chunks", type=int, default=1)
    cli.add_argument("--chunk-idx", type=int, default=0)
    # Generation settings.
    cli.add_argument("--temperature", type=float, default=0.2)
    cli.add_argument("--top_p", type=float, default=None)
    cli.add_argument("--num_beams", type=int, default=1)
    args = cli.parse_args()

    eval_model(args)
llava/eval/model_vqa_loader.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from llava.conversation import conv_templates, SeparatorStyle
10
+ from llava.model.builder import load_pretrained_model
11
+ from llava.utils import disable_torch_init
12
+ from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
13
+ from torch.utils.data import Dataset, DataLoader
14
+
15
+ from PIL import Image
16
+ import math
17
+
18
+
19
def split_list(lst, n):
    """Split a sequence into at most n contiguous chunks of roughly equal size.

    Args:
        lst: Any sized, sliceable sequence.
        n: Desired number of chunks (must be non-zero).

    Returns:
        A list of slices of ``lst``. Fewer than ``n`` chunks are returned when
        ``len(lst)`` is too small to fill them; an empty input yields ``[]``.
    """
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: last chunk may be shorter
    if chunk_size == 0:
        # Empty input: range() rejects a zero step, so there is nothing to slice.
        return []
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
23
+
24
+
25
def get_chunk(lst, n, k):
    """Return chunk number ``k`` of ``lst`` after splitting it with ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
28
+
29
+
30
+ # Custom dataset class
31
class CustomDataset(Dataset):
    """Dataset that turns question records into tokenized prompts and image tensors."""

    def __init__(self, questions, image_folder, tokenizer, image_processor, model_config, conv_mode=None):
        """Store the inputs used to build each sample.

        Args:
            questions: Sequence of records with ``image`` and ``text`` keys.
            image_folder: Directory that the ``image`` paths are relative to.
            tokenizer: Tokenizer passed to ``tokenizer_image_token``.
            image_processor: Processor passed to ``process_images``.
            model_config: Model config; ``mm_use_im_start_end`` is read from it.
            conv_mode: Key into ``conv_templates``. When ``None`` (the default)
                the module-level ``args.conv_mode`` set by the script entry
                point is used, as before.
        """
        self.questions = questions
        self.image_folder = image_folder
        self.tokenizer = tokenizer
        self.image_processor = image_processor
        self.model_config = model_config
        self.conv_mode = conv_mode

    def __getitem__(self, index):
        """Return ``(input_ids, image_tensor, image_size)`` for question ``index``."""
        line = self.questions[index]
        image_file = line["image"]
        qs = line["text"]
        if self.model_config.mm_use_im_start_end:
            qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
        else:
            qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

        # Prefer the explicitly supplied template name; the global ``args``
        # only exists when this file runs as a script.
        conv_mode = self.conv_mode if self.conv_mode is not None else args.conv_mode
        conv = conv_templates[conv_mode].copy()
        conv.append_message(conv.roles[0], qs)
        conv.append_message(conv.roles[1], None)
        prompt = conv.get_prompt()

        image = Image.open(os.path.join(self.image_folder, image_file)).convert('RGB')
        image_tensor = process_images([image], self.image_processor, self.model_config)[0]

        input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')

        return input_ids, image_tensor, image.size

    def __len__(self):
        """Return the number of questions."""
        return len(self.questions)
62
+
63
+
64
def collate_fn(batch):
    """Stack per-sample ids and image tensors along a new leading dimension.

    Returns ``(input_ids, image_tensors, image_sizes)``; the sizes stay a tuple.
    """
    ids, images, sizes = zip(*batch)
    return torch.stack(ids, dim=0), torch.stack(images, dim=0), sizes
69
+
70
+
71
+ # DataLoader
72
def create_data_loader(questions, image_folder, tokenizer, image_processor, model_config, batch_size=1, num_workers=4):
    """Build an unshuffled DataLoader over a CustomDataset; batch_size must be 1."""
    assert batch_size == 1, "batch_size must be 1"
    return DataLoader(
        CustomDataset(questions, image_folder, tokenizer, image_processor, model_config),
        batch_size=batch_size,
        num_workers=num_workers,
        shuffle=False,
        collate_fn=collate_fn,
    )
77
+
78
+
79
def eval_model(args):
    """Generate an answer for every image question in this worker's chunk.

    Reads ``args.question_file`` (JSONL with ``question_id``, ``image`` and
    ``text``), keeps chunk ``args.chunk_idx`` of ``args.num_chunks``, feeds the
    questions through a data loader and writes one JSON record per question
    to ``args.answers_file``.

    Args:
        args: Namespace providing model_path, model_base, question_file,
            answers_file, image_folder, conv_mode, num_chunks, chunk_idx,
            temperature, top_p, num_beams and max_new_tokens. ``conv_mode``
            may be rewritten in place (see below).
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    with open(os.path.expanduser(args.question_file), "r") as question_file:
        questions = [json.loads(q) for q in question_file]
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        os.makedirs(answers_dir, exist_ok=True)

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    data_loader = create_data_loader(questions, args.image_folder, tokenizer, image_processor, model.config)

    with open(answers_file, "w") as ans_file:
        # The loader is unshuffled with batch size 1, so batches line up with questions.
        for (input_ids, image_tensor, image_sizes), line in tqdm(zip(data_loader, questions), total=len(questions)):
            idx = line["question_id"]
            cur_prompt = line["text"]

            input_ids = input_ids.to(device='cuda', non_blocking=True)

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True),
                    image_sizes=image_sizes,
                    # Sample only for a positive temperature; otherwise decode greedily.
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    top_p=args.top_p,
                    num_beams=args.num_beams,
                    max_new_tokens=args.max_new_tokens,
                    use_cache=True)

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "prompt": cur_prompt,
                                       "text": outputs,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
127
+
128
if __name__ == "__main__":
    # Command-line entry point. The parsed namespace must stay in the global
    # name ``args``: CustomDataset.__getitem__ reads args.conv_mode from it.
    cli = argparse.ArgumentParser()
    # Model and data locations.
    cli.add_argument("--model-path", type=str, default="facebook/opt-350m")
    cli.add_argument("--model-base", type=str, default=None)
    cli.add_argument("--image-folder", type=str, default="")
    cli.add_argument("--question-file", type=str, default="tables/question.jsonl")
    cli.add_argument("--answers-file", type=str, default="answer.jsonl")
    cli.add_argument("--conv-mode", type=str, default="llava_v1")
    # Which chunk of the question list this process handles.
    cli.add_argument("--num-chunks", type=int, default=1)
    cli.add_argument("--chunk-idx", type=int, default=0)
    # Generation settings.
    cli.add_argument("--temperature", type=float, default=0.2)
    cli.add_argument("--top_p", type=float, default=None)
    cli.add_argument("--num_beams", type=int, default=1)
    cli.add_argument("--max_new_tokens", type=int, default=128)
    args = cli.parse_args()

    eval_model(args)
llava/eval/model_vqa_mmbench.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ import pandas as pd
6
+ from tqdm import tqdm
7
+ import shortuuid
8
+
9
+ from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
10
+ from llava.conversation import conv_templates, SeparatorStyle
11
+ from llava.model.builder import load_pretrained_model
12
+ from llava.utils import disable_torch_init
13
+ from llava.mm_utils import tokenizer_image_token, process_images, load_image_from_base64, get_model_name_from_path
14
+
15
+ from PIL import Image
16
+ import math
17
+
18
+
19
+ all_options = ['A', 'B', 'C', 'D']
20
+
21
+
22
def split_list(lst, n):
    """Split a sequence into at most n contiguous chunks of roughly equal size.

    Args:
        lst: Any sized, sliceable object (only ``len`` and slicing are used).
        n: Desired number of chunks (must be non-zero).

    Returns:
        A list of slices of ``lst``. Fewer than ``n`` chunks are returned when
        ``len(lst)`` is too small to fill them; an empty input yields ``[]``.
    """
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: last chunk may be shorter
    if chunk_size == 0:
        # Empty input: range() rejects a zero step, so there is nothing to slice.
        return []
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
26
+
27
+
28
def get_chunk(lst, n, k):
    """Return chunk number ``k`` of ``lst`` after splitting it with ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
31
+
32
+
33
def is_none(value):
    """Return True when *value* stands for a missing entry.

    Missing means: ``None``, a float NaN, or the strings "nan" / "none" in any
    letter case. ``isinstance`` is used so float and str subclasses (for
    example ``numpy.float64``) are recognized as well.
    """
    if value is None:
        return True
    if isinstance(value, float):
        return math.isnan(value)
    if isinstance(value, str):
        return value.lower() in ('nan', 'none')
    return False
43
+
44
def get_options(row, options):
    """Collect ``row[key]`` for each key in *options*, stopping at the first missing value."""
    collected = []
    for key in options:
        candidate = row[key]
        if is_none(candidate):
            # Options are expected to be contiguous; the first gap ends the list.
            return collected
        collected.append(candidate)
    return collected
52
+
53
+
54
def eval_model(args):
    """Answer multiple-choice image questions from a table and write JSONL results.

    Reads ``args.question_file`` with ``pandas.read_table``, keeps chunk
    ``args.chunk_idx`` of ``args.num_chunks`` and, per row, asks the model once
    (or once per option rotation when ``args.all_rounds`` is set).

    Args:
        args: Namespace providing model_path, model_base, question_file,
            answers_file, conv_mode, num_chunks, chunk_idx, temperature,
            top_p, num_beams, all_rounds, single_pred_prompt and lang.
            ``conv_mode`` may be rewritten in place (see below).
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    questions = pd.read_table(os.path.expanduser(args.question_file))
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        os.makedirs(answers_dir, exist_ok=True)

    if 'plain' in model_name and 'finetune' not in model_name.lower() and 'mmtag' not in args.conv_mode:
        args.conv_mode = args.conv_mode + '_mmtag'
        print(f'It seems that this is a plain model, but it is not using a mmtag prompt, auto switching to {args.conv_mode}.')

    with open(answers_file, "w") as ans_file:
        for index, row in tqdm(questions.iterrows(), total=len(questions)):
            options = get_options(row, all_options)
            cur_option_char = all_options[:len(options)]

            if args.all_rounds:
                num_rounds = len(options)
            else:
                num_rounds = 1

            for round_idx in range(num_rounds):
                idx = row['index']
                question = row['question']
                hint = row['hint']
                image = load_image_from_base64(row['image'])
                if not is_none(hint):
                    question = hint + '\n' + question
                # Letters stay in A, B, ... order; only the option texts rotate per round.
                for option_char, option in zip(all_options[:len(options)], options):
                    question = question + '\n' + option_char + '. ' + option
                qs = cur_prompt = question
                if model.config.mm_use_im_start_end:
                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
                else:
                    qs = DEFAULT_IMAGE_TOKEN + '\n' + qs

                if args.single_pred_prompt:
                    if args.lang == 'cn':
                        qs = qs + '\n' + "请直接回答选项字母。"
                    else:
                        qs = qs + '\n' + "Answer with the option's letter from the given choices directly."

                conv = conv_templates[args.conv_mode].copy()
                conv.append_message(conv.roles[0], qs)
                conv.append_message(conv.roles[1], None)
                prompt = conv.get_prompt()

                input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

                image_tensor = process_images([image], image_processor, model.config)[0]

                with torch.inference_mode():
                    output_ids = model.generate(
                        input_ids,
                        images=image_tensor.unsqueeze(0).half().cuda(),
                        image_sizes=[image.size],
                        # Sample only for a positive temperature; otherwise decode greedily.
                        do_sample=args.temperature > 0,
                        temperature=args.temperature,
                        top_p=args.top_p,
                        num_beams=args.num_beams,
                        # no_repeat_ngram_size=3,
                        max_new_tokens=1024,
                        use_cache=True)

                outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

                ans_id = shortuuid.uuid()
                ans_file.write(json.dumps({"question_id": idx,
                                           "round_id": round_idx,
                                           "prompt": cur_prompt,
                                           "text": outputs,
                                           "options": options,
                                           "option_char": cur_option_char,
                                           "answer_id": ans_id,
                                           "model_id": model_name,
                                           "metadata": {}}) + "\n")
                ans_file.flush()

                # rotate options
                options = options[1:] + options[:1]
                cur_option_char = cur_option_char[1:] + cur_option_char[:1]
141
+
142
if __name__ == "__main__":
    # Command-line entry point: collect the run configuration and evaluate.
    cli = argparse.ArgumentParser()
    # Model and data locations.
    cli.add_argument("--model-path", type=str, default="facebook/opt-350m")
    cli.add_argument("--model-base", type=str, default=None)
    cli.add_argument("--image-folder", type=str, default="")
    cli.add_argument("--question-file", type=str, default="tables/question.jsonl")
    cli.add_argument("--answers-file", type=str, default="answer.jsonl")
    cli.add_argument("--conv-mode", type=str, default="llava_v1")
    # Which chunk of the question table this process handles.
    cli.add_argument("--num-chunks", type=int, default=1)
    cli.add_argument("--chunk-idx", type=int, default=0)
    # Generation settings.
    cli.add_argument("--temperature", type=float, default=0.2)
    cli.add_argument("--top_p", type=float, default=None)
    cli.add_argument("--num_beams", type=int, default=1)
    # Prompting behaviour.
    cli.add_argument("--all-rounds", action="store_true")
    cli.add_argument("--single-pred-prompt", action="store_true")
    cli.add_argument("--lang", type=str, default="en")
    args = cli.parse_args()

    eval_model(args)
llava/eval/model_vqa_science.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+ import os
4
+ import json
5
+ from tqdm import tqdm
6
+ import shortuuid
7
+
8
+ from llava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
9
+ from llava.conversation import conv_templates, SeparatorStyle
10
+ from llava.model.builder import load_pretrained_model
11
+ from llava.utils import disable_torch_init
12
+ from llava.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path
13
+
14
+ from PIL import Image
15
+ import math
16
+
17
+
18
def split_list(lst, n):
    """Split a sequence into at most n contiguous chunks of roughly equal size.

    Args:
        lst: Any sized, sliceable sequence.
        n: Desired number of chunks (must be non-zero).

    Returns:
        A list of slices of ``lst``. Fewer than ``n`` chunks are returned when
        ``len(lst)`` is too small to fill them; an empty input yields ``[]``.
    """
    chunk_size = math.ceil(len(lst) / n)  # ceiling division: last chunk may be shorter
    if chunk_size == 0:
        # Empty input: range() rejects a zero step, so there is nothing to slice.
        return []
    return [lst[i:i + chunk_size] for i in range(0, len(lst), chunk_size)]
22
+
23
+
24
def get_chunk(lst, n, k):
    """Return chunk number ``k`` of ``lst`` after splitting it with ``split_list(lst, n)``."""
    return split_list(lst, n)[k]
27
+
28
+
29
def eval_model(args):
    """Answer conversation-style questions, with or without an image each.

    Reads ``args.question_file`` (a JSON list whose items provide ``id``,
    ``conversations`` and optionally ``image``), keeps chunk
    ``args.chunk_idx`` of ``args.num_chunks`` and writes one JSON record per
    question to ``args.answers_file``.

    Args:
        args: Namespace providing model_path, model_base, question_file,
            answers_file, image_folder, conv_mode, num_chunks, chunk_idx,
            temperature and single_pred_prompt.
    """
    # Model
    disable_torch_init()
    model_path = os.path.expanduser(args.model_path)
    model_name = get_model_name_from_path(model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, args.model_base, model_name)

    with open(os.path.expanduser(args.question_file), "r") as question_file:
        questions = json.load(question_file)
    questions = get_chunk(questions, args.num_chunks, args.chunk_idx)
    answers_file = os.path.expanduser(args.answers_file)
    # A bare file name has an empty dirname and os.makedirs("") raises
    # FileNotFoundError, so only create the directory when there is one.
    answers_dir = os.path.dirname(answers_file)
    if answers_dir:
        os.makedirs(answers_dir, exist_ok=True)

    with open(answers_file, "w") as ans_file:
        for line in tqdm(questions):
            idx = line["id"]
            question = line['conversations'][0]
            # Strip any image placeholder; the proper image tokens are added below.
            qs = question['value'].replace('<image>', '').strip()
            cur_prompt = qs

            if 'image' in line:
                image_file = line["image"]
                image = Image.open(os.path.join(args.image_folder, image_file))
                image_tensor = process_images([image], image_processor, model.config)[0]
                images = image_tensor.unsqueeze(0).half().cuda()
                image_sizes = [image.size]
                if getattr(model.config, 'mm_use_im_start_end', False):
                    qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + qs
                else:
                    qs = DEFAULT_IMAGE_TOKEN + '\n' + qs
                cur_prompt = '<image>' + '\n' + cur_prompt
            else:
                # Text-only question: generate without image inputs.
                images = None
                image_sizes = None

            if args.single_pred_prompt:
                qs = qs + '\n' + "Answer with the option's letter from the given choices directly."
                cur_prompt = cur_prompt + '\n' + "Answer with the option's letter from the given choices directly."

            conv = conv_templates[args.conv_mode].copy()
            conv.append_message(conv.roles[0], qs)
            conv.append_message(conv.roles[1], None)
            prompt = conv.get_prompt()

            input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()

            with torch.inference_mode():
                output_ids = model.generate(
                    input_ids,
                    images=images,
                    image_sizes=image_sizes,
                    # Sample only for a positive temperature; otherwise decode greedily.
                    do_sample=args.temperature > 0,
                    temperature=args.temperature,
                    max_new_tokens=1024,
                    use_cache=True,
                )

            outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()

            ans_id = shortuuid.uuid()
            ans_file.write(json.dumps({"question_id": idx,
                                       "prompt": cur_prompt,
                                       "text": outputs,
                                       "answer_id": ans_id,
                                       "model_id": model_name,
                                       "metadata": {}}) + "\n")
            ans_file.flush()
95
+
96
if __name__ == "__main__":
    # Command-line entry point: collect the run configuration and evaluate.
    cli = argparse.ArgumentParser()
    # Model and data locations.
    cli.add_argument("--model-path", type=str, default="facebook/opt-350m")
    cli.add_argument("--model-base", type=str, default=None)
    cli.add_argument("--image-folder", type=str, default="")
    cli.add_argument("--question-file", type=str, default="tables/question.json")
    cli.add_argument("--answers-file", type=str, default="answer.jsonl")
    cli.add_argument("--conv-mode", type=str, default="llava_v0")
    # Which chunk of the question list this process handles.
    cli.add_argument("--num-chunks", type=int, default=1)
    cli.add_argument("--chunk-idx", type=int, default=0)
    # Generation and prompting settings.
    cli.add_argument("--temperature", type=float, default=0.2)
    cli.add_argument("--answer-prompter", action="store_true")
    cli.add_argument("--single-pred-prompt", action="store_true")
    args = cli.parse_args()

    eval_model(args)
llava/eval/qa_baseline_gpt35.py ADDED
@@ -0,0 +1,74 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Generate answers with GPT-3.5"""
2
+ # Note: you need to be using OpenAI Python v0.27.0 for the code below to work
3
+ import argparse
4
+ import json
5
+ import os
6
+ import time
7
+ import concurrent.futures
8
+
9
+ import openai
10
+ import tqdm
11
+ import shortuuid
12
+
13
+ MODEL = 'gpt-3.5-turbo'
14
+ MODEL_ID = 'gpt-3.5-turbo:20230327'
15
+
16
def get_answer(question_id: int, question: str, max_tokens: int):
    """Ask the chat model one question, trying up to three times.

    Returns a dict with ``answer_id``, ``question_id``, ``model_id`` and
    ``text``. After three failed attempts ``text`` is ``'#ERROR#'``.
    """
    answer = {
        'answer_id': shortuuid.uuid(),
        'question_id': question_id,
        'model_id': MODEL_ID,
    }
    messages = [
        {'role': 'system', 'content': 'You are a helpful assistant.'},
        {'role': 'user', 'content': question},
    ]
    attempts_left = 3
    while attempts_left > 0:
        attempts_left -= 1
        try:
            response = openai.ChatCompletion.create(
                model=MODEL,
                messages=messages,
                max_tokens=max_tokens,
            )
            answer['text'] = response['choices'][0]['message']['content']
            return answer
        except Exception as e:
            # Best effort: report the failure, mark the answer, pause, retry.
            print('[ERROR]', e)
            answer['text'] = '#ERROR#'
            time.sleep(1)
    return answer
42
+
43
+
44
if __name__ == '__main__':
    # Command-line entry point: read questions, query the model concurrently,
    # and write the answers sorted by question id.
    parser = argparse.ArgumentParser(description='ChatGPT answer generation.')
    parser.add_argument('-q', '--question')
    parser.add_argument('-o', '--output')
    parser.add_argument('--max-tokens', type=int, default=1024, help='maximum number of tokens produced in the output')
    args = parser.parse_args()

    questions_dict = {}
    with open(os.path.expanduser(args.question)) as f:
        for line in f:
            # Lines keep their trailing newline, so a blank line is "\n" and
            # truthy; strip before testing or json.loads fails on it.
            if not line.strip():
                continue
            q = json.loads(line)
            questions_dict[q['question_id']] = q['text']

    answers = []

    with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
        futures = []
        for qid, question in questions_dict.items():
            future = executor.submit(get_answer, qid, question, args.max_tokens)
            futures.append(future)

        for future in tqdm.tqdm(concurrent.futures.as_completed(futures), total=len(futures)):
            answers.append(future.result())

    # Futures complete in arbitrary order; restore a stable order for the output.
    answers.sort(key=lambda x: x['question_id'])

    with open(os.path.expanduser(args.output), 'w') as f:
        table = [json.dumps(ans) for ans in answers]
        f.write('\n'.join(table))
llava/eval/run_llava.py ADDED
@@ -0,0 +1,145 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import torch
3
+
4
+ from llava.constants import (
5
+ IMAGE_TOKEN_INDEX,
6
+ DEFAULT_IMAGE_TOKEN,
7
+ DEFAULT_IM_START_TOKEN,
8
+ DEFAULT_IM_END_TOKEN,
9
+ IMAGE_PLACEHOLDER,
10
+ )
11
+ from llava.conversation import conv_templates, SeparatorStyle
12
+ from llava.model.builder import load_pretrained_model
13
+ from llava.utils import disable_torch_init
14
+ from llava.mm_utils import (
15
+ process_images,
16
+ tokenizer_image_token,
17
+ get_model_name_from_path,
18
+ )
19
+
20
+ from PIL import Image
21
+
22
+ import requests
23
+ from PIL import Image
24
+ from io import BytesIO
25
+ import re
26
+
27
+
28
def image_parser(args):
    """Return the entries of ``args.image_file`` split on ``args.sep``."""
    return args.image_file.split(args.sep)
31
+
32
+
33
def load_image(image_file):
    """Load an image from a URL or a local path and convert it to RGB.

    Args:
        image_file: An ``http://`` / ``https://`` URL, or a filesystem path.

    Returns:
        The image opened with PIL and converted to ``"RGB"``.

    Raises:
        requests.HTTPError: If the URL responds with an error status.
    """
    # Match full URL schemes so a local path that merely begins with "http"
    # (e.g. "http_cache/cat.png") is still opened from disk.
    if image_file.startswith(("http://", "https://")):
        response = requests.get(image_file)
        # Fail on 4xx/5xx here instead of handing an error page to PIL.
        response.raise_for_status()
        source = BytesIO(response.content)
    else:
        source = image_file
    return Image.open(source).convert("RGB")
40
+
41
+
42
def load_images(image_files):
    """Load every entry of ``image_files`` with ``load_image``, preserving order."""
    return [load_image(location) for location in image_files]
48
+
49
+
50
def eval_model(args):
    """Run one image-conditioned generation and print the decoded output.

    Loads the model named by ``args.model_path`` / ``args.model_base``, inserts
    the image token(s) into ``args.query``, builds the prompt from a
    conversation template, loads the images listed in ``args.image_file``
    (split on ``args.sep``) and calls ``model.generate``.

    Args:
        args: Namespace providing ``model_path``, ``model_base``, ``query``,
            ``conv_mode``, ``image_file``, ``sep``, ``temperature``, ``top_p``,
            ``num_beams`` and ``max_new_tokens``. ``args.conv_mode`` is set to
            the inferred mode when it is ``None``.
    """
    # Model
    disable_torch_init()

    model_name = get_model_name_from_path(args.model_path)
    tokenizer, model, image_processor, context_len = load_pretrained_model(
        args.model_path, args.model_base, model_name
    )

    # The image marker is either the bare image token or the token wrapped in
    # start/end tokens, depending on the model config.
    if model.config.mm_use_im_start_end:
        image_token = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN
    else:
        image_token = DEFAULT_IMAGE_TOKEN

    qs = args.query
    if IMAGE_PLACEHOLDER in qs:
        # Explicit placeholders in the query decide where the image goes.
        qs = re.sub(IMAGE_PLACEHOLDER, image_token, qs)
    else:
        # Otherwise the image marker is put on its own line before the query.
        qs = image_token + "\n" + qs

    # Infer the conversation template from the model name; the order matters
    # because e.g. "v1.6-34b" also contains "v1".
    lowered_name = model_name.lower()
    if "llama-2" in lowered_name:
        conv_mode = "llava_llama_2"
    elif "mistral" in lowered_name:
        conv_mode = "mistral_instruct"
    elif "v1.6-34b" in lowered_name:
        conv_mode = "chatml_direct"
    elif "v1" in lowered_name:
        conv_mode = "llava_v1"
    elif "mpt" in lowered_name:
        conv_mode = "mpt"
    else:
        conv_mode = "llava_v0"

    if args.conv_mode is not None and conv_mode != args.conv_mode:
        # An explicit --conv-mode wins over the inferred one.
        print(
            "[WARNING] the auto inferred conversation mode is {}, while `--conv-mode` is {}, using {}".format(
                conv_mode, args.conv_mode, args.conv_mode
            )
        )
    else:
        args.conv_mode = conv_mode

    conv = conv_templates[args.conv_mode].copy()
    conv.append_message(conv.roles[0], qs)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    image_files = image_parser(args)
    images = load_images(image_files)
    image_sizes = [x.size for x in images]
    images_tensor = process_images(
        images,
        image_processor,
        model.config
    ).to(model.device, dtype=torch.float16)

    # Place the prompt on the same device as the model (and the image tensor
    # above) rather than assuming CUDA.
    input_ids = (
        tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt")
        .unsqueeze(0)
        .to(model.device)
    )

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=images_tensor,
            image_sizes=image_sizes,
            do_sample=args.temperature > 0,
            temperature=args.temperature,
            top_p=args.top_p,
            num_beams=args.num_beams,
            max_new_tokens=args.max_new_tokens,
            use_cache=True,
        )

    outputs = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip()
    print(outputs)
129
+
130
+
131
if __name__ == "__main__":
    # Command-line options, registered in this order: (flag, add_argument kwargs).
    cli_options = [
        ("--model-path", dict(type=str, default="facebook/opt-350m")),
        ("--model-base", dict(type=str, default=None)),
        ("--image-file", dict(type=str, required=True)),
        ("--query", dict(type=str, required=True)),
        ("--conv-mode", dict(type=str, default=None)),
        ("--sep", dict(type=str, default=",")),
        ("--temperature", dict(type=float, default=0.2)),
        ("--top_p", dict(type=float, default=None)),
        ("--num_beams", dict(type=int, default=1)),
        ("--max_new_tokens", dict(type=int, default=512)),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in cli_options:
        parser.add_argument(flag, **options)
    args = parser.parse_args()

    eval_model(args)
llava/eval/summarize_gpt_review.py ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from collections import defaultdict
4
+
5
+ import numpy as np
6
+
7
+ import argparse
8
+
9
def parse_args():
    """Parse the command line of the review summarizer.

    Options: ``-d/--dir``, ``-v/--version``, ``-s/--select`` (zero or more
    values), ``-f/--files`` (zero or more values) and ``-i/--ignore`` (zero or
    more values).
    """
    parser = argparse.ArgumentParser(description='ChatGPT-based QA evaluation.')
    # Single-value options, all defaulting to None.
    for short_flag, long_flag in (('-d', '--dir'), ('-v', '--version')):
        parser.add_argument(short_flag, long_flag, default=None)
    # Multi-value options with their defaults.
    for short_flag, long_flag, fallback in (
        ('-s', '--select', None),
        ('-f', '--files', []),
        ('-i', '--ignore', []),
    ):
        parser.add_argument(short_flag, long_flag, nargs='*', default=fallback)
    return parser.parse_args()
17
+
18
+
19
if __name__ == '__main__':
    # Summarize review JSONL files: for each file print, per category, the
    # ratio of the second mean score to the first (in percent) followed by
    # both mean scores multiplied by 10.
    args = parse_args()

    # argparse yields strings; convert so they can be compared with the
    # question ids stored in the review files.
    args.ignore = [int(x) for x in args.ignore]

    if args.files:
        review_files = args.files
    else:
        # os.listdir(None) lists the current directory, so fall back to '.'
        # explicitly; this also keeps the substring test below from raising
        # TypeError when --dir is omitted.
        listing_dir = args.dir if args.dir is not None else '.'
        dir_is_review_dir = 'review' in listing_dir
        review_files = [
            x for x in os.listdir(listing_dir)
            if x.endswith('.jsonl')
            and (x.startswith(('gpt4_text', 'reviews_', 'review_')) or dir_is_review_dir)
        ]

    for review_file in sorted(review_files):
        config = os.path.basename(review_file).replace('gpt4_text_', '').replace('.jsonl', '')
        # Every --select term must appear in the config name.
        if args.select is not None and any(x not in config for x in args.select):
            continue
        version = '0613' if '0613' in config else '0314'
        if args.version is not None and args.version != version:
            continue
        scores = defaultdict(list)
        print(config)
        with open(os.path.join(args.dir, review_file) if args.dir is not None else review_file) as f:
            for review_str in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not review_str.strip():
                    continue
                review = json.loads(review_str)
                if review['question_id'] in args.ignore:
                    continue
                if 'category' in review:
                    scores[review['category']].append(review['tuple'])
                    scores['all'].append(review['tuple'])
                elif 'tuple' in review:
                    scores['all'].append(review['tuple'])
                else:
                    scores['all'].append(review['score'])
        for k, v in sorted(scores.items()):
            # Column-wise mean over the collected score pairs.
            stats = np.asarray(v).mean(0).tolist()
            stats = [round(x, 3) for x in stats]
            print(k, round(stats[1]/stats[0]*100, 1), round(stats[0] * 10, 1), round(stats[1] * 10, 1))
        print('=================================')
llava/eval/webpage/figures/alpaca.png ADDED
llava/eval/webpage/figures/bard.jpg ADDED
llava/eval/webpage/figures/chatgpt.svg ADDED
llava/eval/webpage/figures/llama.jpg ADDED
llava/eval/webpage/figures/swords_FILL0_wght300_GRAD0_opsz48.svg ADDED
llava/eval/webpage/figures/vicuna.jpeg ADDED
llava/eval/webpage/index.html ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <meta name="viewport" content="width=device-width, initial-scale=1.0">
6
+ <title>Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots</title>
7
+ <link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/css/bootstrap.min.css">
8
+ <link rel="stylesheet" href="https://fonts.googleapis.com/icon?family=Material+Icons">
9
+ <link rel="stylesheet" href="styles.css">
10
+ </head>
11
+
12
+ <body>
13
+ <nav class="navbar navbar-expand-lg navbar-dark bg-dark">
14
+ <a class="navbar-brand" href="#">🏔️ Vicuna Evaluation Examples</a>
15
+ <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbarNav" aria-controls="navbarNav" aria-expanded="false" aria-label="Toggle navigation">
16
+ <span class="navbar-toggler-icon"></span>
17
+ </button>
18
+ <div class="collapse navbar-collapse" id="navbarNav">
19
+ <ul class="navbar-nav mr-auto">
20
+ <li class="nav-item">
21
+ <a class="nav-link" href="https://chat.lmsys.org/">Demo</a>
22
+ </li>
23
+ <li class="nav-item">
24
+ <a class="nav-link" href="https://vicuna.lmsys.org">Blog</a>
25
+ </li>
26
+ <li class="nav-item">
27
+ <a class="nav-link" href="https://github.com/lm-sys/FastChat">GitHub</a>
28
+ </li>
29
+ </ul>
30
+ </div>
31
+ </nav>
32
+
33
+ <div class="container mt-5">
34
+ <h2 class="text-center mb-5">Who's GPT-4's favorite? Battles between State-of-the-Art Chatbots</h2>
35
+
36
+ <!-- Selection -->
37
+ <div class="form-row">
38
+ <div class="form-group col-md-2">
39
+ <label for="category-select">Category</label>
40
+ <select class="form-control" id="category-select"></select>
41
+ </div>
42
+ <div class="form-group col-md-8">
43
+ <label for="question-select">Question</label>
44
+ <select class="form-control" id="question-select"></select>
45
+ </div>
46
+ <div class="form-group col-md-2">
47
+ <div class="col-md-2"><label>&nbsp;</label></div>
48
+ <div class="btn-group" role="group" aria-label="Left and Right Controller">
49
+ <button type="button" class="form-control btn btn-primary" id="prev-question"><i class="material-icons">keyboard_arrow_left</i></button>
50
+ <button type="button" class="form-control btn btn-primary" id="next-question"><i class="material-icons">keyboard_arrow_right</i></button>
51
+ </div>
52
+ </div>
53
+ </div>
54
+
55
+ <!-- "Battle" -->
56
+ <div class="row mb-4" style="justify-content: center;">
57
+ <div class="col" style="display: flex; justify-content: center; align-items: center;">
58
+ <label class="adjustable-font-size" id="other-score-label">*/10</label>
59
+ </div>
60
+ <div class="col">
61
+ <div class="vertical-flex-layout">
62
+ <img class="shadow figure-img img-fluid" src="" alt="other logo" width="150" id="other-model-figure">
63
+ </div>
64
+ </div>
65
+ <div class="col">
66
+ <div class="vertical-flex-layout">
67
+ <!-- from: https://fonts.google.com/icons?icon.query=battle&selected=Material+Symbols+Outlined:swords:FILL@0;wght@300;GRAD@0;opsz@48&icon.style=Outlined -->
68
+ <img class="figure-img img-fluid" src="figures/swords_FILL0_wght300_GRAD0_opsz48.svg" width="60" height="60">
69
+ </div>
70
+ </div>
71
+ <div class="col">
72
+ <div class="vertical-flex-layout">
73
+ <img class="shadow figure-img img-fluid" src="figures/vicuna.jpeg" alt="vicuna logo" width="150" id="our-model-figure">
74
+ </div>
75
+ </div>
76
+ <div class="col" style="display: flex; justify-content: center; align-items: center;">
77
+ <label class="adjustable-font-size" id="our-score-label">*/10</label>
78
+ </div>
79
+ </div>
80
+
81
+ <!-- Question Card -->
82
+ <div class="card mb-4">
83
+ <div class="card-body" id="selected-question"></div>
84
+ </div>
85
+
86
+ <!-- Answer Cards -->
87
+ <div class="row">
88
+ <div class="col-md-6">
89
+ <div class="card mb-4 expandable-card">
90
+ <div class="card-header" style="padding-bottom: 0.2rem" id="other-model-header-bg">
91
+ <div class="row">
92
+ <div class="col-md-5" style="align-items: center; display: flex;">
93
+ <label id="other-model-header">Assistant #1</label>
94
+ </div>
95
+ <div class="col-md-7">
96
+ <select class="form-control" id="model-select" style="height: fit-content; margin-top: -0.3rem;"></select>
97
+ </div>
98
+ </div>
99
+ </div>
100
+ <div class="card-body">
101
+ <div class="card-text-container">
102
+ <div class="card-text" id="other-model-answer"></div>
103
+ </div>
104
+ <div class="btn btn-primary expand-btn" style="display:flex;"></div>
105
+ </div>
106
+ </div>
107
+ </div>
108
+ <div class="col-md-6">
109
+ <div class="card mb-4 expandable-card">
110
+ <div class="card-header" id="our-model-header">
111
+ Assistant #2 (Vicuna, our model)
112
+ </div>
113
+ <div class="card-body">
114
+ <div class="card-text-container">
115
+ <div class="card-text" id="our-model-answer"></div>
116
+ </div>
117
+ <div class="btn btn-primary expand-btn" style="display:flex;"></div>
118
+ </div>
119
+ </div>
120
+ </div>
121
+ </div>
122
+
123
+ <!-- Evaluation -->
124
+ <div class="card expandable-card">
125
+ <div class="card-header" style="background-color: #c9c9f2;" id="evaluation-header">GPT-4 Evaluation</div>
126
+ <div class="card-body">
127
+ <div class="card-text-container">
128
+ <div class="card-text" id="evaluation-result"></div>
129
+ </div>
130
+ <div class="btn btn-primary expand-btn" style="display:flex;"></div>
131
+ </div>
132
+ </div>
133
+ </div>
134
+
135
+ <div class="container-fluid bg-light py-2">
136
+ <div class="text-center">
137
+ <small class="text-muted">This website is co-authored with <a href="https://openai.com" target="_blank">GPT-4</a>.</small>
138
+ </div>
139
+ </div>
140
+
141
+ <!-- Marked.js -->
142
+ <script src="https://cdn.jsdelivr.net/npm/marked@4.3.0/lib/marked.umd.min.js"></script>
143
+ <!-- Bootstrap and Popper.js JavaScript dependencies -->
144
+ <script src="https://code.jquery.com/jquery-3.5.1.slim.min.js"></script>
145
+ <script src="https://cdn.jsdelivr.net/npm/@popperjs/core@2.11.6/dist/umd/popper.min.js"></script>
146
+ <script src="https://maxcdn.bootstrapcdn.com/bootstrap/4.5.2/js/bootstrap.min.js"></script>
147
+
148
+ <script src="script.js"></script>
149
+ <script>
150
// Fetch the JSON file and initialise the page from it.
fetch('data.json')
    .then(response => {
        // fetch() only rejects on network failure; turn HTTP errors (404, 500, ...)
        // into a clear error instead of an opaque JSON parse failure.
        if (!response.ok) {
            throw new Error('Failed to load data.json: HTTP ' + response.status);
        }
        return response.json();
    })
    .then(json_data => {
        // Populate the models and questions.
        populateModels(json_data.models);
        populateQuestions(json_data.questions);
        displayQuestion(currentQuestionIndex);
    })
    .catch(error => console.error(error));
159
+ </script>
160
+ </body>
161
+
162
+ </html>
llava/eval/webpage/script.js ADDED
@@ -0,0 +1,245 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ // Description: Script for the evaluation webpage.
2
+
3
// Id of the question currently shown; question ids start from 1.
let currentQuestionIndex = 1;

// Store the model name mapping for later use.
// Maps a model key (as used in the page data) to its display name.
modelNameMapping = {
    "gpt35": "ChatGPT-3.5",
    "gpt4": "GPT-4",
    "alpaca": "Alpaca-13b",
    "vicuna": "Vicuna-13b",
    "llama": "LLaMA-13b",
    "bard": "Bard",
};

// Maps a model key to the path of its logo image.
// NOTE(review): there is no entry for "gpt4" here although modelNameMapping has one.
modelFigureMapping = {
    "vicuna": "figures/vicuna.jpeg",
    // Image from: https://commons.wikimedia.org/wiki/File:ChatGPT_logo.svg
    "gpt35": "figures/chatgpt.svg",
    // Image from: https://www.reddit.com/r/logodesign/comments/1128aat/google_ai_bard_logo_design/
    "bard": "figures/bard.jpg",
    // Image from: https://crfm.stanford.edu/2023/03/13/alpaca.html
    "alpaca": "figures/alpaca.png",
    // Image adapted from https://commons.wikimedia.org/wiki/File:Llama_on_Machu_Picchu.jpg
    "llama": "figures/llama.jpg",
}

// Store the question data in a mapping for later use.
// Keyed by question id; filled in by populateQuestions.
questionMapping = {};
// Store the question ids in a mapping for later use.
// Keyed by category name; each value is the list of question ids in that category.
categoryMapping = {};
// Store the number of questions for later use.
questionsCount = 0;
33
+
34
+
35
/**
 * Normalize text for markdown rendering and render it with marked.
 * Double newlines are first collapsed to single ones, then every newline is
 * doubled, so each line of the trimmed input becomes its own paragraph.
 */
function text2Markdown(text) {
    const collapsed = text.trim().replaceAll('\n\n', '\n');
    const paragraphs = collapsed.replaceAll('\n', '\n\n');
    return marked.parse(paragraphs);
}
40
+
41
/**
 * Return str with its first character upper-cased.
 * Falsy or empty input is returned unchanged.
 */
function capitalizeFirstChar(str) {
    if (str && str.length !== 0) {
        const head = str.charAt(0).toUpperCase();
        const tail = str.slice(1);
        return head + tail;
    }
    return str;
}
47
+
48
/**
 * Rebuild the question dropdown so it lists every question in the same
 * category as question_id, then select question_id.
 */
function updateQuestionSelect(question_id) {
    const select = document.getElementById('question-select');
    // Clear the question select.
    select.innerHTML = '';
    // Populate the question select.
    // Declared locally: the original assignment created an implicit global.
    const category = questionMapping[question_id].category;
    // The callback parameter has its own name so it no longer shadows question_id.
    categoryMapping[category].forEach(id => {
        const question = questionMapping[id];
        const option = document.createElement('option');
        option.value = id;
        option.textContent = 'Q' + id.toString() + ': ' + question.question;
        select.appendChild(option);
    });
    select.value = question_id;
}
63
+
64
/**
 * Show the logo of the model currently chosen in the model dropdown.
 */
function updateModelSelect() {
    const select = document.getElementById('model-select');
    // Declared locally: the original assignment created an implicit global.
    const imgPath = modelFigureMapping[select.value];
    document.getElementById('other-model-figure').src = imgPath;
}
69
+
70
/**
 * Add one dropdown option per model key (labelled with its display name),
 * then refresh the model logo.
 */
function populateModels(models) {
    const modelSelect = document.getElementById('model-select');
    models.forEach(modelKey => {
        const entry = document.createElement('option');
        entry.textContent = modelNameMapping[modelKey];
        entry.value = modelKey;
        modelSelect.appendChild(entry);
    });
    updateModelSelect();
}
80
+
81
/**
 * Index the questions by id and by category, add one category dropdown
 * option per distinct category, and build the question dropdown for the
 * current question.
 */
function populateQuestions(questions) {
    const category_select = document.getElementById('category-select');

    questionsCount = questions.length;
    questions.forEach(question => {
        // Store the question data in a mapping for later use.
        questionMapping[question.id] = {
            category: question.category,
            question: question.question,
            answers: question.answers,
            evaluations: question.evaluations,
            scores: question.scores,
        };
        // Store the question id in the category mapping.
        if (question.category in categoryMapping) {
            categoryMapping[question.category].push(question.id);
        } else {
            // First question of this category: start its id list and add the
            // category to the dropdown.
            categoryMapping[question.category] = [question.id];
            const category_option = document.createElement('option');
            category_option.value = question.category;
            category_option.textContent = capitalizeFirstChar(question.category);
            category_select.appendChild(category_option);
        }
    });
    // Set the default category.
    updateQuestionSelect(currentQuestionIndex);
}
109
+
110
/**
 * Render the text of the question with the given id, then its answers.
 */
function displayQuestion(index) {
    const questionText = questionMapping[index].question;
    const card = document.getElementById('selected-question');
    card.innerHTML = text2Markdown('**Question:** ' + questionText);
    displayAnswers(index);
}
115
+
116
/**
 * Render both answers, the evaluation text and the scores for the question
 * with the given id, compared against the model chosen in the model dropdown.
 * Highlights the winner (or both on a tie) and resets the expandable cards.
 */
function displayAnswers(index) {
    const question = questionMapping[index];
    const otherModel = document.getElementById('model-select').value;
    // render the answers with markdown
    // NOTE(review): the rendered markdown is assigned to innerHTML unsanitized;
    // this presumably relies on data.json being trusted content - confirm.
    document.getElementById('other-model-answer').innerHTML = text2Markdown(question.answers[otherModel]);
    document.getElementById('our-model-answer').innerHTML = text2Markdown(question.answers.vicuna);

    // Display evaluation
    // score[0] is the other model's score, score[1] is Vicuna's.
    // Declared locally: the original assignments created implicit globals.
    const score = question.scores[otherModel];
    const scoreText = modelNameMapping[otherModel] + " " + score[0] + "/10, Vicuna-13b " + score[1] + "/10";
    document.getElementById('evaluation-header').textContent = "GPT-4 Evaluation" + " (Score: " + scoreText + ")";
    document.getElementById('evaluation-result').innerHTML = text2Markdown(question.evaluations[otherModel]);

    // Update model names
    let assistant1_title = "Assistant #1";
    let assistant2_title = "Assistant #2 (Vicuna-13b, our model)";
    // Update scores/labels.
    let assistant1_score_label = score[0].toString() + '/10';
    let assistant2_score_label = score[1].toString() + '/10';

    const colorRed = '#fa9';
    const colorBlue = '#8ef';
    const colorYellow = '#fe7';
    let otherModelHeaderColor = '';
    let ourModelHeaderColor = '';
    // Update the winner: trophy + yellow on a tie, otherwise trophy + blue for
    // the winner and red for the loser.
    if (score[0] == score[1]) {
        assistant1_title = '🏆 ' + assistant1_title;
        assistant1_score_label = '🏆 ' + assistant1_score_label;
        assistant2_title = '🏆 ' + assistant2_title;
        assistant2_score_label = '🏆 ' + assistant2_score_label;
        otherModelHeaderColor = colorYellow;
        ourModelHeaderColor = colorYellow;
    } else if (score[0] > score[1]) {
        assistant1_title = '🏆 ' + assistant1_title;
        assistant1_score_label = '🏆 ' + assistant1_score_label;
        otherModelHeaderColor = colorBlue;
        ourModelHeaderColor = colorRed;
    } else if (score[0] < score[1]) {
        assistant2_title = '🏆 ' + assistant2_title;
        assistant2_score_label = '🏆 ' + assistant2_score_label;
        otherModelHeaderColor = colorRed;
        ourModelHeaderColor = colorBlue;
    }

    document.getElementById('other-model-header-bg').style.backgroundColor = otherModelHeaderColor;
    document.getElementById('our-model-header').style.backgroundColor = ourModelHeaderColor;

    document.getElementById('other-model-header').textContent = assistant1_title;
    document.getElementById('our-model-header').textContent = assistant2_title;

    document.getElementById('other-score-label').textContent = assistant1_score_label;
    document.getElementById('our-score-label').textContent = assistant2_score_label;

    // Reset the expanded state and update expand buttons visibility for all
    // expandable cards after displaying answers.
    document.querySelectorAll('.expandable-card').forEach(card => {
        card.classList.remove('expanded');
        updateExpandButtonVisibility(card);
        const expandBtn = card.querySelector('.expand-btn');
        expandBtn.innerHTML = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_down</i> Show more';
    });
}
180
+
181
// Picking a question shows it.
document.getElementById('question-select').addEventListener('change', event => {
    const selectedId = parseInt(event.target.value);
    currentQuestionIndex = selectedId;
    displayQuestion(selectedId);
});

// Picking a category jumps to the first question of that category.
document.getElementById('category-select').addEventListener('change', event => {
    const [firstQuestionId] = categoryMapping[event.target.value];
    currentQuestionIndex = firstQuestionId;
    updateQuestionSelect(firstQuestionId);
    displayQuestion(firstQuestionId);
});

// Picking another model re-renders the answers, refreshes the expand buttons
// and swaps the model logo.
document.getElementById('model-select').addEventListener('change', () => {
    displayAnswers(currentQuestionIndex);
    for (const card of document.querySelectorAll('.expandable-card')) {
        updateExpandButtonVisibility(card);
    }
    updateModelSelect();
});
202
+
203
/**
 * Sync the dropdowns with currentQuestionIndex and display that question.
 * When the question belongs to a different category than the one selected,
 * the category dropdown is switched and the question list rebuilt.
 */
function switchQuestionAndCategory() {
    document.getElementById('question-select').value = currentQuestionIndex;
    // Declared locally: the original assignments created implicit globals.
    const categorySelect = document.getElementById('category-select');
    const oldCategory = categorySelect.value;
    const newCategory = questionMapping[currentQuestionIndex].category;
    if (oldCategory != newCategory) {
        categorySelect.value = newCategory;
        updateQuestionSelect(currentQuestionIndex);
    }
    displayQuestion(currentQuestionIndex);
}
213
+
214
// Step back one question, never going below the first id (ids start from 1).
document.getElementById('prev-question').addEventListener('click', () => {
    const previousIndex = currentQuestionIndex - 1;
    currentQuestionIndex = Math.max(1, previousIndex);
    switchQuestionAndCategory();
});

// Step forward one question, never going past questionsCount.
document.getElementById('next-question').addEventListener('click', () => {
    const followingIndex = currentQuestionIndex + 1;
    currentQuestionIndex = Math.min(questionsCount, followingIndex);
    switchQuestionAndCategory();
});
225
+
226
/**
 * Show the card's expand button only when its text overflows the container;
 * otherwise hide the button and mark the card as expanded.
 */
function updateExpandButtonVisibility(card) {
    const textBox = card.querySelector('.card-text-container');
    const toggle = card.querySelector('.expand-btn');
    const overflows = textBox.scrollHeight > textBox.offsetHeight;
    if (!overflows) {
        toggle.style.display = 'none';
        card.classList.add('expanded');
        return;
    }
    toggle.style.display = 'flex';
}
236
+
237
// Clicking an expand button toggles its card and flips the button label.
for (const button of document.querySelectorAll('.expand-btn')) {
    button.addEventListener('click', event => {
        const owner = event.target.closest('.expandable-card');
        owner.classList.toggle('expanded');
        if (owner.classList.contains('expanded')) {
            event.target.innerHTML = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_up</i> Show less';
        } else {
            event.target.innerHTML = '<i class="material-icons" style="pointer-events: none">keyboard_arrow_down</i> Show more';
        }
    });
}
llava/eval/webpage/styles.css ADDED
@@ -0,0 +1,105 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Page-wide font and background. */
body {
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
    background-color: #f8f9fa;
}

/* Navigation links in the dark navbar. */
.navbar-dark .navbar-nav .nav-link {
    color: #f1cf68;
    font-size: 1.1rem;
    padding: 0.5rem 0.6rem;
}

.card-header {
    font-weight: bold;
}

/* Cards get a soft shadow that deepens on hover. */
.card {
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    transition: 0.3s;
}

.card:hover {
    box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2);
}

button {
    transition: background-color 0.3s;
}

button:hover {
    background-color: #007bff;
}

/* Narrow viewports: tighter spacing between form groups. */
@media (max-width: 767px) {
    .form-row .form-group {
        margin-bottom: 10px;
    }
}

/* Extra styles */

/* Collapsed cards clip their text at 200px. */
.expandable-card .card-text-container {
    max-height: 200px;
    overflow-y: hidden;
    position: relative;
}

/* Expanded cards show their full text. */
.expandable-card.expanded .card-text-container {
    max-height: none;
}

/* Hidden by default; script.js sets the display when the text overflows. */
.expand-btn {
    position: relative;
    display: none;
    background-color: rgba(255, 255, 255, 0.8);
    color: #510c75;
    border-color: transparent;
}

.expand-btn:hover {
    background-color: rgba(200, 200, 200, 0.8);
    text-decoration: none;
    border-color: transparent;
    color: #510c75;
}

.expand-btn:focus {
    outline: none;
    text-decoration: none;
}

/* Fade-out gradient over the bottom of a collapsed card's text. */
.expandable-card:not(.expanded) .card-text-container:after {
    content: "";
    position: absolute;
    bottom: 0;
    left: 0;
    width: 100%;
    height: 90px;
    background: linear-gradient(rgba(255, 255, 255, 0.2), rgba(255, 255, 255, 1));
}

/* Pull the button up so it overlaps the fade-out gradient. */
.expandable-card:not(.expanded) .expand-btn {
    margin-top: -40px;
}

.card-body {
    padding-bottom: 5px;
}

/* Centered vertical stack. */
.vertical-flex-layout {
    justify-content: center;
    align-items: center;
    height: 100%;
    display: flex;
    flex-direction: column;
    gap: 5px;
}

.figure-img {
    max-width: 100%;
    height: auto;
}

/* Font size that grows with the viewport width. */
.adjustable-font-size {
    font-size: calc(0.5rem + 2vw);
}
llava/mm_utils.py ADDED
@@ -0,0 +1,247 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from PIL import Image
2
+ from io import BytesIO
3
+ import base64
4
+ import torch
5
+ import math
6
+ import ast
7
+
8
+ from transformers import StoppingCriteria
9
+ from llava.constants import IMAGE_TOKEN_INDEX
10
+
11
+
12
def select_best_resolution(original_size, possible_resolutions):
    """
    Pick the candidate resolution that best fits an image of the given size.

    For each candidate the image is scaled (aspect ratio preserved) to fit
    inside it. The candidate keeping the most image pixels wins, where kept
    pixels are capped at the original pixel count; ties go to the candidate
    with the least unused area, and remaining ties to the earliest candidate.

    Args:
        original_size (tuple): The original size of the image in the format (width, height).
        possible_resolutions (list): Candidate resolutions in the format [(width1, height1), (width2, height2), ...].

    Returns:
        tuple: The best fit resolution in the format (width, height), or None when there are no candidates.
    """
    src_w, src_h = original_size
    src_area = src_w * src_h

    chosen = None
    best_effective = 0
    least_wasted = float('inf')

    for cand_w, cand_h in possible_resolutions:
        ratio = min(cand_w / src_w, cand_h / src_h)
        fitted_area = int(src_w * ratio) * int(src_h * ratio)
        effective = min(fitted_area, src_area)
        wasted = (cand_w * cand_h) - effective

        # Skip candidates that keep fewer pixels, or keep the same number
        # without wasting less area.
        if effective < best_effective:
            continue
        if effective == best_effective and wasted >= least_wasted:
            continue

        best_effective = effective
        least_wasted = wasted
        chosen = (cand_w, cand_h)

    return chosen
40
+
41
+
42
def resize_and_pad_image(image, target_resolution):
    """
    Fit an image into a target resolution, keeping its aspect ratio.

    The image is resized so it fits inside the target, then pasted centered
    onto a black RGB canvas of exactly the target size.

    Args:
        image (PIL.Image.Image): The input image.
        target_resolution (tuple): The target resolution (width, height) of the image.

    Returns:
        PIL.Image.Image: The resized and padded image.
    """
    src_w, src_h = image.size
    dst_w, dst_h = target_resolution

    ratio_w = dst_w / src_w
    ratio_h = dst_h / src_h

    # The smaller ratio is the binding one; the other dimension is rounded up
    # and clamped to the target.
    if ratio_w < ratio_h:
        fit_w, fit_h = dst_w, min(math.ceil(src_h * ratio_w), dst_h)
    else:
        fit_w, fit_h = min(math.ceil(src_w * ratio_h), dst_w), dst_h

    scaled = image.resize((fit_w, fit_h))

    canvas = Image.new('RGB', (dst_w, dst_h), (0, 0, 0))
    offset = ((dst_w - fit_w) // 2, (dst_h - fit_h) // 2)
    canvas.paste(scaled, offset)

    return canvas
75
+
76
+
77
def divide_to_patches(image, patch_size):
    """
    Cut an image into square patches, row by row from the top-left corner.

    Crop boxes always span ``patch_size`` in both directions, including boxes
    that start near the right or bottom edge.

    Args:
        image (PIL.Image.Image): The input image.
        patch_size (int): The size of each patch.

    Returns:
        list: A list of PIL.Image.Image objects representing the patches.
    """
    width, height = image.size
    return [
        image.crop((left, top, left + patch_size, top + patch_size))
        for top in range(0, height, patch_size)
        for left in range(0, width, patch_size)
    ]
97
+
98
+
99
def get_anyres_image_grid_shape(image_size, grid_pinpoints, patch_size):
    """
    Compute the patch-grid shape used for an image of arbitrary resolution.

    Args:
        image_size (tuple): The size of the input image in the format (width, height).
        grid_pinpoints (list or str): The possible resolutions, either as a list or as
            a string holding the literal of such a list.
        patch_size (int): The size of each image patch.

    Returns:
        tuple: The shape of the image patch grid in the format (width, height).
    """
    # A list is used as-is; anything else is parsed as a Python literal.
    candidates = grid_pinpoints if type(grid_pinpoints) is list else ast.literal_eval(grid_pinpoints)
    best_width, best_height = select_best_resolution(image_size, candidates)
    return best_width // patch_size, best_height // patch_size
117
+
118
+
119
def process_anyres_image(image, processor, grid_pinpoints):
    """
    Process an image with variable resolutions.

    The image is resized and padded to the best-matching candidate
    resolution, cut into patches of ``processor.crop_size['height']``, and a
    square resize of the whole image is prepended as the first entry.

    Args:
        image (PIL.Image.Image): The input image to be processed.
        processor: The image processor object. This function reads its
            ``crop_size['height']`` and ``size['shortest_edge']`` and calls
            its ``preprocess`` method.
        grid_pinpoints (list | str): The candidate resolutions, either as a list
            or as a string holding the Python literal for such a list.

    Returns:
        torch.Tensor: A tensor containing the processed image patches, stacked
        along a new leading dimension.
    """
    # isinstance (rather than an exact type check) lets list subclasses be
    # used directly instead of being handed to literal_eval.
    if isinstance(grid_pinpoints, list):
        possible_resolutions = grid_pinpoints
    else:
        possible_resolutions = ast.literal_eval(grid_pinpoints)
    best_resolution = select_best_resolution(image.size, possible_resolutions)
    image_padded = resize_and_pad_image(image, best_resolution)

    patches = divide_to_patches(image_padded, processor.crop_size['height'])

    # Whole-image view, resized to a square of the processor's shortest edge.
    shortest_edge = processor.size['shortest_edge']
    image_original_resize = image.resize((shortest_edge, shortest_edge))

    image_patches = [image_original_resize] + patches
    image_patches = [processor.preprocess(image_patch, return_tensors='pt')['pixel_values'][0]
                     for image_patch in image_patches]
    return torch.stack(image_patches, dim=0)
146
+
147
+
148
def load_image_from_base64(image):
    """Decode a base64-encoded image payload and open it as a PIL image."""
    raw_bytes = base64.b64decode(image)
    return Image.open(BytesIO(raw_bytes))
150
+
151
+
152
def expand2square(pil_img, background_color):
    """
    Pad an image to a square canvas, centering it along the shorter axis.

    Args:
        pil_img (PIL.Image.Image): The image to pad.
        background_color: Fill colour for the new canvas, passed to ``Image.new``.

    Returns:
        PIL.Image.Image: ``pil_img`` itself when it is already square,
        otherwise a new image whose side equals the longer edge.
    """
    width, height = pil_img.size
    if width == height:
        return pil_img

    side = max(width, height)
    canvas = Image.new(pil_img.mode, (side, side), background_color)
    if width > height:
        # Landscape: centre vertically.
        top_left = (0, (width - height) // 2)
    else:
        # Portrait: centre horizontally.
        top_left = ((height - width) // 2, 0)
    canvas.paste(pil_img, top_left)
    return canvas
164
+
165
+
166
def process_images(images, image_processor, model_cfg):
    """
    Preprocess a batch of images according to ``model_cfg.image_aspect_ratio``.

    - ``'pad'``: each image is padded to a square (filled with the
      processor's mean colour scaled to 0-255) and then preprocessed.
    - ``"anyres"``: each image goes through ``process_anyres_image``.
    - anything else (including a missing attribute): the processor is called
      on the whole batch and its ``pixel_values`` are returned directly.

    Returns:
        For the first two modes, a stacked tensor when all per-image results
        share one shape, otherwise the plain list of per-image tensors.
    """
    aspect_mode = getattr(model_cfg, "image_aspect_ratio", None)

    if aspect_mode == 'pad':
        processed = [
            image_processor.preprocess(
                expand2square(img, tuple(int(x*255) for x in image_processor.image_mean)),
                return_tensors='pt',
            )['pixel_values'][0]
            for img in images
        ]
    elif aspect_mode == "anyres":
        processed = [
            process_anyres_image(img, image_processor, model_cfg.image_grid_pinpoints)
            for img in images
        ]
    else:
        return image_processor(images, return_tensors='pt')['pixel_values']

    same_shape = all(item.shape == processed[0].shape for item in processed)
    if same_shape:
        processed = torch.stack(processed, dim=0)
    return processed
183
+
184
+
185
def tokenizer_image_token(prompt, tokenizer, image_token_index=IMAGE_TOKEN_INDEX, return_tensors=None):
    """
    Tokenize a prompt, replacing each literal ``<image>`` with one placeholder id.

    The prompt is split on ``'<image>'``, each piece is tokenized separately,
    and ``image_token_index`` is inserted once between consecutive pieces.
    If the first piece starts with the tokenizer's BOS id, that id is kept
    once at the front and the first id of every piece is dropped.

    Args:
        prompt (str): Text that may contain ``<image>`` markers.
        tokenizer: Callable returning an object with ``input_ids``; its
            ``bos_token_id`` is also read.
        image_token_index (int): Id inserted for every ``<image>`` marker.
        return_tensors (str | None): ``None`` for a Python list, ``'pt'`` for
            a ``torch.long`` tensor.

    Raises:
        ValueError: If ``return_tensors`` is neither ``None`` nor ``'pt'``.
    """
    chunks = [tokenizer(piece).input_ids for piece in prompt.split('<image>')]

    has_bos = len(chunks) > 0 and len(chunks[0]) > 0 and chunks[0][0] == tokenizer.bos_token_id
    skip = 1 if has_bos else 0
    input_ids = [chunks[0][0]] if has_bos else []

    for position, chunk in enumerate(chunks):
        if position > 0:
            # One placeholder between every pair of neighbouring chunks.
            input_ids.append(image_token_index)
        input_ids.extend(chunk[skip:])

    if return_tensors is None:
        return input_ids
    if return_tensors == 'pt':
        return torch.tensor(input_ids, dtype=torch.long)
    raise ValueError(f'Unsupported tensor type: {return_tensors}')
205
+
206
+
207
def get_model_name_from_path(model_path):
    """
    Derive a short model name from a filesystem path or hub identifier.

    Leading and trailing slashes are ignored. When the last component starts
    with ``checkpoint-`` and has a parent component, the result is
    ``"<parent>_<last>"``; otherwise it is the last component alone.

    Args:
        model_path (str): Path or repo id such as ``"org/model"``.

    Returns:
        str: The derived model name.
    """
    parts = model_path.strip("/").split("/")
    last = parts[-1]
    # A bare "checkpoint-N" has no parent to prefix; return it as-is rather
    # than indexing past the start of the list.
    if last.startswith('checkpoint-') and len(parts) > 1:
        return parts[-2] + "_" + last
    return last
214
+
215
class KeywordsStoppingCriteria(StoppingCriteria):
    """Stopping criterion that fires once every sequence in the batch ends with, or contains, a keyword."""

    def __init__(self, keywords, tokenizer, input_ids):
        self.keywords = keywords
        self.keyword_ids = []
        self.max_keyword_len = 0
        for keyword in keywords:
            ids = tokenizer(keyword).input_ids
            # Drop a leading BOS id so the keyword can match mid-sequence.
            if len(ids) > 1 and ids[0] == tokenizer.bos_token_id:
                ids = ids[1:]
            self.max_keyword_len = max(self.max_keyword_len, len(ids))
            self.keyword_ids.append(torch.tensor(ids))
        self.tokenizer = tokenizer
        # Length of the input_ids given at construction; only tokens after it are decoded.
        self.start_len = input_ids.shape[1]

    def call_for_batch(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        """Check one sequence (given as a batch of size one) for any keyword."""
        window = min(output_ids.shape[1] - self.start_len, self.max_keyword_len)
        self.keyword_ids = [ids.to(output_ids.device) for ids in self.keyword_ids]

        # First an exact token-id match against the tail of the sequence.
        for ids in self.keyword_ids:
            tail = output_ids[0, -ids.shape[0]:]
            if torch.equal(tail, ids):
                return True

        # Then a substring match on the decoded tail.
        decoded = self.tokenizer.batch_decode(output_ids[:, -window:], skip_special_tokens=True)[0]
        return any(keyword in decoded for keyword in self.keywords)

    def __call__(self, output_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        # Every row is evaluated before combining, so build the full list first.
        per_row = [
            self.call_for_batch(output_ids[row].unsqueeze(0), scores)
            for row in range(output_ids.shape[0])
        ]
        return all(per_row)
llava/model/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
# Import the model classes on a best-effort basis: the package stays
# importable even when one of the backends cannot be loaded.
try:
    from .language_model.llava_llama import LlavaLlamaForCausalLM, LlavaConfig
    from .language_model.llava_mpt import LlavaMptForCausalLM, LlavaMptConfig
    from .language_model.llava_mistral import LlavaMistralForCausalLM, LlavaMistralConfig
except Exception as exc:
    # `except Exception` (not a bare `except`) lets KeyboardInterrupt and
    # SystemExit propagate. The failure is reported rather than swallowed so
    # that a later NameError on one of these classes can be traced back here.
    import warnings as _warnings
    _warnings.warn(f"Could not import LLaVA model classes: {exc!r}")
llava/model/apply_delta.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ python3 -m llava.model.apply_delta --base-model-path ~/model_weights/llama-7b --target-model-path ~/model_weights/vicuna-7b --delta-path lmsys/vicuna-7b-delta
4
+ """
5
+ import argparse
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+ from llava import LlavaLlamaForCausalLM
11
+
12
+
13
def apply_delta(base_model_path, target_model_path, delta_path):
    """
    Add base-model weights onto a delta checkpoint and save the result.

    Args:
        base_model_path (str): Location of the base causal LM.
        target_model_path (str): Directory the merged model and the delta's
            tokenizer are written to.
        delta_path (str): Location of the delta weights and tokenizer.

    Raises:
        AssertionError: If a delta parameter is missing from the base model
            and is not an ``mm_projector`` weight/bias, or if shapes differ
            for anything other than the token embeddings or ``lm_head``.
    """
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading delta")
    delta = LlavaLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
    delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)

    print("Applying delta")
    # Build the base state dict once instead of on every lookup in the loop.
    base_state = base.state_dict()
    for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
        if name not in base_state:
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        bparam = base_state[name]
        if param.data.shape == bparam.shape:
            param.data += bparam
        else:
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
                f'{name} dimension mismatch: {param.data.shape} vs {bparam.shape}'
            # Only the region covered by the base tensor is updated; the
            # remaining delta rows/columns are left untouched.
            param.data[:bparam.shape[0], :bparam.shape[1]] += bparam

    print("Saving target model")
    delta.save_pretrained(target_model_path)
    delta_tokenizer.save_pretrained(target_model_path)
38
+
39
+
40
if __name__ == "__main__":
    # All three paths are mandatory string options.
    cli = argparse.ArgumentParser()
    for flag in ("--base-model-path", "--target-model-path", "--delta-path"):
        cli.add_argument(flag, type=str, required=True)

    opts = cli.parse_args()

    apply_delta(opts.base_model_path, opts.target_model_path, opts.delta_path)
llava/model/builder.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import os
17
+ import warnings
18
+ import shutil
19
+
20
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
21
+ import torch
22
+ from llava.model import *
23
+ from llava.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
24
+
25
+
26
def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto", device="cuda", use_flash_attn=False, **kwargs):
    """
    Load a tokenizer and model (LLaVA or plain language model) for inference.

    The loading path is chosen from substrings of ``model_name``
    (case-insensitive): ``'llava'`` selects the LLaVA branch, within which
    ``'lora'``, ``'mpt'`` and ``'mistral'`` select sub-paths; ``model_base``
    decides whether weights are layered on top of a base checkpoint.

    Args:
        model_path: Checkpoint directory or hub repo id of the model / adapter.
        model_base: Base checkpoint to load first, or ``None``.
        model_name: Name used only for the substring dispatch described above.
        load_8bit: Pass ``load_in_8bit=True`` to ``from_pretrained``.
        load_4bit: Pass ``load_in_4bit=True`` plus an NF4 ``BitsAndBytesConfig``
            (only consulted when ``load_8bit`` is false).
        device_map: Forwarded as ``device_map``; overridden by ``{"": device}``
            when ``device`` is not ``"cuda"``.
        device: Target device name.
        use_flash_attn: Request ``attn_implementation='flash_attention_2'``.
        **kwargs: Extra keyword arguments forwarded to ``from_pretrained``.

    Returns:
        tuple: ``(tokenizer, model, image_processor, context_len)``.
        ``image_processor`` is ``None`` unless ``'llava'`` is in the model name;
        ``context_len`` is ``model.config.max_sequence_length`` when that
        attribute exists, else 2048.
    """
    kwargs = {"device_map": device_map, **kwargs}

    if device != "cuda":
        kwargs['device_map'] = {"": device}

    # Precision / quantization: 8-bit, else 4-bit NF4, else plain fp16.
    if load_8bit:
        kwargs['load_in_8bit'] = True
    elif load_4bit:
        kwargs['load_in_4bit'] = True
        kwargs['quantization_config'] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type='nf4'
        )
    else:
        kwargs['torch_dtype'] = torch.float16

    if use_flash_attn:
        kwargs['attn_implementation'] = 'flash_attention_2'

    if 'llava' in model_name.lower():
        # Load LLaVA model
        if 'lora' in model_name.lower() and model_base is None:
            warnings.warn('There is `lora` in model name but no `model_base` is provided. If you are loading a LoRA model, please provide the `model_base` argument. Detailed instruction: https://github.com/haotian-liu/LLaVA#launch-a-model-worker-lora-weights-unmerged.')
        if 'lora' in model_name.lower() and model_base is not None:
            # LoRA checkpoint: load the base weights under the adapter's
            # config, then the non-LoRA trainables, then merge the adapter.
            from llava.model.language_model.llava_llama import LlavaConfig
            lora_cfg_pretrained = LlavaConfig.from_pretrained(model_path)
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            print('Loading LLaVA from base model...')
            model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
            token_num, tokem_dim = model.lm_head.out_features, model.lm_head.in_features
            # Re-allocate head and embeddings when the loaded weight's row
            # count disagrees with the layer's declared output size.
            if model.lm_head.weight.shape[0] != token_num:
                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))
                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, tokem_dim, device=model.device, dtype=model.dtype))

            print('Loading additional LLaVA weights...')
            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
            else:
                # this is probably from HF Hub
                from huggingface_hub import hf_hub_download
                def load_from_hf(repo_id, filename, subfolder=None):
                    cache_file = hf_hub_download(
                        repo_id=repo_id,
                        filename=filename,
                        subfolder=subfolder)
                    return torch.load(cache_file, map_location='cpu')
                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
            # Strip the 'base_model.' prefix (11 chars), and one leading
            # 'model.' (6 chars) when keys are nested as 'model.model.'.
            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
            if any(k.startswith('model.model.') for k in non_lora_trainables):
                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
            model.load_state_dict(non_lora_trainables, strict=False)

            from peft import PeftModel
            print('Loading LoRA weights...')
            model = PeftModel.from_pretrained(model, model_path)
            print('Merging LoRA weights...')
            model = model.merge_and_unload()
            print('Model is loaded...')
        elif model_base is not None:
            # this may be mm projector only
            print('Loading LLaVA from base model...')
            if 'mpt' in model_name.lower():
                # Copy the MPT configuration module next to the checkpoint if it is missing.
                if not os.path.isfile(os.path.join(model_path, 'configuration_mpt.py')):
                    shutil.copyfile(os.path.join(model_base, 'configuration_mpt.py'), os.path.join(model_path, 'configuration_mpt.py'))
                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=True)
                cfg_pretrained = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
                model = LlavaMptForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
                cfg_pretrained = AutoConfig.from_pretrained(model_path)
                model = LlavaLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)

            # Overlay the projector weights (cast to fp16); strict=False
            # because the file holds only a subset of the model's keys.
            mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
            mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
            model.load_state_dict(mm_projector_weights, strict=False)
        else:
            # Full LLaVA checkpoint: pick the wrapper class by backbone name.
            if 'mpt' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
                model = LlavaMptForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
            elif 'mistral' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path)
                model = LlavaMistralForCausalLM.from_pretrained(
                    model_path,
                    low_cpu_mem_usage=True,
                    **kwargs
                )
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
                model = LlavaLlamaForCausalLM.from_pretrained(
                    model_path,
                    low_cpu_mem_usage=True,
                    **kwargs
                )
    else:
        # Load language model
        if model_base is not None:
            # PEFT model
            from peft import PeftModel
            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
            model = AutoModelForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, **kwargs)
            print(f"Loading LoRA weights from {model_path}")
            model = PeftModel.from_pretrained(model, model_path)
            print(f"Merging weights")
            model = model.merge_and_unload()
            print('Convert to FP16...')
            model.to(torch.float16)
        else:
            # NOTE(review): `use_fast` is assigned but never read below.
            use_fast = False
            if 'mpt' in model_name.lower():
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs)
            else:
                tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
                model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)

    image_processor = None

    if 'llava' in model_name.lower():
        # Register the multimodal special tokens the config asks for, then
        # grow the embedding table to the new vocabulary size.
        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
        if mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
        if mm_use_im_start_end:
            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
        model.resize_token_embeddings(len(tokenizer))

        vision_tower = model.get_vision_tower()
        if not vision_tower.is_loaded:
            vision_tower.load_model(device_map=device_map)
        # NOTE(review): this uses the `device_map` argument as given, not the
        # kwargs['device_map'] override applied above for non-"cuda" devices.
        if device_map != 'auto':
            vision_tower.to(device=device_map, dtype=torch.float16)
        image_processor = vision_tower.image_processor

    if hasattr(model.config, "max_sequence_length"):
        context_len = model.config.max_sequence_length
    else:
        context_len = 2048

    return tokenizer, model, image_processor, context_len
llava/model/consolidate.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
4
+ """
5
+ import argparse
6
+
7
+ import torch
8
+ from transformers import AutoTokenizer, AutoModelForCausalLM
9
+ from llava.model import *
10
+ from llava.model.utils import auto_upgrade
11
+
12
+
13
def consolidate_ckpt(src_path, dst_path):
    """
    Re-save a checkpoint and its tokenizer into ``dst_path``.

    ``auto_upgrade`` is run on ``src_path`` first; the model is then loaded in
    fp16 and written out together with the slow tokenizer.
    """
    print("Loading model")
    auto_upgrade(src_path)
    model = AutoModelForCausalLM.from_pretrained(
        src_path,
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
    # Model first, then tokenizer, both into the same directory.
    for artifact in (model, tokenizer):
        artifact.save_pretrained(dst_path)
20
+
21
+
22
if __name__ == "__main__":
    # Source and destination are both mandatory string options.
    cli = argparse.ArgumentParser()
    for flag in ("--src", "--dst"):
        cli.add_argument(flag, type=str, required=True)

    opts = cli.parse_args()

    consolidate_ckpt(opts.src, opts.dst)
llava/model/language_model/llava_llama.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from transformers import AutoConfig, AutoModelForCausalLM, \
22
+ LlamaConfig, LlamaModel, LlamaForCausalLM
23
+
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+ from transformers.generation.utils import GenerateOutput
26
+
27
+ from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
28
+
29
+
30
class LlavaConfig(LlamaConfig):
    """LLaMA configuration subclass identified by the model type ``"llava_llama"``."""
    model_type = "llava_llama"
32
+
33
+
34
class LlavaLlamaModel(LlavaMetaModel, LlamaModel):
    """LLaMA backbone combined with the LLaVA multimodal mixin."""

    config_class = LlavaConfig

    def __init__(self, config: LlamaConfig):
        # Cooperative init through the MRO (LlavaMetaModel first, then LlamaModel).
        super().__init__(config)
39
+
40
+
41
class LlavaLlamaForCausalLM(LlamaForCausalLM, LlavaMetaForCausalLM):
    """LLaMA causal LM whose inputs may contain images handled by the LLaVA mixin."""

    config_class = LlavaConfig

    def __init__(self, config):
        # The MRO lookup starts *after* LlamaForCausalLM, so that class's own
        # __init__ is not run; backbone and head are built explicitly below.
        super(LlamaForCausalLM, self).__init__(config)
        self.model = LlavaLlamaModel(config)
        self.pretraining_tp = config.pretraining_tp
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped ``LlavaLlamaModel``."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, first expanding the inputs via the multimodal mixin when no embeddings are supplied."""
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
                image_sizes
            )
            (input_ids, position_ids, attention_mask,
             past_key_values, inputs_embeds, labels) = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids (and optional images) by handing embeddings to the parent ``generate``.

        Raises:
            NotImplementedError: If the caller passes ``inputs_embeds``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is None:
            # Text-only: plain token embedding lookup.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            (
                inputs,
                position_ids,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                inputs,
                position_ids,
                attention_mask,
                None,
                None,
                images,
                image_sizes=image_sizes
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                      inputs_embeds=None, **kwargs):
        """Build the per-step inputs, re-attaching ``images`` / ``image_sizes`` when they were given."""
        images = kwargs.pop("images", None)
        image_sizes = kwargs.pop("image_sizes", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        for key, value in (("images", images), ("image_sizes", image_sizes)):
            if value is not None:
                model_inputs[key] = value
        return model_inputs
156
+
157
+ AutoConfig.register("llava_llama", LlavaConfig)
158
+ AutoModelForCausalLM.register(LlavaConfig, LlavaLlamaForCausalLM)
llava/model/language_model/llava_mistral.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import CrossEntropyLoss
21
+
22
+ from transformers import AutoConfig, AutoModelForCausalLM, \
23
+ MistralConfig, MistralModel, MistralForCausalLM
24
+
25
+ from transformers.modeling_outputs import CausalLMOutputWithPast
26
+ from transformers.generation.utils import GenerateOutput
27
+
28
+ from ..llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
29
+
30
+
31
class LlavaMistralConfig(MistralConfig):
    """Mistral configuration subclass identified by the model type ``"llava_mistral"``."""
    model_type = "llava_mistral"
33
+
34
+
35
class LlavaMistralModel(LlavaMetaModel, MistralModel):
    """Mistral backbone combined with the LLaVA multimodal mixin."""

    config_class = LlavaMistralConfig

    def __init__(self, config: MistralConfig):
        # Cooperative init through the MRO (LlavaMetaModel first, then MistralModel).
        super().__init__(config)
40
+
41
+
42
class LlavaMistralForCausalLM(MistralForCausalLM, LlavaMetaForCausalLM):
    """Mistral causal LM whose inputs may contain images handled by the LLaVA mixin."""

    config_class = LlavaMistralConfig

    def __init__(self, config):
        # The MRO lookup starts *after* MistralForCausalLM, so that class's own
        # __init__ is not run; backbone and head are built explicitly below.
        super(MistralForCausalLM, self).__init__(config)
        self.model = LlavaMistralModel(config)

        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped ``LlavaMistralModel``."""
        return self.model

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        images: Optional[torch.FloatTensor] = None,
        image_sizes: Optional[List[List[int]]] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        """Run the language model, first expanding the inputs via the multimodal mixin when no embeddings are supplied."""
        if inputs_embeds is None:
            prepared = self.prepare_inputs_labels_for_multimodal(
                input_ids,
                position_ids,
                attention_mask,
                past_key_values,
                labels,
                images,
                image_sizes
            )
            (input_ids, position_ids, attention_mask,
             past_key_values, inputs_embeds, labels) = prepared

        return super().forward(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict
        )

    @torch.no_grad()
    def generate(
        self,
        inputs: Optional[torch.Tensor] = None,
        images: Optional[torch.Tensor] = None,
        image_sizes: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> Union[GenerateOutput, torch.LongTensor]:
        """Generate from token ids (and optional images) by handing embeddings to the parent ``generate``.

        Raises:
            NotImplementedError: If the caller passes ``inputs_embeds``.
        """
        position_ids = kwargs.pop("position_ids", None)
        attention_mask = kwargs.pop("attention_mask", None)
        if "inputs_embeds" in kwargs:
            raise NotImplementedError("`inputs_embeds` is not supported")

        if images is None:
            # Text-only: plain token embedding lookup.
            inputs_embeds = self.get_model().embed_tokens(inputs)
        else:
            (
                inputs,
                position_ids,
                attention_mask,
                _,
                inputs_embeds,
                _
            ) = self.prepare_inputs_labels_for_multimodal(
                inputs,
                position_ids,
                attention_mask,
                None,
                None,
                images,
                image_sizes=image_sizes
            )

        return super().generate(
            position_ids=position_ids,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            **kwargs
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None,
                                      inputs_embeds=None, **kwargs):
        """Build the per-step inputs, re-attaching ``images`` / ``image_sizes`` when they were given."""
        images = kwargs.pop("images", None)
        image_sizes = kwargs.pop("image_sizes", None)
        model_inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        for key, value in (("images", images), ("image_sizes", image_sizes)):
            if value is not None:
                model_inputs[key] = value
        return model_inputs
156
+
157
+ AutoConfig.register("llava_mistral", LlavaMistralConfig)
158
+ AutoModelForCausalLM.register(LlavaMistralConfig, LlavaMistralForCausalLM)
llava/model/language_model/llava_mpt.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import Optional, Tuple
17
+
18
+ import torch
19
+
20
+ from transformers import AutoConfig, AutoModelForCausalLM, \
21
+ MptConfig, MptForCausalLM, MptModel
22
+ from llava.model.llava_arch import LlavaMetaModel, LlavaMetaForCausalLM
23
+
24
+
25
class LlavaMptConfig(MptConfig):
    """MPT configuration subclass identified by the model type ``"llava_mpt"``."""
    model_type = "llava_mpt"
27
+
28
+
29
class LlavaMptModel(LlavaMetaModel, MptModel):
    """MPT backbone combined with the LLaVA multimodal mixin."""

    config_class = LlavaMptConfig

    def __init__(self, config: MptConfig):
        # Mirror MPT's `d_model` under the `hidden_size` name before the
        # parent initializers read the config.
        config.hidden_size = config.d_model
        super().__init__(config)

    def embed_tokens(self, x):
        """Look up token embeddings through MPT's ``wte`` table."""
        return self.wte(x)
38
+
39
+
40
class LlavaMptForCausalLM(MptForCausalLM, LlavaMetaForCausalLM):
    """MPT causal LM whose inputs may contain images handled by the LLaVA mixin."""

    config_class = LlavaMptConfig
    supports_gradient_checkpointing = True

    def __init__(self, config):
        # The MRO lookup starts *after* MptForCausalLM, so that class's own
        # __init__ is not run; backbone and head are built explicitly below.
        super(MptForCausalLM, self).__init__(config)

        self.transformer = LlavaMptModel(config)
        self.lm_head = torch.nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    def get_model(self):
        """Return the wrapped ``LlavaMptModel``."""
        return self.transformer

    def _set_gradient_checkpointing(self, module, value=False):
        # Only the LLaVA MPT backbone carries the checkpointing flag.
        if isinstance(module, LlavaMptModel):
            module.gradient_checkpointing = value

    def forward(
            self,
            input_ids: Optional[torch.LongTensor] = None,
            past_key_values: Optional[Tuple[Tuple[torch.Tensor, torch.Tensor], ...]] = None,
            attention_mask: Optional[torch.Tensor] = None,
            inputs_embeds: Optional[torch.Tensor] = None,
            labels: Optional[torch.Tensor] = None,
            use_cache: Optional[bool] = None,
            output_attentions: Optional[bool] = None,
            output_hidden_states: Optional[bool] = None,
            return_dict: Optional[bool] = None,
            images=None):
        """Run the language model after expanding the inputs via the multimodal mixin."""
        # The LLaMA and Mistral wrappers in this package call the shared mixin
        # as (input_ids, position_ids, attention_mask, past_key_values, labels,
        # images, image_sizes) and unpack six values. Follow the same
        # convention here: this forward has no position_ids, so None is passed
        # and the returned position_ids are discarded.
        # NOTE(review): confirm against the mixin definition in llava_arch.
        (
            input_ids,
            _,
            attention_mask,
            past_key_values,
            inputs_embeds,
            labels
        ) = self.prepare_inputs_labels_for_multimodal(
            input_ids,
            None,
            attention_mask,
            past_key_values,
            labels,
            images
        )

        return super().forward(
            input_ids,
            past_key_values=past_key_values,
            attention_mask=attention_mask,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, inputs_embeds=None, **kwargs):
        """Build the per-step inputs; ``images`` is always attached (possibly as ``None``)."""
        images = kwargs.pop("images", None)
        _inputs = super().prepare_inputs_for_generation(
            input_ids, past_key_values=past_key_values, inputs_embeds=inputs_embeds, **kwargs
        )
        _inputs['images'] = images
        return _inputs
94
+
95
+
96
+ AutoConfig.register("llava_mpt", LlavaMptConfig)
97
+ AutoModelForCausalLM.register(LlavaMptConfig, LlavaMptForCausalLM)
llava/model/llava_arch.py ADDED
@@ -0,0 +1,368 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from abc import ABC, abstractmethod
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+
21
+ from .multimodal_encoder.builder import build_vision_tower
22
+ from .multimodal_projector.builder import build_vision_projector
23
+
24
+ from llava.constants import IGNORE_INDEX, IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
25
+
26
+ from llava.mm_utils import get_anyres_image_grid_shape
27
+
28
+
29
class LlavaMetaModel:
    """Mixin that attaches a vision tower and a projector to a language model.

    It is meant to be combined with a base model class: ``__init__`` forwards
    ``config`` to the next class in the MRO, and the methods read
    ``self.config`` and ``self.dtype`` from that base.
    """

    def __init__(self, config):
        super(LlavaMetaModel, self).__init__(config)

        if not hasattr(config, "mm_vision_tower"):
            return

        # The tower is built with delay_load=True; the projector is built now.
        self.vision_tower = build_vision_tower(config, delay_load=True)
        self.mm_projector = build_vision_projector(config)

        merge_type = getattr(config, 'mm_patch_merge_type', '')
        if 'unpad' in merge_type:
            self.image_newline = nn.Parameter(
                torch.empty(config.hidden_size, dtype=self.dtype)
            )

    def get_vision_tower(self):
        """Return the vision tower, unwrapping the single-element list form."""
        tower = getattr(self, 'vision_tower', None)
        return tower[0] if type(tower) is list else tower

    def initialize_vision_modules(self, model_args, fsdp=None):
        """Create or load the vision tower and projector and record them in the config.

        Args:
            model_args: Object exposing ``vision_tower``,
                ``mm_vision_select_layer``, ``mm_vision_select_feature``,
                ``pretrain_mm_mlp_adapter`` and ``mm_patch_merge_type``.
            fsdp: When non-empty, the tower is stored wrapped in a list.
        """
        vision_tower = model_args.vision_tower
        select_layer = model_args.mm_vision_select_layer
        select_feature = model_args.mm_vision_select_feature
        adapter_path = model_args.pretrain_mm_mlp_adapter
        merge_type = model_args.mm_patch_merge_type

        self.config.mm_vision_tower = vision_tower

        wrap_in_list = fsdp is not None and len(fsdp) > 0
        if self.get_vision_tower() is None:
            vision_tower = build_vision_tower(model_args)
            self.vision_tower = [vision_tower] if wrap_in_list else vision_tower
        else:
            vision_tower = self.vision_tower[0] if wrap_in_list else self.vision_tower
            vision_tower.load_model()

        self.config.use_mm_proj = True
        self.config.mm_projector_type = getattr(model_args, 'mm_projector_type', 'linear')
        self.config.mm_hidden_size = vision_tower.hidden_size
        self.config.mm_vision_select_layer = select_layer
        self.config.mm_vision_select_feature = select_feature
        self.config.mm_patch_merge_type = merge_type

        if getattr(self, 'mm_projector', None) is not None:
            # In case it is frozen by LoRA
            for weight in self.mm_projector.parameters():
                weight.requires_grad = True
        else:
            self.mm_projector = build_vision_projector(self.config)

            if 'unpad' in merge_type:
                embed_std = 1 / torch.sqrt(torch.tensor(self.config.hidden_size, dtype=self.dtype))
                self.image_newline = nn.Parameter(
                    torch.randn(self.config.hidden_size, dtype=self.dtype) * embed_std
                )

        if adapter_path is not None:
            checkpoint = torch.load(adapter_path, map_location='cpu')
            keyword = 'mm_projector'
            # Keep only the projector entries and strip everything up to and
            # including "mm_projector." from their keys.
            projector_state = {
                key.split(keyword + '.')[1]: value
                for key, value in checkpoint.items()
                if keyword in key
            }
            self.mm_projector.load_state_dict(projector_state)
98
+
99
+
100
def unpad_image(tensor, original_size):
    """
    Crop the padding off a tensor holding a padded and resized image.

    Args:
        tensor (torch.Tensor): The image tensor, assumed to be in CxHxW format.
        original_size (tuple): The original size of PIL image (width, height).

    Returns:
        torch.Tensor: The tensor with the symmetric padding sliced away.
    """
    src_w, src_h = original_size
    cur_h, cur_w = tensor.shape[1:]

    if src_w / src_h > cur_w / cur_h:
        # Original is relatively wider: padding sits above and below.
        fitted_h = int(src_h * (cur_w / src_w))
        margin = (cur_h - fitted_h) // 2
        return tensor[:, margin:cur_h - margin, :]

    # Otherwise the padding sits on the left and right.
    fitted_w = int(src_w * (cur_h / src_h))
    margin = (cur_w - fitted_w) // 2
    return tensor[:, :, margin:cur_w - margin]
129
+
130
+
131
class LlavaMetaForCausalLM(ABC):
    """Mixin for causal-LM wrappers that splices image features into text inputs.

    Subclasses provide ``get_model()``. The methods also read ``self.config``
    and ``self.device`` and call the host model's embedding helpers.
    """

    @abstractmethod
    def get_model(self):
        """Return the backbone model that owns the vision tower and projector."""
        pass

    def get_vision_tower(self):
        """Return the backbone's vision tower (may be ``None``)."""
        return self.get_model().get_vision_tower()

    def encode_images(self, images):
        """Run *images* through the vision tower and then the projector."""
        image_features = self.get_model().get_vision_tower()(images)
        image_features = self.get_model().mm_projector(image_features)
        return image_features

    def prepare_inputs_labels_for_multimodal(
        self, input_ids, position_ids, attention_mask, past_key_values, labels,
        images, image_sizes=None
    ):
        """Replace image placeholder tokens with projected image features.

        Returns a six-tuple ``(input_ids, position_ids, attention_mask,
        past_key_values, inputs_embeds, labels)``.  When there is no vision
        tower, no images, or only one token per sequence, the inputs are
        returned unchanged with ``inputs_embeds`` set to ``None``.  Otherwise
        ``input_ids`` is ``None`` and ``inputs_embeds`` holds the padded,
        merged embeddings.
        """
        vision_tower = self.get_vision_tower()
        if vision_tower is None or images is None or input_ids.shape[1] == 1:
            return input_ids, position_ids, attention_mask, past_key_values, None, labels

        if type(images) is list or images.ndim == 5:
            if type(images) is list:
                images = [x.unsqueeze(0) if x.ndim == 3 else x for x in images]
            concat_images = torch.cat(list(images), dim=0)
            image_features = self.encode_images(concat_images)
            split_sizes = [image.shape[0] for image in images]
            image_features = torch.split(image_features, split_sizes, dim=0)
            mm_patch_merge_type = getattr(self.config, 'mm_patch_merge_type', 'flat')
            image_aspect_ratio = getattr(self.config, 'image_aspect_ratio', 'square')
            if mm_patch_merge_type == 'flat':
                image_features = [x.flatten(0, 1) for x in image_features]
            elif mm_patch_merge_type.startswith('spatial'):
                # Go through get_model() rather than ``self.model`` so that
                # wrappers whose backbone attribute has another name work too.
                image_newline = (
                    self.get_model().image_newline if 'unpad' in mm_patch_merge_type else None
                )
                new_image_features = []
                for image_idx, image_feature in enumerate(image_features):
                    if image_feature.shape[0] > 1:
                        # First entry is the base image, the rest are grid patches.
                        base_image_feature = image_feature[0]
                        image_feature = image_feature[1:]
                        height = width = self.get_vision_tower().num_patches_per_side
                        assert height * width == base_image_feature.shape[0]
                        if image_aspect_ratio == 'anyres':
                            num_patch_width, num_patch_height = get_anyres_image_grid_shape(image_sizes[image_idx], self.config.image_grid_pinpoints, self.get_vision_tower().config.image_size)
                            image_feature = image_feature.view(num_patch_height, num_patch_width, height, width, -1)
                        else:
                            raise NotImplementedError
                        if 'unpad' in mm_patch_merge_type:
                            image_feature = image_feature.permute(4, 0, 2, 1, 3).contiguous()
                            image_feature = image_feature.flatten(1, 2).flatten(2, 3)
                            image_feature = unpad_image(image_feature, image_sizes[image_idx])
                            image_feature = torch.cat((
                                image_feature,
                                image_newline[:, None, None].expand(*image_feature.shape[:-1], 1).to(image_feature.device)
                            ), dim=-1)
                            image_feature = image_feature.flatten(1, 2).transpose(0, 1)
                        else:
                            image_feature = image_feature.permute(0, 2, 1, 3, 4).contiguous()
                            image_feature = image_feature.flatten(0, 3)
                        image_feature = torch.cat((base_image_feature, image_feature), dim=0)
                    else:
                        image_feature = image_feature[0]
                        if 'unpad' in mm_patch_merge_type:
                            image_feature = torch.cat((
                                image_feature,
                                image_newline[None].to(image_feature.device)
                            ), dim=0)
                    new_image_features.append(image_feature)
                image_features = new_image_features
            else:
                raise ValueError(f"Unexpected mm_patch_merge_type: {self.config.mm_patch_merge_type}")
        else:
            image_features = self.encode_images(images)

        # TODO: image start / end is not implemented here to support pretraining.
        if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
            raise NotImplementedError

        # Let's just add dummy tensors if they do not exist,
        # it is a headache to deal with None all the time.
        # But it is not ideal, and if you have a better idea,
        # please open an issue / submit a PR, thanks.
        _labels = labels
        _position_ids = position_ids
        _attention_mask = attention_mask
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids, dtype=torch.bool)
        else:
            attention_mask = attention_mask.bool()
        if position_ids is None:
            position_ids = torch.arange(0, input_ids.shape[1], dtype=torch.long, device=input_ids.device)
        if labels is None:
            labels = torch.full_like(input_ids, IGNORE_INDEX)

        # remove the padding using attention_mask -- FIXME
        input_ids = [cur_input_ids[cur_attention_mask] for cur_input_ids, cur_attention_mask in zip(input_ids, attention_mask)]
        labels = [cur_labels[cur_attention_mask] for cur_labels, cur_attention_mask in zip(labels, attention_mask)]

        new_input_embeds = []
        new_labels = []
        cur_image_idx = 0
        for batch_idx, cur_input_ids in enumerate(input_ids):
            num_images = (cur_input_ids == IMAGE_TOKEN_INDEX).sum()
            if num_images == 0:
                # No placeholder in this sample: an empty slice of the next
                # image feature is concatenated and the image index advanced.
                cur_image_features = image_features[cur_image_idx]
                cur_input_embeds_1 = self.get_model().embed_tokens(cur_input_ids)
                cur_input_embeds = torch.cat([cur_input_embeds_1, cur_image_features[0:0]], dim=0)
                new_input_embeds.append(cur_input_embeds)
                new_labels.append(labels[batch_idx])
                cur_image_idx += 1
                continue

            # Split the sequence at each image placeholder; the sentinels -1 and
            # len mark the text segments before the first and after the last one.
            image_token_indices = [-1] + torch.where(cur_input_ids == IMAGE_TOKEN_INDEX)[0].tolist() + [cur_input_ids.shape[0]]
            cur_input_ids_noim = []
            cur_labels = labels[batch_idx]
            cur_labels_noim = []
            for i in range(len(image_token_indices) - 1):
                cur_input_ids_noim.append(cur_input_ids[image_token_indices[i]+1:image_token_indices[i+1]])
                cur_labels_noim.append(cur_labels[image_token_indices[i]+1:image_token_indices[i+1]])
            split_sizes = [x.shape[0] for x in cur_labels_noim]
            cur_input_embeds = self.get_model().embed_tokens(torch.cat(cur_input_ids_noim))
            cur_input_embeds_no_im = torch.split(cur_input_embeds, split_sizes, dim=0)
            cur_new_input_embeds = []
            cur_new_labels = []

            # Interleave text segments with image features; image positions
            # get IGNORE_INDEX labels.
            for i in range(num_images + 1):
                cur_new_input_embeds.append(cur_input_embeds_no_im[i])
                cur_new_labels.append(cur_labels_noim[i])
                if i < num_images:
                    cur_image_features = image_features[cur_image_idx]
                    cur_image_idx += 1
                    cur_new_input_embeds.append(cur_image_features)
                    cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=cur_labels.device, dtype=cur_labels.dtype))

            cur_new_input_embeds = [x.to(self.device) for x in cur_new_input_embeds]

            cur_new_input_embeds = torch.cat(cur_new_input_embeds)
            cur_new_labels = torch.cat(cur_new_labels)

            new_input_embeds.append(cur_new_input_embeds)
            new_labels.append(cur_new_labels)

        # Truncate sequences to max length as image embeddings can make the sequence longer
        tokenizer_model_max_length = getattr(self.config, 'tokenizer_model_max_length', None)
        if tokenizer_model_max_length is not None:
            new_input_embeds = [x[:tokenizer_model_max_length] for x in new_input_embeds]
            new_labels = [x[:tokenizer_model_max_length] for x in new_labels]

        # Combine them
        max_len = max(x.shape[0] for x in new_input_embeds)
        batch_size = len(new_input_embeds)

        new_input_embeds_padded = []
        new_labels_padded = torch.full((batch_size, max_len), IGNORE_INDEX, dtype=new_labels[0].dtype, device=new_labels[0].device)
        attention_mask = torch.zeros((batch_size, max_len), dtype=attention_mask.dtype, device=attention_mask.device)
        position_ids = torch.zeros((batch_size, max_len), dtype=position_ids.dtype, device=position_ids.device)

        # Loop-invariant, so look it up once.
        pad_left = getattr(self.config, 'tokenizer_padding_side', 'right') == "left"

        for i, (cur_new_embed, cur_new_labels) in enumerate(zip(new_input_embeds, new_labels)):
            cur_len = cur_new_embed.shape[0]
            padding = torch.zeros((max_len - cur_len, cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)
            if pad_left:
                new_input_embeds_padded.append(torch.cat((padding, cur_new_embed), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, -cur_len:] = cur_new_labels
                    attention_mask[i, -cur_len:] = True
                    position_ids[i, -cur_len:] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)
            else:
                new_input_embeds_padded.append(torch.cat((cur_new_embed, padding), dim=0))
                if cur_len > 0:
                    new_labels_padded[i, :cur_len] = cur_new_labels
                    attention_mask[i, :cur_len] = True
                    position_ids[i, :cur_len] = torch.arange(0, cur_len, dtype=position_ids.dtype, device=position_ids.device)

        new_input_embeds = torch.stack(new_input_embeds_padded, dim=0)

        # Anything the caller passed as None is handed back as None.
        if _labels is None:
            new_labels = None
        else:
            new_labels = new_labels_padded

        if _attention_mask is None:
            attention_mask = None
        else:
            attention_mask = attention_mask.to(dtype=_attention_mask.dtype)

        if _position_ids is None:
            position_ids = None

        return None, position_ids, attention_mask, past_key_values, new_input_embeds, new_labels

    def initialize_vision_tokenizer(self, model_args, tokenizer):
        """Register the image special tokens and set up their embeddings.

        Reads ``mm_use_im_patch_token``, ``mm_use_im_start_end``,
        ``tune_mm_mlp_adapter`` and ``pretrain_mm_mlp_adapter`` from
        *model_args*.  Newly added start/end token rows are initialised to the
        mean of the existing embedding rows.
        """
        if model_args.mm_use_im_patch_token:
            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

        if model_args.mm_use_im_start_end:
            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
            self.resize_token_embeddings(len(tokenizer))

            if num_new_tokens > 0:
                input_embeddings = self.get_input_embeddings().weight.data
                output_embeddings = self.get_output_embeddings().weight.data

                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)
                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
                    dim=0, keepdim=True)

                input_embeddings[-num_new_tokens:] = input_embeddings_avg
                output_embeddings[-num_new_tokens:] = output_embeddings_avg

            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = True
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False

            if model_args.pretrain_mm_mlp_adapter:
                # NOTE(review): torch.load unpickles the file; only point this
                # at checkpoints from a trusted source.
                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
                assert num_new_tokens == 2
                if input_embeddings.shape == embed_tokens_weight.shape:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
                elif embed_tokens_weight.shape[0] == num_new_tokens:
                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
                else:
                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Numer of new tokens: {num_new_tokens}.")
        elif model_args.mm_use_im_patch_token:
            if model_args.tune_mm_mlp_adapter:
                for p in self.get_input_embeddings().parameters():
                    p.requires_grad = False
                for p in self.get_output_embeddings().parameters():
                    p.requires_grad = False
llava/model/make_delta.py ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Usage:
3
+ python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
4
+ """
5
+ import argparse
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+ from llava.model.utils import auto_upgrade
11
+
12
+
13
def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
    """Subtract the base model's weights from the target's and save the result.

    Args:
        base_model_path: Path or id of the base causal-LM checkpoint.
        target_model_path: Path or id of the target checkpoint.
        delta_path: Directory the delta weights and tokenizer are saved to.
        hub_repo_id: When truthy, the saved artifacts are also pushed to this
            hub repository.

    Raises:
        AssertionError: If the target has an unexpected extra parameter, or a
            shape mismatch outside the embedding / LM-head weights.
    """
    print("Loading base model")
    base = AutoModelForCausalLM.from_pretrained(
        base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Loading target model")
    auto_upgrade(target_model_path)
    target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)

    print("Calculating delta")
    # Build the base state dict once instead of on every loop iteration.
    base_state = base.state_dict()
    for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
        if name not in base_state:
            assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
            continue
        bparam = base_state[name]
        if param.data.shape == bparam.shape:
            param.data -= bparam
        else:
            assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {bparam.shape}'
            # Only the top-left region overlapping the base weight is subtracted.
            param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam

    print("Saving delta")
    kwargs = {"push_to_hub": True, "repo_id": hub_repo_id} if hub_repo_id else {}
    target.save_pretrained(delta_path, **kwargs)
    target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
    target_tokenizer.save_pretrained(delta_path, **kwargs)
42
+
43
+
44
if __name__ == "__main__":
    # Command-line entry point: three required paths plus an optional hub repo id.
    cli = argparse.ArgumentParser()
    for required_flag in ("--base-model-path", "--target-model-path", "--delta-path"):
        cli.add_argument(required_flag, type=str, required=True)
    cli.add_argument("--hub-repo-id", type=str, default=None)
    opts = cli.parse_args()

    make_delta(opts.base_model_path, opts.target_model_path, opts.delta_path, opts.hub_repo_id)
llava/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from .clip_encoder import CLIPVisionTower, CLIPVisionTowerS2
3
+
4
+
5
def build_vision_tower(vision_tower_cfg, **kwargs):
    """Instantiate the CLIP vision tower named by *vision_tower_cfg*.

    The tower name is read from ``mm_vision_tower``, falling back to
    ``vision_tower``.  It is accepted when it is an existing path, starts
    with ``"openai"`` or ``"laion"``, or contains ``"ShareGPT4V"``.  The S2
    variant is used when the config's ``s2`` attribute is truthy.

    Args:
        vision_tower_cfg: Config or argument object describing the tower.
        **kwargs: Forwarded to the tower constructor.

    Raises:
        ValueError: If no tower is configured or the name is not recognised.
    """
    vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
    if vision_tower is None:
        # os.path.exists(None) would raise TypeError; report it the same way
        # as any other unusable tower name.
        raise ValueError(f'Unknown vision tower: {vision_tower}')

    use_s2 = getattr(vision_tower_cfg, 's2', False)
    is_known = (
        os.path.exists(vision_tower)
        or vision_tower.startswith(("openai", "laion"))
        or "ShareGPT4V" in vision_tower
    )
    if is_known:
        if use_s2:
            return CLIPVisionTowerS2(vision_tower, args=vision_tower_cfg, **kwargs)
        return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)

    raise ValueError(f'Unknown vision tower: {vision_tower}')
llava/model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
5
+
6
+
7
class CLIPVisionTower(nn.Module):
    """CLIP vision encoder wrapper that returns one selected hidden-state layer.

    With ``delay_load=True`` (and no ``unfreeze_mm_vision_tower`` on *args*)
    only the CLIP config is fetched; weights are loaded by ``load_model()``.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        super().__init__()

        self.is_loaded = False

        self.vision_tower_name = vision_tower
        self.select_layer = args.mm_vision_select_layer
        self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')

        load_now = (not delay_load) or getattr(args, 'unfreeze_mm_vision_tower', False)
        if load_now:
            self.load_model()
        else:
            self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)

    def load_model(self, device_map=None):
        """Load the image processor and the parameter-frozen CLIP model; no-op if loaded."""
        if self.is_loaded:
            print(f'{self.vision_tower_name} is already loaded, `load_model` called again, skipping.')
            return

        name = self.vision_tower_name
        self.image_processor = CLIPImageProcessor.from_pretrained(name)
        self.vision_tower = CLIPVisionModel.from_pretrained(name, device_map=device_map)
        self.vision_tower.requires_grad_(False)

        self.is_loaded = True

    def feature_select(self, image_forward_outs):
        """Pick the configured hidden-state layer; ``'patch'`` drops index 0 along dim 1."""
        layer_output = image_forward_outs.hidden_states[self.select_layer]
        if self.select_feature == 'patch':
            return layer_output[:, 1:]
        if self.select_feature == 'cls_patch':
            return layer_output
        raise ValueError(f'Unexpected select feature: {self.select_feature}')

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or a list of single images one at a time."""
        if type(images) is not list:
            outputs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
            return self.feature_select(outputs).to(images.dtype)

        per_image_features = []
        for single in images:
            # Each list entry gets a leading batch dimension before encoding.
            batched = single.to(device=self.device, dtype=self.dtype).unsqueeze(0)
            outputs = self.vision_tower(batched, output_hidden_states=True)
            per_image_features.append(self.feature_select(outputs).to(single.dtype))
        return per_image_features

    @property
    def dummy_feature(self):
        """A ``(1, hidden_size)`` zero tensor on the tower's device and dtype."""
        return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)

    @property
    def dtype(self):
        return self.vision_tower.dtype

    @property
    def device(self):
        return self.vision_tower.device

    @property
    def config(self):
        """The loaded model's config, or the config-only copy before loading."""
        return self.vision_tower.config if self.is_loaded else self.cfg_only

    @property
    def hidden_size(self):
        return self.config.hidden_size

    @property
    def num_patches_per_side(self):
        cfg = self.config
        return cfg.image_size // cfg.patch_size

    @property
    def num_patches(self):
        cfg = self.config
        return (cfg.image_size // cfg.patch_size) ** 2
89
+
90
+
91
+
92
class CLIPVisionTowerS2(CLIPVisionTower):
    """CLIP vision tower that extracts features at several scales via ``s2wrapper``.

    Scales come from ``args.s2_scales`` (comma-separated, default
    ``'336,672,1008'``).  The smallest is used as the split size and the
    largest as the image processor's resize / crop size.
    """

    def __init__(self, vision_tower, args, delay_load=False):
        # These must exist before super().__init__(): the parent constructor
        # may call load_model(), and the override below reads s2_image_size.
        scales = sorted(int(s) for s in getattr(args, 's2_scales', '336,672,1008').split(','))
        self.s2_scales = scales
        self.s2_split_size = scales[0]
        self.s2_image_size = scales[-1]

        try:
            from s2wrapper import forward as multiscale_forward
        except ImportError as err:
            raise ImportError('Package s2wrapper not found! Please install by running: \npip install git+https://github.com/bfshi/scaling_on_scales.git') from err
        self.multiscale_forward = multiscale_forward

        # If the parent loads the model here, load_model() already applies
        # the S2 resize / crop size to the image processor.
        super().__init__(vision_tower, args, delay_load)

    def load_model(self, device_map=None):
        """Load processor and model, then set the processor to the largest S2 scale."""
        if self.is_loaded:
            print('{} is already loaded, `load_model` called again, skipping.'.format(self.vision_tower_name))
            return

        super().load_model(device_map=device_map)

        # change resize/crop size in preprocessing to the largest image size in s2_scale
        self.image_processor.size['shortest_edge'] = self.s2_image_size
        self.image_processor.crop_size['height'] = self.image_processor.crop_size['width'] = self.s2_image_size

    @torch.no_grad()
    def forward_feature(self, images):
        """Single-scale feature extraction, used as the callback for ``multiscale_forward``."""
        image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
        image_features = self.feature_select(image_forward_outs).to(images.dtype)
        return image_features

    @torch.no_grad()
    def forward(self, images):
        """Encode a batched tensor, or a list of single images, at all S2 scales."""
        if type(images) is list:
            image_features = []
            for image in images:
                image_feature = self.multiscale_forward(self.forward_feature, image.unsqueeze(0), img_sizes=self.s2_scales, max_split_size=self.s2_split_size)
                image_features.append(image_feature)
        else:
            image_features = self.multiscale_forward(self.forward_feature, images, img_sizes=self.s2_scales, max_split_size=self.s2_split_size)

        return image_features

    @property
    def hidden_size(self):
        """Base CLIP hidden size multiplied by the number of scales."""
        return self.config.hidden_size * len(self.s2_scales)
llava/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import re
4
+
5
+
6
class IdentityMap(nn.Module):
    """Projector that hands its input back unchanged."""

    def forward(self, x, *args, **kwargs):
        """Return *x* as-is; any extra arguments are ignored."""
        return x

    @property
    def config(self):
        """Dict identifying this projector type."""
        return dict(mm_projector_type='identity')
16
+
17
+
18
class SimpleResBlock(nn.Module):
    """Layer-norm followed by a two-layer GELU MLP with a residual connection.

    The residual is taken from the normalised input, not the raw input.
    """

    def __init__(self, channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(channels)

        layers = [
            nn.Linear(channels, channels),
            nn.GELU(),
            nn.Linear(channels, channels),
        ]
        self.proj = nn.Sequential(*layers)

    def forward(self, x):
        normed = self.pre_norm(x)
        return normed + self.proj(normed)
31
+
32
+
33
def build_vision_projector(config, delay_load=False, **kwargs):
    """Build the vision-to-language projector named by ``config.mm_projector_type``.

    Supported types: ``'linear'`` (the default), ``'mlp<N>x_gelu'`` (N linear
    layers with GELU in between) and ``'identity'``.

    Raises:
        ValueError: For any other projector type.
    """
    kind = getattr(config, 'mm_projector_type', 'linear')

    if kind == 'linear':
        return nn.Linear(config.mm_hidden_size, config.hidden_size)

    parsed = re.match(r'^mlp(\d+)x_gelu$', kind)
    if parsed:
        depth = int(parsed.group(1))
        # First layer maps vision width to LM width; later layers keep LM width.
        layers = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
        remaining = depth - 1
        while remaining > 0:
            layers += [nn.GELU(), nn.Linear(config.hidden_size, config.hidden_size)]
            remaining -= 1
        return nn.Sequential(*layers)

    if kind == 'identity':
        return IdentityMap()

    raise ValueError(f'Unknown projector type: {kind}')
llava/model/utils.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import AutoConfig
2
+
3
+
4
def auto_upgrade(config):
    """Interactively upgrade an old-format LLaVA checkpoint config in place.

    *config* is the checkpoint path / id.  Nothing happens unless the path
    contains ``'llava'`` while the loaded config's ``model_type`` does not.
    In that case the user is prompted; on confirmation the config is
    rewritten and saved back to *config*, otherwise the process exits with
    status 1.
    """
    cfg = AutoConfig.from_pretrained(config)

    is_legacy = 'llava' in config and 'llava' not in cfg.model_type
    if not is_legacy:
        return

    assert cfg.model_type == 'llama'
    print("You are using newer LLaVA code base, while the checkpoint of v0 is from older code base.")
    print("You must upgrade the checkpoint to the new code base (this can be done automatically).")
    answer = input("Please confirm that you want to upgrade the checkpoint. [Y/N]")

    if answer.lower() not in ["y", "yes"]:
        print("Checkpoint upgrade aborted.")
        exit(1)

    print("Upgrading checkpoint...")
    assert len(cfg.architectures) == 1
    # model_type is set on the config class itself, not on the instance.
    setattr(cfg.__class__, "model_type", "llava")
    cfg.architectures[0] = 'LlavaLlamaForCausalLM'
    cfg.save_pretrained(config)
    print("Checkpoint upgraded.")