Spaces:
Running
Running
Add support for multimodal processor chat_template.json
Browse files
app.py
CHANGED
|
@@ -13,6 +13,7 @@ hfapi = HfApi()
|
|
| 13 |
|
| 14 |
|
| 15 |
class ModelFiles(StrEnum):
|
|
|
|
| 16 |
TOKENIZER_CHAT_TEMPLATE = "tokenizer_chat_template.jinja"
|
| 17 |
TOKENIZER_CONFIG = "tokenizer_config.json"
|
| 18 |
TOKENIZER_INVERSE_TEMPLATE = "inverse_template.jinja"
|
|
@@ -27,6 +28,7 @@ example_labels = [
|
|
| 27 |
"Tool call with multiple responses",
|
| 28 |
"Tool call with complex tool definition",
|
| 29 |
"RAG call",
|
|
|
|
| 30 |
]
|
| 31 |
example_values = [
|
| 32 |
[
|
|
@@ -315,6 +317,37 @@ example_values = [
|
|
| 315 |
}
|
| 316 |
]""",
|
| 317 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 318 |
]
|
| 319 |
|
| 320 |
|
|
@@ -678,30 +711,32 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
| 678 |
org_template_tool_use = ""
|
| 679 |
org_template_rag = ""
|
| 680 |
|
| 681 |
-
|
| 682 |
-
|
|
|
|
|
|
|
| 683 |
|
| 684 |
-
|
| 685 |
-
|
| 686 |
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
|
| 706 |
tokenizer_chat_template = info.get(ModelFiles.TOKENIZER_CHAT_TEMPLATE, {})
|
| 707 |
org_template = tokenizer_chat_template.get("data", org_template)
|
|
@@ -766,15 +801,33 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
| 766 |
operations = []
|
| 767 |
pr_branch = branch if branch.startswith("refs/pr/") else None
|
| 768 |
|
| 769 |
-
|
| 770 |
-
if
|
| 771 |
-
tokenizer_config = TokenizerConfig(
|
| 772 |
|
| 773 |
tokenizer_config.chat_templates["default"] = template
|
| 774 |
tokenizer_config.chat_templates["tool_use"] = template_tool_use
|
| 775 |
tokenizer_config.chat_templates["rag"] = template_rag
|
| 776 |
# tokenizer_config.inverse_template = template_inverse
|
| 777 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 778 |
new_config = tokenizer_config.json(get_json_indent(org_config))
|
| 779 |
if org_config.endswith("\n"):
|
| 780 |
new_config += "\n"
|
|
@@ -817,6 +870,10 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
| 817 |
|
| 818 |
info["parent_commit"] = commit.oid
|
| 819 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 820 |
if org_config:
|
| 821 |
tokenizer_file["data"] = new_config
|
| 822 |
tokenizer_file["content"] = json.loads(new_config)
|
|
@@ -950,6 +1007,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
| 950 |
"disabled": info.disabled,
|
| 951 |
"gated": info.gated,
|
| 952 |
"private": info.private,
|
|
|
|
| 953 |
}
|
| 954 |
|
| 955 |
template_messages = example_values[0][1]
|
|
@@ -991,6 +1049,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
| 991 |
progress = gr.Progress(track_tqdm = True),
|
| 992 |
oauth_token: gr.OAuthToken | None = None,
|
| 993 |
):
|
|
|
|
| 994 |
write_access = False
|
| 995 |
|
| 996 |
if info and oauth_token:
|
|
@@ -1097,6 +1156,34 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
| 1097 |
else:
|
| 1098 |
gr.Warning(f"No {ModelFiles.TOKENIZER_CONFIG} found in repository...")
|
| 1099 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1100 |
pr_details = None
|
| 1101 |
if branch and branch.startswith("refs/pr/"):
|
| 1102 |
pr_num = branch.split("/")[-1]
|
|
@@ -1129,9 +1216,9 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
| 1129 |
pr_submit: gr.Button(
|
| 1130 |
value = f"Commit to PR #{pr_details.num}" if pr_details else "Create Pull Request",
|
| 1131 |
),
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
# inverse_template: gr.skip() if ModelFiles.TOKENIZER_INVERSE_TEMPLATE not in info else gr.Code(
|
| 1136 |
# value = info[ModelFiles.TOKENIZER_INVERSE_TEMPLATE]["data"],
|
| 1137 |
# ),
|
|
@@ -1198,7 +1285,7 @@ You can freely edit and test GGUF chat template(s) (and are encouraged to do so)
|
|
| 1198 |
pr_preview_title,
|
| 1199 |
pr_description,
|
| 1200 |
pr_submit,
|
| 1201 |
-
|
| 1202 |
# inverse_template,
|
| 1203 |
],
|
| 1204 |
show_api = False,
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class ModelFiles(StrEnum):
|
| 16 |
+
CHAT_TEMPLATE_JSON = "chat_template.json"
|
| 17 |
TOKENIZER_CHAT_TEMPLATE = "tokenizer_chat_template.jinja"
|
| 18 |
TOKENIZER_CONFIG = "tokenizer_config.json"
|
| 19 |
TOKENIZER_INVERSE_TEMPLATE = "inverse_template.jinja"
|
|
|
|
| 28 |
"Tool call with multiple responses",
|
| 29 |
"Tool call with complex tool definition",
|
| 30 |
"RAG call",
|
| 31 |
+
"Multimodal user message",
|
| 32 |
]
|
| 33 |
example_values = [
|
| 34 |
[
|
|
|
|
| 317 |
}
|
| 318 |
]""",
|
| 319 |
],
|
| 320 |
+
[
|
| 321 |
+
"{}",
|
| 322 |
+
"""[
|
| 323 |
+
{
|
| 324 |
+
"role": "user",
|
| 325 |
+
"content": [
|
| 326 |
+
{
|
| 327 |
+
"type": "text",
|
| 328 |
+
"content": "Can this animal"
|
| 329 |
+
},
|
| 330 |
+
{
|
| 331 |
+
"type": "image"
|
| 332 |
+
},
|
| 333 |
+
{
|
| 334 |
+
"type": "text",
|
| 335 |
+
"content": "make this sound"
|
| 336 |
+
},
|
| 337 |
+
{
|
| 338 |
+
"type": "audio"
|
| 339 |
+
},
|
| 340 |
+
{
|
| 341 |
+
"type": "text",
|
| 342 |
+
"content": "while moving like this?"
|
| 343 |
+
},
|
| 344 |
+
{
|
| 345 |
+
"type": "video"
|
| 346 |
+
}
|
| 347 |
+
]
|
| 348 |
+
}
|
| 349 |
+
]"""
|
| 350 |
+
]
|
| 351 |
]
|
| 352 |
|
| 353 |
|
|
|
|
| 711 |
org_template_tool_use = ""
|
| 712 |
org_template_rag = ""
|
| 713 |
|
| 714 |
+
for config_file_name in (ModelFiles.CHAT_TEMPLATE_JSON, ModelFiles.TOKENIZER_CONFIG):
|
| 715 |
+
config_file = info.get(config_file_name, {})
|
| 716 |
+
org_config = config_file.get("data")
|
| 717 |
+
org_content = config_file.get("content")
|
| 718 |
|
| 719 |
+
if org_content and ("chat_template" in org_content or not org_template):
|
| 720 |
+
tokenizer_config = TokenizerConfig(org_content)
|
| 721 |
|
| 722 |
+
org_template = tokenizer_config.chat_templates.get("default") or ""
|
| 723 |
+
org_template_tool_use = tokenizer_config.chat_templates.get("tool_use") or ""
|
| 724 |
+
org_template_rag = tokenizer_config.chat_templates.get("rag") or ""
|
| 725 |
+
# org_template_inverse = tokenizer_config.inverse_template or ""
|
| 726 |
|
| 727 |
+
tokenizer_config.chat_templates["default"] = template
|
| 728 |
+
tokenizer_config.chat_templates["tool_use"] = template_tool_use
|
| 729 |
+
tokenizer_config.chat_templates["rag"] = template_rag
|
| 730 |
+
# tokenizer_config.inverse_template = template_inverse
|
| 731 |
|
| 732 |
+
new_config = tokenizer_config.json(get_json_indent(org_config))
|
| 733 |
+
if org_config.endswith("\n"):
|
| 734 |
+
new_config += "\n"
|
| 735 |
|
| 736 |
+
changes += [
|
| 737 |
+
(token if token[1] in ("-", "+", "@") else token[1:].replace("\t", "\u21e5").replace("\r\n", "\u240d\u240a\r\n").replace("\r", "\u240d\r").replace("\n", "\u240a\n"), token[0] if token[0] != " " else None) # .replace(" ", "\u2423")
|
| 738 |
+
for token in unified_diff(new_config.splitlines(keepends = True), org_config.splitlines(keepends = True), fromfile = config_file_name, tofile = config_file_name)
|
| 739 |
+
]
|
| 740 |
|
| 741 |
tokenizer_chat_template = info.get(ModelFiles.TOKENIZER_CHAT_TEMPLATE, {})
|
| 742 |
org_template = tokenizer_chat_template.get("data", org_template)
|
|
|
|
| 801 |
operations = []
|
| 802 |
pr_branch = branch if branch.startswith("refs/pr/") else None
|
| 803 |
|
| 804 |
+
chat_template_file = info.get(ModelFiles.CHAT_TEMPLATE_JSON, {})
|
| 805 |
+
if org_chat_template := chat_template_file.get("data"):
|
| 806 |
+
tokenizer_config = TokenizerConfig(chat_template_file.get("content"))
|
| 807 |
|
| 808 |
tokenizer_config.chat_templates["default"] = template
|
| 809 |
tokenizer_config.chat_templates["tool_use"] = template_tool_use
|
| 810 |
tokenizer_config.chat_templates["rag"] = template_rag
|
| 811 |
# tokenizer_config.inverse_template = template_inverse
|
| 812 |
|
| 813 |
+
new_chat_template = tokenizer_config.json(get_json_indent(org_chat_template))
|
| 814 |
+
if org_chat_template.endswith("\n"):
|
| 815 |
+
new_chat_template += "\n"
|
| 816 |
+
|
| 817 |
+
if org_chat_template != new_chat_template:
|
| 818 |
+
operations.append(CommitOperationAdd(ModelFiles.CHAT_TEMPLATE_JSON, new_chat_template.encode("utf-8")))
|
| 819 |
+
|
| 820 |
+
tokenizer_file = info.get(ModelFiles.TOKENIZER_CONFIG, {})
|
| 821 |
+
if org_config := tokenizer_file.get("data"):
|
| 822 |
+
tokenizer_content = tokenizer_file.get("content")
|
| 823 |
+
tokenizer_config = TokenizerConfig(tokenizer_content)
|
| 824 |
+
|
| 825 |
+
if "chat_template" in tokenizer_content or not chat_template_file:
|
| 826 |
+
tokenizer_config.chat_templates["default"] = template
|
| 827 |
+
tokenizer_config.chat_templates["tool_use"] = template_tool_use
|
| 828 |
+
tokenizer_config.chat_templates["rag"] = template_rag
|
| 829 |
+
# tokenizer_config.inverse_template = template_inverse
|
| 830 |
+
|
| 831 |
new_config = tokenizer_config.json(get_json_indent(org_config))
|
| 832 |
if org_config.endswith("\n"):
|
| 833 |
new_config += "\n"
|
|
|
|
| 870 |
|
| 871 |
info["parent_commit"] = commit.oid
|
| 872 |
|
| 873 |
+
if org_chat_template:
|
| 874 |
+
chat_template_file["data"] = new_chat_template
|
| 875 |
+
chat_template_file["content"] = json.loads(new_chat_template)
|
| 876 |
+
|
| 877 |
if org_config:
|
| 878 |
tokenizer_file["data"] = new_config
|
| 879 |
tokenizer_file["content"] = json.loads(new_config)
|
|
|
|
| 1007 |
"disabled": info.disabled,
|
| 1008 |
"gated": info.gated,
|
| 1009 |
"private": info.private,
|
| 1010 |
+
"chat_template": templates,
|
| 1011 |
}
|
| 1012 |
|
| 1013 |
template_messages = example_values[0][1]
|
|
|
|
| 1049 |
progress = gr.Progress(track_tqdm = True),
|
| 1050 |
oauth_token: gr.OAuthToken | None = None,
|
| 1051 |
):
|
| 1052 |
+
parent_commit = None
|
| 1053 |
write_access = False
|
| 1054 |
|
| 1055 |
if info and oauth_token:
|
|
|
|
| 1156 |
else:
|
| 1157 |
gr.Warning(f"No {ModelFiles.TOKENIZER_CONFIG} found in repository...")
|
| 1158 |
|
| 1159 |
+
if not info.get("chat_template"):
|
| 1160 |
+
try:
|
| 1161 |
+
chat_template_file = None
|
| 1162 |
+
if (hfapi.file_exists(
|
| 1163 |
+
repo,
|
| 1164 |
+
ModelFiles.CHAT_TEMPLATE_JSON,
|
| 1165 |
+
revision = branch,
|
| 1166 |
+
token = oauth_token.token if oauth_token else False,
|
| 1167 |
+
)):
|
| 1168 |
+
chat_template_file = hfapi.hf_hub_download(
|
| 1169 |
+
repo,
|
| 1170 |
+
ModelFiles.CHAT_TEMPLATE_JSON,
|
| 1171 |
+
revision = parent_commit or branch,
|
| 1172 |
+
token = oauth_token.token if oauth_token else False,
|
| 1173 |
+
)
|
| 1174 |
+
except Exception as e:
|
| 1175 |
+
pass
|
| 1176 |
+
else:
|
| 1177 |
+
if chat_template_file:
|
| 1178 |
+
with open(chat_template_file, "r", encoding = "utf-8") as fp:
|
| 1179 |
+
template_data = fp.read()
|
| 1180 |
+
template_content = json.loads(template_data)
|
| 1181 |
+
info[ModelFiles.CHAT_TEMPLATE_JSON] = {
|
| 1182 |
+
"data": template_data,
|
| 1183 |
+
"content": template_content,
|
| 1184 |
+
}
|
| 1185 |
+
info["chat_template"] = template_content.get("chat_template")
|
| 1186 |
+
|
| 1187 |
pr_details = None
|
| 1188 |
if branch and branch.startswith("refs/pr/"):
|
| 1189 |
pr_num = branch.split("/")[-1]
|
|
|
|
| 1216 |
pr_submit: gr.Button(
|
| 1217 |
value = f"Commit to PR #{pr_details.num}" if pr_details else "Create Pull Request",
|
| 1218 |
),
|
| 1219 |
+
chat_template: gr.skip() if ModelFiles.CHAT_TEMPLATE_JSON not in info else gr.Code(
|
| 1220 |
+
value = TokenizerConfig(info[ModelFiles.CHAT_TEMPLATE_JSON]["content"]).chat_templates.get("default"),
|
| 1221 |
+
),
|
| 1222 |
# inverse_template: gr.skip() if ModelFiles.TOKENIZER_INVERSE_TEMPLATE not in info else gr.Code(
|
| 1223 |
# value = info[ModelFiles.TOKENIZER_INVERSE_TEMPLATE]["data"],
|
| 1224 |
# ),
|
|
|
|
| 1285 |
pr_preview_title,
|
| 1286 |
pr_description,
|
| 1287 |
pr_submit,
|
| 1288 |
+
chat_template,
|
| 1289 |
# inverse_template,
|
| 1290 |
],
|
| 1291 |
show_api = False,
|