barakplasma committed on
Commit
3bb2de3
·
verified ·
1 Parent(s): ac2e9b6

Upload scripts/bundle_litertlm.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. scripts/bundle_litertlm.py +23 -100
scripts/bundle_litertlm.py CHANGED
@@ -33,106 +33,29 @@ from litert_lm.runtime.proto import (
33
  )
34
 
35
 
36
- # TranslateGemma 4B IT Jinja chat template (from tokenizer_config.json)
37
- # Requires structured content: {type, source_lang_code, target_lang_code, text}
38
- TRANSLATE_GEMMA_JINJA_TEMPLATE = """\
39
- {%- set languages = {
40
- "aa": "Afar", "ab": "Abkhazian", "af": "Afrikaans", "ak": "Akan",
41
- "am": "Amharic", "an": "Aragonese", "ar": "Arabic", "as": "Assamese",
42
- "az": "Azerbaijani", "ba": "Bashkir", "be": "Belarusian", "bg": "Bulgarian",
43
- "bm": "Bambara", "bn": "Bengali", "bo": "Tibetan", "br": "Breton",
44
- "bs": "Bosnian", "ca": "Catalan", "ce": "Chechen", "co": "Corsican",
45
- "cs": "Czech", "cv": "Chuvash", "cy": "Welsh", "da": "Danish",
46
- "de": "German", "dv": "Divehi", "dz": "Dzongkha", "ee": "Ewe",
47
- "el": "Greek", "en": "English", "eo": "Esperanto", "es": "Spanish",
48
- "et": "Estonian", "eu": "Basque", "fa": "Persian", "ff": "Fulah",
49
- "fi": "Finnish", "fo": "Faroese", "fr": "French", "fy": "Western Frisian",
50
- "ga": "Irish", "gd": "Scottish Gaelic", "gl": "Galician", "gn": "Guarani",
51
- "gu": "Gujarati", "gv": "Manx", "ha": "Hausa", "he": "Hebrew",
52
- "hi": "Hindi", "hr": "Croatian", "ht": "Haitian", "hu": "Hungarian",
53
- "hy": "Armenian", "ia": "Interlingua", "id": "Indonesian", "ig": "Igbo",
54
- "ii": "Sichuan Yi", "ik": "Inupiaq", "io": "Ido", "is": "Icelandic",
55
- "it": "Italian", "iu": "Inuktitut", "ja": "Japanese", "jv": "Javanese",
56
- "ka": "Georgian", "ki": "Kikuyu", "kk": "Kazakh", "kl": "Kalaallisut",
57
- "km": "Central Khmer", "kn": "Kannada", "ko": "Korean", "ks": "Kashmiri",
58
- "ku": "Kurdish", "kw": "Cornish", "ky": "Kyrgyz", "la": "Latin",
59
- "lb": "Luxembourgish", "lg": "Ganda", "ln": "Lingala", "lo": "Lao",
60
- "lt": "Lithuanian", "lu": "Luba-Katanga", "lv": "Latvian", "mg": "Malagasy",
61
- "mi": "Maori", "mk": "Macedonian", "ml": "Malayalam", "mn": "Mongolian",
62
- "mr": "Marathi", "ms": "Malay", "mt": "Maltese", "my": "Burmese",
63
- "nb": "Norwegian Bokmål", "nd": "North Ndebele", "ne": "Nepali",
64
- "nl": "Dutch", "nn": "Norwegian Nynorsk", "no": "Norwegian",
65
- "nr": "South Ndebele", "nv": "Navajo", "ny": "Chichewa", "oc": "Occitan",
66
- "om": "Oromo", "or": "Oriya", "os": "Ossetian", "pa": "Punjabi",
67
- "pl": "Polish", "ps": "Pashto", "pt": "Portuguese", "qu": "Quechua",
68
- "rm": "Romansh", "rn": "Rundi", "ro": "Romanian", "ru": "Russian",
69
- "rw": "Kinyarwanda", "sa": "Sanskrit", "sc": "Sardinian", "sd": "Sindhi",
70
- "se": "Northern Sami", "sg": "Sango", "si": "Sinhala", "sk": "Slovak",
71
- "sl": "Slovenian", "sn": "Shona", "so": "Somali", "sq": "Albanian",
72
- "sr": "Serbian", "ss": "Swati", "st": "Southern Sotho", "su": "Sundanese",
73
- "sv": "Swedish", "sw": "Swahili", "ta": "Tamil", "te": "Telugu",
74
- "tg": "Tajik", "th": "Thai", "ti": "Tigrinya", "tk": "Turkmen",
75
- "tl": "Tagalog", "tn": "Tswana", "to": "Tonga", "tr": "Turkish",
76
- "ts": "Tsonga", "tt": "Tatar", "ug": "Uyghur", "uk": "Ukrainian",
77
- "ur": "Urdu", "uz": "Uzbek", "ve": "Venda", "vi": "Vietnamese",
78
- "vo": "Volapük", "wa": "Walloon", "wo": "Wolof", "xh": "Xhosa",
79
- "yi": "Yiddish", "yo": "Yoruba", "za": "Zhuang", "zh": "Chinese",
80
- "zu": "Zulu"
81
- } -%}
82
- {{ bos_token }}
83
- {%- if (messages[0]['role'] != 'user') -%}
84
- {{ raise_exception("Conversations must start with a user prompt.") }}
85
- {%- endif -%}
86
- {%- for message in messages -%}
87
- {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
88
- {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
89
- {%- endif -%}
90
- {%- if (message['role'] == 'assistant') -%}
91
- {%- if message['content'] is none or message['content'] is not string -%}
92
- {{ raise_exception("Assistant role must provide content as a string") }}
93
- {%- endif -%}
94
- {{ '<start_of_turn>model\\n'}}
95
- {{ message["content"] | trim }}
96
- {%- elif (message['role'] == 'user') -%}
97
- {%- set content = message["content"] if message["content"] is string else message["content"][0] -%}
98
- {%- if content is string -%}
99
- {%- set source_lang = "English" -%}
100
- {%- set source_lang_code = "en" -%}
101
- {%- set target_lang = "Spanish" -%}
102
- {%- set target_lang_code = "es" -%}
103
- {{ '<start_of_turn>user\\nYou are a professional ' + source_lang + ' (' + source_lang_code + ') to ' +
104
- target_lang + ' (' + target_lang_code + ') translator. Your goal is to accurately convey the meaning ' +
105
- 'and nuances of the original ' + source_lang + ' text while adhering to ' + target_lang + ' grammar, ' +
106
- 'vocabulary, and cultural sensitivities.\\n' +
107
- 'Produce only the ' + target_lang + ' translation, without any additional explanations or ' +
108
- 'commentary. Please translate the following ' + source_lang + ' text into ' + target_lang + ':\\n\\n\\n' +
109
- content | trim
110
- }}
111
- {%- else -%}
112
- {%- set source_lang_code = content["source_lang_code"] | replace("_", "-") -%}
113
- {%- set source_lang = languages.get(source_lang_code, source_lang_code) -%}
114
- {%- set target_lang_code = content["target_lang_code"] | replace("_", "-") -%}
115
- {%- set target_lang = languages.get(target_lang_code, target_lang_code) -%}
116
- {{ '<start_of_turn>user\\nYou are a professional ' + source_lang + ' (' + source_lang_code + ') to ' +
117
- target_lang + ' (' + target_lang_code + ') translator. Your goal is to accurately convey the meaning ' +
118
- 'and nuances of the original ' + source_lang + ' text while adhering to ' + target_lang + ' grammar, ' +
119
- 'vocabulary, and cultural sensitivities.\\n'
120
- }}
121
- {%- if content["type"] == 'text' -%}
122
- {{ 'Produce only the ' + target_lang + ' translation, without any additional explanations or ' +
123
- 'commentary. Please translate the following ' + source_lang + ' text into ' + target_lang + ':\\n\\n\\n' +
124
- content["text"] | trim
125
- }}
126
- {%- endif -%}
127
- {%- endif -%}
128
- {%- else -%}
129
- {{ raise_exception("Conversations must only contain user or assistant roles.") }}
130
- {%- endif -%}
131
- {{ '<end_of_turn>\\n' }}
132
- {%- endfor -%}
133
- {%- if add_generation_prompt -%}
134
- {{'<start_of_turn>model\\n'}}
135
- {%- endif -%}"""
136
 
137
 
138
  def build_llm_metadata_proto(max_tokens: int) -> bytes:
 
33
  )
34
 
35
 
36
+ # Simple Jinja template compatible with LiteRT-LM runtime (no .get(), no complex tests).
37
+ # Handles plain text input from Google AI Edge Gallery.
38
+ # Uses a simplified, language-agnostic version of the TranslateGemma prompt: no source or target language is named, so there is no built-in en→es default.
39
+ # Users who need other language pairs should prefix their message with the pair,
40
+ # e.g. "Translate English to French:\n\nHello"
41
+ TRANSLATE_GEMMA_JINJA_TEMPLATE = \
42
+ "{{ bos_token }}" \
43
+ "{% for message in messages %}" \
44
+ "{% if message['role'] == 'user' %}" \
45
+ "<start_of_turn>user\n" \
46
+ "You are a professional translator. " \
47
+ "Produce only the translation of the following text, without any additional explanations or commentary:\n\n\n" \
48
+ "{{ message['content'] | trim }}" \
49
+ "<end_of_turn>\n" \
50
+ "{% elif message['role'] == 'assistant' %}" \
51
+ "<start_of_turn>model\n" \
52
+ "{{ message['content'] | trim }}" \
53
+ "<end_of_turn>\n" \
54
+ "{% endif %}" \
55
+ "{% endfor %}" \
56
+ "{% if add_generation_prompt %}" \
57
+ "<start_of_turn>model\n" \
58
+ "{% endif %}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
 
60
 
61
  def build_llm_metadata_proto(max_tokens: int) -> bytes: