Sharjeelbaig committed · verified · commit 6008b40 · parent: 9f63d33

Upload folder using huggingface_hub
README.md CHANGED
@@ -0,0 +1,9 @@
+---
+license: apple-amlr
+license_name: apple-sample-code-license
+license_link: LICENSE
+---
+
+# OpenELM
+
+
config.json CHANGED
@@ -8,6 +8,7 @@
     "AutoModelForCausalLM": "modeling_openelm.OpenELMForCausalLM"
   },
   "bos_token_id": 1,
+  "dtype": "float16",
   "eos_token_id": 2,
   "ffn_dim_divisor": 256,
   "ffn_multipliers": [
@@ -78,11 +79,25 @@
     0.5,
     1.0
   ],
+  "quantization_config": {
+    "_load_in_4bit": true,
+    "_load_in_8bit": false,
+    "bnb_4bit_compute_dtype": "bfloat16",
+    "bnb_4bit_quant_storage": "uint8",
+    "bnb_4bit_quant_type": "nf4",
+    "bnb_4bit_use_double_quant": true,
+    "llm_int8_enable_fp32_cpu_offload": false,
+    "llm_int8_has_fp16_weight": false,
+    "llm_int8_skip_modules": null,
+    "llm_int8_threshold": 6.0,
+    "load_in_4bit": true,
+    "load_in_8bit": false,
+    "quant_method": "bitsandbytes"
+  },
   "rope_freq_constant": 10000,
   "rope_max_length": 4096,
   "share_input_output_layers": true,
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.39.3",
+  "transformers_version": "4.57.3",
   "use_cache": true,
   "vocab_size": 32000
 }
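The embedded quantization_config marks this checkpoint as a bitsandbytes NF4 4-bit export. For reference, a minimal sketch of the equivalent load-time configuration using the standard transformers BitsAndBytesConfig API (the repo id is the one used in test_proj/main.py below):

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# Mirrors the quantization_config block baked into config.json above:
# 4-bit NF4 storage, double quantization, bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoModelForCausalLM.from_pretrained(
    "Sharjeelbaig/apple-open_elm_4bit",
    quantization_config=bnb_config,
    trust_remote_code=True,
)

Because the config is already serialized into the checkpoint, passing it explicitly should only be needed when re-quantizing from full-precision weights.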
configuration_openelm.py CHANGED
@@ -316,3 +316,13 @@ class OpenELMConfig(PretrainedConfig):
         # check num_query_heads divisible by num_kv_heads for every layer
         for layer_idx in range(len(query_dims)):
             assert self.num_query_heads[layer_idx] % self.num_kv_heads[layer_idx] == 0
+
+    @property
+    def num_hidden_layers(self) -> int:
+        """Alias for num_transformer_layers for compatibility with transformers library."""
+        return self.num_transformer_layers
+
+    @num_hidden_layers.setter
+    def num_hidden_layers(self, value: int) -> None:
+        """Setter for num_hidden_layers that updates num_transformer_layers."""
+        self.num_transformer_layers = value
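Context for the new property pair: recent transformers internals read config.num_hidden_layers directly (for example when sizing per-layer cache structures), while OpenELM stores the layer count as num_transformer_layers, so the alias keeps both names in sync. A self-contained sketch of the same pattern (class name and default value here are illustrative, not from the repo):

class ConfigWithAlias:
    """Minimal stand-in showing the property/setter aliasing used above."""

    def __init__(self, num_transformer_layers: int = 12) -> None:
        self.num_transformer_layers = num_transformer_layers

    @property
    def num_hidden_layers(self) -> int:
        # Reads pass through to the canonical attribute.
        return self.num_transformer_layers

    @num_hidden_layers.setter
    def num_hidden_layers(self, value: int) -> None:
        # Writes update the canonical attribute, so both names stay consistent.
        self.num_transformer_layers = value

cfg = ConfigWithAlias(num_transformer_layers=16)
assert cfg.num_hidden_layers == 16
cfg.num_hidden_layers = 20
assert cfg.num_transformer_layers == 20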
generation_config.json ADDED
@@ -0,0 +1,6 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 1,
+  "eos_token_id": 2,
+  "transformers_version": "4.57.3"
+}
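These defaults are picked up automatically by model.generate(); they can also be inspected on their own via the standard GenerationConfig API, as in this minimal sketch:

from transformers import GenerationConfig

# Loads the generation defaults shipped with the checkpoint.
gen_config = GenerationConfig.from_pretrained("Sharjeelbaig/apple-open_elm_4bit")
print(gen_config.bos_token_id, gen_config.eos_token_id)  # expected: 1 2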
modeling_openelm.py CHANGED
@@ -661,7 +661,9 @@ class OpenELMModel(OpenELMPreTrainedModel):
 
         past_seen_tokens = 0
         if use_cache:  # kept for BC (cache positions)
-            if not isinstance(past_key_values, StaticCache):
+            if past_key_values is None:
+                past_key_values = DynamicCache()
+            elif not isinstance(past_key_values, (StaticCache, DynamicCache)):
                 past_key_values = DynamicCache.from_legacy_cache(past_key_values)
             past_seen_tokens = past_key_values.get_seq_length()
 
@@ -920,8 +922,8 @@ class OpenELMForCausalLM(OpenELMPreTrainedModel):
         if past_key_values is not None:
             if isinstance(past_key_values, Cache):
                 cache_length = past_key_values.get_seq_length()
-                past_length = past_key_values.seen_tokens
-                max_cache_length = past_key_values.get_max_length()
+                past_length = cache_length
+                max_cache_length = getattr(past_key_values, 'get_max_length', lambda: None)()
             else:
                 cache_length = past_length = past_key_values[0][0].shape[2]
                 max_cache_length = None
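Both hunks track Cache API changes in recent transformers releases: Cache.seen_tokens was removed in favor of get_seq_length(), and get_max_length() no longer exists on every Cache subclass, so the patched code falls back to None when the method is missing. A minimal sketch of that defensive pattern (assumes a transformers version where DynamicCache is importable from the top-level package):

from transformers import DynamicCache

cache = DynamicCache()  # empty cache: no tokens seen yet

cache_length = cache.get_seq_length()  # replaces the removed .seen_tokens
# Call get_max_length() only if this Cache subclass still provides it;
# otherwise fall back to None (a DynamicCache is unbounded anyway).
max_cache_length = getattr(cache, "get_max_length", lambda: None)()

print(cache_length, max_cache_length)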
test_proj/.gitignore ADDED
@@ -0,0 +1,10 @@
+# Python-generated files
+__pycache__/
+*.py[oc]
+build/
+dist/
+wheels/
+*.egg-info
+
+# Virtual environments
+.venv
test_proj/.python-version ADDED
@@ -0,0 +1 @@
+3.12
test_proj/README.md ADDED
File without changes
test_proj/main.py ADDED
@@ -0,0 +1,23 @@
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+model = AutoModelForCausalLM.from_pretrained(
+    "Sharjeelbaig/apple-open_elm_4bit",
+    trust_remote_code=True,
+)
+
+tokenizer = AutoTokenizer.from_pretrained(
+    "NousResearch/Llama-2-7b-chat-hf",
+    use_fast=False
+)
+
+prompt = "Hello, how are you?"
+inputs = tokenizer(prompt, return_tensors="pt")
+
+outputs = model.generate(
+    **inputs,
+    max_new_tokens=50,
+    do_sample=True,
+    temperature=0.7,
+)
+
+print(tokenizer.decode(outputs[0], skip_special_tokens=True))
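A note on the script: OpenELM checkpoints ship without a bundled tokenizer, which is why a Llama-2 tokenizer is loaded from a separate repo (Apple's OpenELM instructions likewise use the Llama-2 tokenizer). If the model is placed on an accelerator, e.g. by adding device_map="auto" to from_pretrained, the tokenized inputs must be moved to the model's device before generation; a minimal sketch of that adjustment, reusing the model and tokenizer above:

# BatchEncoding.to() moves all input tensors in one call.
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=50, do_sample=True, temperature=0.7)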
test_proj/pyproject.toml ADDED
@@ -0,0 +1,14 @@
+[project]
+name = "test-proj"
+version = "0.1.0"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.12"
+dependencies = [
+    "accelerate>=1.12.0",
+    "bitsandbytes>=0.49.0",
+    "protobuf>=6.33.2",
+    "sentencepiece>=0.2.1",
+    "torch>=2.9.1",
+    "transformers>=4.57.3",
+]
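The pins line up with the rest of the commit: transformers>=4.57.3 matches the transformers_version now written into config.json, and bitsandbytes plus accelerate are what from_pretrained needs to load the 4-bit weights. With uv, which this layout targets given the uv.lock below, running uv sync followed by uv run main.py should reproduce the example.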
test_proj/uv.lock ADDED
The diff for this file is too large to render.