Update model
- model.safetensors +1 -1
- train.py +19 -24
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e025ee17178aacd08b47329d567ed76144c9fe50f6bc41c0ecdbf3e2d97c69b2
 size 1583548940
train.py CHANGED
@@ -61,43 +61,38 @@ def load_and_process_data(data_dir: str) -> str:
 def prepare_dataset(text: str, tokenizer, max_length: int = 512):
     """
     Tokenize the text and create a dataset for training.
+    Preserves [BRK] tokens in the training data so the model can learn to generate them.
+    Splits by token count only, not by [BRK] boundaries.
 
     Args:
-        text: The concatenated text
+        text: The concatenated text with [BRK] tokens
         tokenizer: The tokenizer to use
         max_length: Maximum sequence length
 
     Returns:
         Dataset ready for training
     """
-    #
-    #
-
-
-
+    # Tokenize the entire text first to split by token count
+    # This preserves [BRK] tokens within chunks
+    print("Tokenizing full text...")
+    full_tokens = tokenizer(text, add_special_tokens=False, return_offsets_mapping=False)
+    input_ids = full_tokens['input_ids']
+
+    # Split into chunks of max_length tokens
+    # The tokenizer will add CLS and SEP tokens, so we use max_length directly
+    # and let truncation handle it, or we can be more precise
+    chunk_size = max_length - 2  # Reserve space for CLS and SEP tokens
     examples = []
-    current_chunk = ""
-
-    for segment in chunks:
-        # Try adding this segment
-        test_chunk = current_chunk + (" [BRK] " if current_chunk else "") + segment
-        tokens = tokenizer(test_chunk, truncation=True, max_length=max_length)
-        token_length = len(tokens['input_ids'])
-
-        if token_length >= max_length - 10:  # If close to max, save current chunk
-            if current_chunk:
-                examples.append(current_chunk)
-            current_chunk = segment
-        else:
-            current_chunk = test_chunk
 
-
-
-
+    for i in range(0, len(input_ids), chunk_size):
+        chunk_ids = input_ids[i:i + chunk_size]
+        # Decode back to text to preserve [BRK] tokens, then re-tokenize with special tokens
+        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=False)
+        examples.append(chunk_text)
 
     print(f"Created {len(examples)} training examples")
 
-    # Tokenize all examples
+    # Tokenize all examples with proper special tokens
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],