metricv committed
Commit b6c9920 · verified · 1 Parent(s): 923f236

Update model

Files changed (2)
  1. model.safetensors +1 -1
  2. train.py +19 -24
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:f41af4fd9a5dba8810b248a9e43f79615949bf3ab9f5c22a7b121d43bd21106f
+oid sha256:e025ee17178aacd08b47329d567ed76144c9fe50f6bc41c0ecdbf3e2d97c69b2
 size 1583548940
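
The pointer swap above only changes the Git LFS oid; the actual weights live in LFS storage. As a sanity check, a minimal sketch (assuming the file has already been fetched with git lfs pull and sits at ./model.safetensors) is to hash the local file and compare it against the new oid:

import hashlib

# Expected digest taken from the updated pointer above; the local path is an assumption.
EXPECTED_OID = "e025ee17178aacd08b47329d567ed76144c9fe50f6bc41c0ecdbf3e2d97c69b2"

sha256 = hashlib.sha256()
with open("model.safetensors", "rb") as f:
    # Hash in 1 MiB blocks so the ~1.6 GB file is not loaded into memory at once
    for block in iter(lambda: f.read(1 << 20), b""):
        sha256.update(block)

if sha256.hexdigest() == EXPECTED_OID:
    print("model.safetensors matches the LFS pointer")
else:
    print("mismatch: local file does not correspond to this commit")
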
train.py CHANGED
@@ -61,43 +61,38 @@ def load_and_process_data(data_dir: str) -> str:
 def prepare_dataset(text: str, tokenizer, max_length: int = 512):
     """
     Tokenize the text and create a dataset for training.
+    Preserves [BRK] tokens in the training data so the model can learn to generate them.
+    Splits by token count only, not by [BRK] boundaries.

     Args:
-        text: The concatenated text
+        text: The concatenated text with [BRK] tokens
         tokenizer: The tokenizer to use
         max_length: Maximum sequence length

     Returns:
         Dataset ready for training
     """
-    # Split text into chunks that fit within max_length
-    # We'll use [BRK] as a natural boundary
-    chunks = text.split(" [BRK] ")
-
-    # Tokenize and create examples
+    # Tokenize the entire text first to split by token count
+    # This preserves [BRK] tokens within chunks
+    print("Tokenizing full text...")
+    full_tokens = tokenizer(text, add_special_tokens=False, return_offsets_mapping=False)
+    input_ids = full_tokens['input_ids']
+
+    # Split into chunks of max_length tokens
+    # The tokenizer will add CLS and SEP tokens, so we use max_length directly
+    # and let truncation handle it, or we can be more precise
+    chunk_size = max_length - 2  # Reserve space for CLS and SEP tokens
     examples = []
-    current_chunk = ""
-
-    for segment in chunks:
-        # Try adding this segment
-        test_chunk = current_chunk + (" [BRK] " if current_chunk else "") + segment
-        tokens = tokenizer(test_chunk, truncation=True, max_length=max_length)
-        token_length = len(tokens['input_ids'])
-
-        if token_length >= max_length - 10:  # If close to max, save current chunk
-            if current_chunk:
-                examples.append(current_chunk)
-            current_chunk = segment
-        else:
-            current_chunk = test_chunk

-    # Add the last chunk
-    if current_chunk:
-        examples.append(current_chunk)
+    for i in range(0, len(input_ids), chunk_size):
+        chunk_ids = input_ids[i:i + chunk_size]
+        # Decode back to text to preserve [BRK] tokens, then re-tokenize with special tokens
+        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=False)
+        examples.append(chunk_text)

     print(f"Created {len(examples)} training examples")

-    # Tokenize all examples
+    # Tokenize all examples with proper special tokens
     def tokenize_function(examples):
         return tokenizer(
             examples["text"],