Update model
- model.safetensors +1 -1
- train.py +19 -24
model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e025ee17178aacd08b47329d567ed76144c9fe50f6bc41c0ecdbf3e2d97c69b2
 size 1583548940
train.py CHANGED
@@ -61,43 +61,38 @@ def load_and_process_data(data_dir: str) -> str:
 def prepare_dataset(text: str, tokenizer, max_length: int = 512):
     """
     Tokenize the text and create a dataset for training.
+    Preserves [BRK] tokens in the training data so the model can learn to generate them.
+    Splits by token count only, not by [BRK] boundaries.
 
     Args:
-        text: The concatenated text
+        text: The concatenated text with [BRK] tokens
         tokenizer: The tokenizer to use
         max_length: Maximum sequence length
 
     Returns:
         Dataset ready for training
     """
-    #
-    #
-
-
-
+    # Tokenize the entire text first to split by token count
+    # This preserves [BRK] tokens within chunks
+    print("Tokenizing full text...")
+    full_tokens = tokenizer(text, add_special_tokens=False, return_offsets_mapping=False)
+    input_ids = full_tokens['input_ids']
+
+    # Split into chunks of max_length tokens
+    # The tokenizer will add CLS and SEP tokens, so we use max_length directly
+    # and let truncation handle it, or we can be more precise
+    chunk_size = max_length - 2  # Reserve space for CLS and SEP tokens
     examples = []
-    current_chunk = ""
-
-    for segment in chunks:
-        # Try adding this segment
-        test_chunk = current_chunk + (" [BRK] " if current_chunk else "") + segment
-        tokens = tokenizer(test_chunk, truncation=True, max_length=max_length)
-        token_length = len(tokens['input_ids'])
-
-        if token_length >= max_length - 10:  # If close to max, save current chunk
-            if current_chunk:
-                examples.append(current_chunk)
-            current_chunk = segment
-        else:
-            current_chunk = test_chunk
 
-
-
-
+    for i in range(0, len(input_ids), chunk_size):
+        chunk_ids = input_ids[i:i + chunk_size]
+        # Decode back to text to preserve [BRK] tokens, then re-tokenize with special tokens
+        chunk_text = tokenizer.decode(chunk_ids, skip_special_tokens=False)
+        examples.append(chunk_text)
 
     print(f"Created {len(examples)} training examples")
 
-    # Tokenize all examples
+    # Tokenize all examples with proper special tokens
    def tokenize_function(examples):
        return tokenizer(
            examples["text"],