heerjtdev committed on
Commit
d5442f4
·
verified ·
1 Parent(s): 8e9da7d

Update train_hybrid.py

Browse files
Files changed (1) hide show
  1. train_hybrid.py +80 -34
train_hybrid.py CHANGED
@@ -80,7 +80,7 @@ class LayoutLMv3BiLSTMCRF(nn.Module):
80
  return self.crf.viterbi_decode(emissions, mask=attention_mask.bool())
81
 
82
  # -------------------------
83
- # 2. Data Processing (FIXED)
84
  # -------------------------
85
  class LayoutDataset(Dataset):
86
  def __init__(self, json_path, tokenizer, max_len=512):
@@ -91,7 +91,13 @@ class LayoutDataset(Dataset):
91
  self.max_len = max_len
92
  self.processed_data = []
93
 
94
- print(f"🔄 Preprocessing {len(data)} documents...")
 
 
 
 
 
 
95
 
96
  for item in data:
97
  if "data" in item:
@@ -105,41 +111,81 @@ class LayoutDataset(Dataset):
105
 
106
  if not words: continue
107
 
108
- # Normalize bboxes
109
- norm_bboxes = []
110
- for b in bboxes:
111
- x0, y0, x1, y1 = b
112
- norm_bboxes.append([
113
- max(0, min(1000, int(x0))),
114
- max(0, min(1000, int(y0))),
115
- max(0, min(1000, int(x1))),
116
- max(0, min(1000, int(y1)))
117
- ])
118
-
119
- # --- KEY FIX IS HERE ---
120
- # using text=words explicitly fixes the positional argument error
121
- encoding = self.tokenizer(
122
- text=words,
123
- boxes=norm_bboxes,
124
- padding="max_length",
125
- truncation=True,
126
- max_length=self.max_len,
127
- is_split_into_words=True,
128
- return_tensors="pt"
129
- )
130
 
131
- word_ids = encoding.word_ids(batch_index=0)
132
- label_ids = []
133
- for word_id in word_ids:
134
- if word_id is None:
135
- label_ids.append(LABEL2ID["O"])
136
- elif word_id < len(labels):
137
- label_ids.append(LABEL2ID.get(labels[word_id], LABEL2ID["O"]))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  else:
139
- label_ids.append(LABEL2ID["O"])
 
140
 
141
- item_dict = {key: val.squeeze(0) for key, val in encoding.items()}
142
- item_dict["labels"] = torch.tensor(label_ids)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  self.processed_data.append(item_dict)
144
 
145
  def __len__(self):
 
80
  return self.crf.viterbi_decode(emissions, mask=attention_mask.bool())
81
 
82
  # -------------------------
83
+ # 2. Data Processing (MANUAL ALIGNMENT FIX)
84
  # -------------------------
85
  class LayoutDataset(Dataset):
86
  def __init__(self, json_path, tokenizer, max_len=512):
 
91
  self.max_len = max_len
92
  self.processed_data = []
93
 
94
+ # Get special token IDs
95
+ self.cls_token_id = tokenizer.cls_token_id
96
+ self.sep_token_id = tokenizer.sep_token_id
97
+ self.pad_token_id = tokenizer.pad_token_id
98
+ self.unk_token_id = tokenizer.unk_token_id
99
+
100
+ print(f"🔄 Preprocessing {len(data)} documents (Manual Alignment Mode)...")
101
 
102
  for item in data:
103
  if "data" in item:
 
111
 
112
  if not words: continue
113
 
114
+ # 1. Initialize with [CLS]
115
+ input_ids = [self.cls_token_id]
116
+ final_bboxes = [[0, 0, 0, 0]]
117
+ label_ids = [LABEL2ID["O"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
+ # 2. Iterate word by word
120
+ for word, box, label_str in zip(words, bboxes, labels):
121
+ # Clamp bbox 0-1000
122
+ clamped_box = [
123
+ max(0, min(1000, int(box[0]))),
124
+ max(0, min(1000, int(box[1]))),
125
+ max(0, min(1000, int(box[2]))),
126
+ max(0, min(1000, int(box[3])))
127
+ ]
128
+
129
+ # Tokenize current word
130
+ word_tokens = tokenizer.tokenize(word)
131
+ if not word_tokens: continue # Skip empty/weird tokens
132
+
133
+ # Convert to IDs
134
+ word_sub_ids = tokenizer.convert_tokens_to_ids(word_tokens)
135
+
136
+ # Add to lists
137
+ input_ids.extend(word_sub_ids)
138
+
139
+ # Expand bbox to match number of sub-tokens
140
+ final_bboxes.extend([clamped_box] * len(word_sub_ids))
141
+
142
+ # Handle BIO Labels for sub-tokens
143
+ # First sub-token gets the B- tag (if applicable), others get I- tag
144
+ current_label_id = LABEL2ID.get(label_str, LABEL2ID["O"])
145
+
146
+ if label_str.startswith("B-"):
147
+ # Logic: First subtoken is B-X, rest are I-X
148
+ i_tag_str = "I-" + label_str[2:]
149
+ i_tag_id = LABEL2ID.get(i_tag_str, LABEL2ID["O"])
150
+
151
+ # First subtoken = Original B- tag
152
+ label_ids.append(current_label_id)
153
+ # Remaining subtokens = I- tag
154
+ label_ids.extend([i_tag_id] * (len(word_sub_ids) - 1))
155
  else:
156
+ # If it's O or I-X, just copy it to all subtokens
157
+ label_ids.extend([current_label_id] * len(word_sub_ids))
158
 
159
+ # 3. Truncate if too long (account for [SEP])
160
+ if len(input_ids) > self.max_len - 1:
161
+ input_ids = input_ids[:self.max_len - 1]
162
+ final_bboxes = final_bboxes[:self.max_len - 1]
163
+ label_ids = label_ids[:self.max_len - 1]
164
+
165
+ # 4. Add [SEP]
166
+ input_ids.append(self.sep_token_id)
167
+ final_bboxes.append([0, 0, 0, 0])
168
+ label_ids.append(LABEL2ID["O"])
169
+
170
+ # 5. Create Attention Mask
171
+ attention_mask = [1] * len(input_ids)
172
+
173
+ # 6. Pad to max_len
174
+ padding_length = self.max_len - len(input_ids)
175
+ if padding_length > 0:
176
+ input_ids += [self.pad_token_id] * padding_length
177
+ final_bboxes += [[0, 0, 0, 0]] * padding_length
178
+ label_ids += [LABEL2ID["O"]] * padding_length
179
+ attention_mask += [0] * padding_length
180
+
181
+ # 7. Convert to Tensors
182
+ item_dict = {
183
+ "input_ids": torch.tensor(input_ids, dtype=torch.long),
184
+ "bbox": torch.tensor(final_bboxes, dtype=torch.long),
185
+ "attention_mask": torch.tensor(attention_mask, dtype=torch.long),
186
+ "labels": torch.tensor(label_ids, dtype=torch.long)
187
+ }
188
+
189
  self.processed_data.append(item_dict)
190
 
191
  def __len__(self):