Spaces:

heerjtdev
/

hybrid_train

Sleeping

App Files Files Community

heerjtdev committed on Jan 22

Commit

9e890f3

verified ·

1 Parent(s): 62ffe2c

Update train_hybrid.py

Browse files

Files changed (1) hide show

train_hybrid.py +9 -11

train_hybrid.py CHANGED Viewed

@@ -91,7 +91,7 @@ class LayoutLMv3BiLSTMCRF(nn.Module):
             return self.crf.viterbi_decode(emissions, mask=attention_mask.bool())
 # -------------------------
-# 2. Data Processing
 # -------------------------
 class LayoutDataset(Dataset):
     def __init__(self, json_path, tokenizer, max_len=512):
@@ -109,21 +109,18 @@ class LayoutDataset(Dataset):
             if "data" in item:
                 words = item["data"].get("original_words", [])
                 bboxes = item["data"].get("original_bboxes", [])
-                # If labels aren't pre-processed, you might need your conversion logic here.
-                # Assuming the JSON input already has word-aligned labels or we create dummy ones
                 labels = item.get("labels", ["O"] * len(words))
             else:
-                # Fallback or generic format
                 words = item.get("tokens", [])
                 bboxes = item.get("bboxes", [])
                 labels = item.get("labels", [])
             if not words: continue
-            # Normalize bboxes to 0-1000 if not already
             norm_bboxes = []
             for b in bboxes:
-                # Simple clamping 0-1000
                 x0, y0, x1, y1 = b
                 norm_bboxes.append([
                     max(0, min(1000, int(x0))),
@@ -132,14 +129,16 @@ class LayoutDataset(Dataset):
                     max(0, min(1000, int(y1)))
                 ])
-            # Tokenize
             encoding = self.tokenizer(
-                words,
                 boxes=norm_bboxes,
                 padding="max_length",
                 truncation=True,
                 max_length=self.max_len,
-                is_split_into_words=True,
                 return_tensors="pt"
             )
@@ -148,7 +147,7 @@ class LayoutDataset(Dataset):
             label_ids = []
             for word_id in word_ids:
                 if word_id is None:
-                    label_ids.append(LABEL2ID["O"]) # Pad/Special tokens are O
                 elif word_id < len(labels):
                     label_ids.append(LABEL2ID.get(labels[word_id], LABEL2ID["O"]))
                 else:
@@ -163,7 +162,6 @@ class LayoutDataset(Dataset):
     def __getitem__(self, idx):
         return self.processed_data[idx]
 # -------------------------
 # 3. Training Function
 # -------------------------

             return self.crf.viterbi_decode(emissions, mask=attention_mask.bool())
 # -------------------------
+# 2. Data Processing (FIXED)
 # -------------------------
 class LayoutDataset(Dataset):
     def __init__(self, json_path, tokenizer, max_len=512):
             if "data" in item:
                 words = item["data"].get("original_words", [])
                 bboxes = item["data"].get("original_bboxes", [])
+                # Handle missing labels gracefully
                 labels = item.get("labels", ["O"] * len(words))
             else:
                 words = item.get("tokens", [])
                 bboxes = item.get("bboxes", [])
                 labels = item.get("labels", [])
             if not words: continue
+            # Normalize bboxes to 0-1000
             norm_bboxes = []
             for b in bboxes:
                 x0, y0, x1, y1 = b
                 norm_bboxes.append([
                     max(0, min(1000, int(x0))),
                     max(0, min(1000, int(y1)))
                 ])
+            # --- THE FIX IS HERE ---
+            # 1. Use 'text=' keyword argument
+            # 2. Ensure 'is_split_into_words=True' is passed explicitly
             encoding = self.tokenizer(
+                text=words,              # <--- Changed from positional to keyword
                 boxes=norm_bboxes,
                 padding="max_length",
                 truncation=True,
                 max_length=self.max_len,
+                is_split_into_words=True, # This tells it 'words' is a list of strings
                 return_tensors="pt"
             )
             label_ids = []
             for word_id in word_ids:
                 if word_id is None:
+                    label_ids.append(LABEL2ID["O"])
                 elif word_id < len(labels):
                     label_ids.append(LABEL2ID.get(labels[word_id], LABEL2ID["O"]))
                 else:
     def __getitem__(self, idx):
         return self.processed_data[idx]
 # -------------------------
 # 3. Training Function
 # -------------------------