Spaces:

heerjtdev
/

LayoutLM_train

Running

aagamjtdev committed on Oct 27, 2025

Commit

ee11b08

1 Parent(s): caca294

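correction: load the LayoutLMv3 backbone via the `model_name` argument and use "microsoft/layoutlmv3-base" (instead of "heerjtdev/edugenius") for both tokenizer and model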

Files changed (1) hide show

HF_LayoutLM_with_Passage.py CHANGED Viewed

@@ -203,8 +203,8 @@ class LayoutDataset(Dataset):
 class LayoutLMv3CRF(nn.Module):
     def __init__(self, model_name, num_labels):
         super().__init__()
-        # self.layoutlm = LayoutLMv3Model.from_pretrained(model_name)
-        self.layoutlm = LayoutLMv3Model.from_pretrained("heerjtdev/edugenius")
         self.dropout = nn.Dropout(0.1)
         self.classifier = nn.Linear(self.layoutlm.config.hidden_size, num_labels)
         self.crf = CRF(num_labels)
@@ -302,9 +302,9 @@ def main(args):
     # 3. Load and split augmented dataset
     print("\n--- START PHASE: MODEL/DATASET SETUP ---")
-    MODEL_ID = "heerjtdev/edugenius"
-    # tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
-    tokenizer = LayoutLMv3TokenizerFast.from_pretrained(MODEL_ID)
     dataset = LayoutDataset(final_data_path, tokenizer, label2id, max_len=args.max_len)
     val_size = int(0.2 * len(dataset))
@@ -320,8 +320,8 @@ def main(args):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {device}")
     # Num_labels is based on the updated 'labels' list
-    # model = LayoutLMv3CRF("microsoft/layoutlmv3-base", num_labels=len(labels)).to(device)
-    model = LayoutLMv3CRF(MODEL_ID, num_labels=len(labels)).to(device)
     ckpt_path = "checkpoints/layoutlmv3_crf_passage.pth"
     os.makedirs("checkpoints", exist_ok=True)
     if os.path.exists(ckpt_path):

 class LayoutLMv3CRF(nn.Module):
     def __init__(self, model_name, num_labels):
         super().__init__()
+        self.layoutlm = LayoutLMv3Model.from_pretrained(model_name)
+        # self.layoutlm = LayoutLMv3Model.from_pretrained("heerjtdev/edugenius")
         self.dropout = nn.Dropout(0.1)
         self.classifier = nn.Linear(self.layoutlm.config.hidden_size, num_labels)
         self.crf = CRF(num_labels)
     # 3. Load and split augmented dataset
     print("\n--- START PHASE: MODEL/DATASET SETUP ---")
+    #MODEL_ID = "heerjtdev/edugenius"
+    tokenizer = LayoutLMv3TokenizerFast.from_pretrained("microsoft/layoutlmv3-base")
+    #tokenizer = LayoutLMv3TokenizerFast.from_pretrained(MODEL_ID)
     dataset = LayoutDataset(final_data_path, tokenizer, label2id, max_len=args.max_len)
     val_size = int(0.2 * len(dataset))
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"Using device: {device}")
     # Num_labels is based on the updated 'labels' list
+    model = LayoutLMv3CRF("microsoft/layoutlmv3-base", num_labels=len(labels)).to(device)
+    # model = LayoutLMv3CRF(MODEL_ID, num_labels=len(labels)).to(device)
     ckpt_path = "checkpoints/layoutlmv3_crf_passage.pth"
     os.makedirs("checkpoints", exist_ok=True)
     if os.path.exists(ckpt_path):