Spaces:

heerjtdev
/

hybrid_train

Sleeping

App Files Files Community

heerjtdev committed on Jan 22

Commit

8e9da7d

verified ·

1 Parent(s): 9e890f3

Update train_hybrid.py

Browse files

Files changed (1) hide show

train_hybrid.py +17 -47

train_hybrid.py CHANGED Viewed

@@ -14,15 +14,13 @@ import numpy as np
 try:
     from TorchCRF import CRF
 except ImportError:
-    print("❌ Error: 'TorchCRF' not found")
     exit()
 # --- Configuration ---
-# We use the base model for the backbone
 BASE_MODEL_ID = "microsoft/layoutlmv3-base"
 MAX_LEN = 512
-# Labels from your BiLSTM script
 LABELS = [
     "O",
     "B-QUESTION", "I-QUESTION",
@@ -43,48 +41,39 @@ class LayoutLMv3BiLSTMCRF(nn.Module):
         super().__init__()
         print(f"🏗️ Initializing Hybrid Model: LayoutLMv3 + BiLSTM + CRF...")
-        # 1. Backbone: LayoutLMv3 (Replaces Word Emb + CharCNN + Spatial Features)
         self.layoutlm = LayoutLMv3Model.from_pretrained(BASE_MODEL_ID)
-        # LayoutLMv3-base hidden size is 768
         transformer_output_size = self.layoutlm.config.hidden_size
         # 2. Middle: Bi-LSTM
-        # Takes the 768 vectors from Transformer and models sequence
         self.lstm = nn.LSTM(
             input_size=transformer_output_size,
             hidden_size=hidden_dim,
-            num_layers=2, # Stacked LSTM for depth
             bidirectional=True,
             batch_first=True,
             dropout=0.1
         )
         # 3. Head: Linear Projection
-        # Input is hidden_dim * 2 (because bidirectional)
         self.classifier = nn.Linear(hidden_dim * 2, num_labels)
         # 4. Decoder: CRF
         self.crf = CRF(num_labels)
     def forward(self, input_ids, bbox, attention_mask, labels=None):
-        # Step A: Get Contextual Embeddings from LayoutLM
-        # outputs.last_hidden_state shape: (Batch, Seq_Len, 768)
         outputs = self.layoutlm(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
         sequence_output = outputs.last_hidden_state
-        # Step B: Pass through Bi-LSTM
-        # lstm_output shape: (Batch, Seq_Len, hidden_dim * 2)
         lstm_output, _ = self.lstm(sequence_output)
-        # Step C: Project to Tag Space
-        # emissions shape: (Batch, Seq_Len, num_labels)
         emissions = self.classifier(lstm_output)
-        # Step D: CRF Loss or Decoding
         if labels is not None:
-            # We must use the attention_mask so CRF doesn't train on padding tokens
-            # Returns negative log likelihood
             log_likelihood = self.crf(emissions, labels, mask=attention_mask.bool())
             return -log_likelihood.mean()
         else:
@@ -105,11 +94,9 @@ class LayoutDataset(Dataset):
         print(f"🔄 Preprocessing {len(data)} documents...")
         for item in data:
-            # Handle Label Studio JSON format
             if "data" in item:
                 words = item["data"].get("original_words", [])
                 bboxes = item["data"].get("original_bboxes", [])
-                # Handle missing labels gracefully
                 labels = item.get("labels", ["O"] * len(words))
             else:
                 words = item.get("tokens", [])
@@ -118,7 +105,7 @@ class LayoutDataset(Dataset):
             if not words: continue
-            # Normalize bboxes to 0-1000
             norm_bboxes = []
             for b in bboxes:
                 x0, y0, x1, y1 = b
@@ -129,20 +116,18 @@ class LayoutDataset(Dataset):
                     max(0, min(1000, int(y1)))
                 ])
-            # --- THE FIX IS HERE ---
-            # 1. Use 'text=' keyword argument
-            # 2. Ensure 'is_split_into_words=True' is passed explicitly
             encoding = self.tokenizer(
-                text=words,              # <--- Changed from positional to keyword
                 boxes=norm_bboxes,
                 padding="max_length",
                 truncation=True,
                 max_length=self.max_len,
-                is_split_into_words=True, # This tells it 'words' is a list of strings
                 return_tensors="pt"
             )
-            # Align labels with subtokens
             word_ids = encoding.word_ids(batch_index=0)
             label_ids = []
             for word_id in word_ids:
@@ -162,6 +147,7 @@ class LayoutDataset(Dataset):
     def __getitem__(self, idx):
         return self.processed_data[idx]
 # -------------------------
 # 3. Training Function
 # -------------------------
@@ -169,22 +155,15 @@ def train_one_epoch(model, dataloader, optimizer, device):
     model.train()
     total_loss = 0
     for batch in tqdm(dataloader, desc="Training"):
-        # Move batch to device
         input_ids = batch["input_ids"].to(device)
         bbox = batch["bbox"].to(device)
         attention_mask = batch["attention_mask"].to(device)
         labels = batch["labels"].to(device)
         optimizer.zero_grad()
-        # Forward pass (Auto-calculates CRF loss inside model)
         loss = model(input_ids, bbox, attention_mask, labels=labels)
         loss.backward()
-        # Gradient clipping (Important for LSTM stability)
         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
         optimizer.step()
         total_loss += loss.item()
@@ -194,17 +173,12 @@ def train_one_epoch(model, dataloader, optimizer, device):
 # 4. Main Execution
 # -------------------------
 def main(args):
-    # Setup Device
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"⚙️ Using device: {device}")
-    # Initialize Tokenizer
     tokenizer = LayoutLMv3TokenizerFast.from_pretrained(BASE_MODEL_ID)
-    # Load Dataset
     dataset = LayoutDataset(args.input, tokenizer, max_len=args.max_len)
-    # Train/Val Split
     train_size = int(0.9 * len(dataset))
     val_size = len(dataset) - train_size
     train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
@@ -212,14 +186,11 @@ def main(args):
     train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
     val_loader = DataLoader(val_dataset, batch_size=args.batch_size)
-    # Initialize Hybrid Model
     model = LayoutLMv3BiLSTMCRF(num_labels=len(LABELS)).to(device)
-    # Optimization
-    # We use different learning rates: lower for transformer, higher for LSTM/CRF head
     optimizer = AdamW([
-        {'params': model.layoutlm.parameters(), 'lr': 2e-5}, # Low LR for backbone
-        {'params': model.lstm.parameters(), 'lr': 1e-4},     # Higher LR for LSTM
         {'params': model.classifier.parameters(), 'lr': 1e-4},
         {'params': model.crf.parameters(), 'lr': 1e-4}
     ])
@@ -230,7 +201,6 @@ def main(args):
         loss = train_one_epoch(model, train_loader, optimizer, device)
         print(f"Epoch {epoch+1}/{args.epochs} | Loss: {loss:.4f}")
-        # Save Checkpoint
         os.makedirs("checkpoints", exist_ok=True)
         save_path = "checkpoints/layoutlmv3_bilstm_crf_hybrid.pth"
         torch.save(model.state_dict(), save_path)
@@ -241,8 +211,8 @@ if __name__ == "__main__":
     parser.add_argument("--input", type=str, required=True, help="Path to unified JSON data")
     parser.add_argument("--batch_size", type=int, default=4)
     parser.add_argument("--epochs", type=int, default=5)
-    parser.add_argument("--lr", type=float, default=2e-5) # Base LR
     parser.add_argument("--max_len", type=int, default=512)
-    parser.add_argument("--mode", type=str, default="train") # Kept for compatibility with Gradio
     args = parser.parse_args()
     main(args)

 try:
     from TorchCRF import CRF
 except ImportError:
     # The `TorchCRF` module comes from the PyPI package of the same name.
     # `pytorch-crf` installs a differently named module (`torchcrf`), so it
     # would not satisfy the import above.
     print("❌ Error: 'TorchCRF' not found. Install via: pip install TorchCRF")
     # Exit with a non-zero status so callers (shell, subprocess) see the failure;
     # SystemExit avoids relying on the site-provided `exit()` helper.
     raise SystemExit(1)
 # --- Configuration ---
 BASE_MODEL_ID = "microsoft/layoutlmv3-base"
 MAX_LEN = 512
 LABELS = [
     "O",
     "B-QUESTION", "I-QUESTION",
         super().__init__()
         print(f"🏗️ Initializing Hybrid Model: LayoutLMv3 + BiLSTM + CRF...")
+        # 1. Backbone: LayoutLMv3
         self.layoutlm = LayoutLMv3Model.from_pretrained(BASE_MODEL_ID)
         transformer_output_size = self.layoutlm.config.hidden_size
         # 2. Middle: Bi-LSTM
         self.lstm = nn.LSTM(
             input_size=transformer_output_size,
             hidden_size=hidden_dim,
+            num_layers=2,
             bidirectional=True,
             batch_first=True,
             dropout=0.1
         )
         # 3. Head: Linear Projection
         self.classifier = nn.Linear(hidden_dim * 2, num_labels)
         # 4. Decoder: CRF
         self.crf = CRF(num_labels)
     def forward(self, input_ids, bbox, attention_mask, labels=None):
+        # Step A: LayoutLMv3
         outputs = self.layoutlm(input_ids=input_ids, bbox=bbox, attention_mask=attention_mask)
         sequence_output = outputs.last_hidden_state
+        # Step B: Bi-LSTM
         lstm_output, _ = self.lstm(sequence_output)
+        # Step C: Projection
         emissions = self.classifier(lstm_output)
+        # Step D: CRF
         if labels is not None:
             log_likelihood = self.crf(emissions, labels, mask=attention_mask.bool())
             return -log_likelihood.mean()
         else:
         print(f"🔄 Preprocessing {len(data)} documents...")
         for item in data:
             if "data" in item:
                 words = item["data"].get("original_words", [])
                 bboxes = item["data"].get("original_bboxes", [])
                 labels = item.get("labels", ["O"] * len(words))
             else:
                 words = item.get("tokens", [])
             if not words: continue
+            # Normalize bboxes
             norm_bboxes = []
             for b in bboxes:
                 x0, y0, x1, y1 = b
                     max(0, min(1000, int(y1)))
                 ])
+            # --- KEY FIX IS HERE ---
+            # using text=words explicitly fixes the positional argument error
             encoding = self.tokenizer(
+                text=words,
                 boxes=norm_bboxes,
                 padding="max_length",
                 truncation=True,
                 max_length=self.max_len,
+                is_split_into_words=True,
                 return_tensors="pt"
             )
             word_ids = encoding.word_ids(batch_index=0)
             label_ids = []
             for word_id in word_ids:
     def __getitem__(self, idx):
         return self.processed_data[idx]
 # -------------------------
 # 3. Training Function
 # -------------------------
     model.train()
     total_loss = 0
     for batch in tqdm(dataloader, desc="Training"):
         input_ids = batch["input_ids"].to(device)
         bbox = batch["bbox"].to(device)
         attention_mask = batch["attention_mask"].to(device)
         labels = batch["labels"].to(device)
         optimizer.zero_grad()
         loss = model(input_ids, bbox, attention_mask, labels=labels)
         loss.backward()
         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
         optimizer.step()
         total_loss += loss.item()
 # 4. Main Execution
 # -------------------------
 def main(args):
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"⚙️ Using device: {device}")
     tokenizer = LayoutLMv3TokenizerFast.from_pretrained(BASE_MODEL_ID)
     dataset = LayoutDataset(args.input, tokenizer, max_len=args.max_len)
     train_size = int(0.9 * len(dataset))
     val_size = len(dataset) - train_size
     train_dataset, val_dataset = torch.utils.data.random_split(dataset, [train_size, val_size])
     train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
     val_loader = DataLoader(val_dataset, batch_size=args.batch_size)
     model = LayoutLMv3BiLSTMCRF(num_labels=len(LABELS)).to(device)
     optimizer = AdamW([
+        {'params': model.layoutlm.parameters(), 'lr': 2e-5},
+        {'params': model.lstm.parameters(), 'lr': 1e-4},
         {'params': model.classifier.parameters(), 'lr': 1e-4},
         {'params': model.crf.parameters(), 'lr': 1e-4}
     ])
         loss = train_one_epoch(model, train_loader, optimizer, device)
         print(f"Epoch {epoch+1}/{args.epochs} | Loss: {loss:.4f}")
         os.makedirs("checkpoints", exist_ok=True)
         save_path = "checkpoints/layoutlmv3_bilstm_crf_hybrid.pth"
         torch.save(model.state_dict(), save_path)
     parser.add_argument("--input", type=str, required=True, help="Path to unified JSON data")
     parser.add_argument("--batch_size", type=int, default=4)
     parser.add_argument("--epochs", type=int, default=5)
+    parser.add_argument("--lr", type=float, default=2e-5)
     parser.add_argument("--max_len", type=int, default=512)
+    parser.add_argument("--mode", type=str, default="train")
     args = parser.parse_args()
     main(args)