heerjtdev committed on
Commit
27b7a20
·
verified ·
1 Parent(s): 703b939

Rename Data_augmentation.py to train.py

Browse files
Files changed (2) hide show
  1. Data_augmentation.py +0 -105
  2. train.py +244 -0
Data_augmentation.py DELETED
@@ -1,105 +0,0 @@
1
- import json
2
- import random
3
- import os
4
-
5
# --- Configuration ---
# The name of the file to load and save to.
INPUT_FILE = "unified_training_data_bluuhhhhh.json"
# The maximum allowed deviation for the shift in the x and y directions.
MAX_SHIFT = 10
# The coordinate boundary limits (coordinates are assumed scaled 0-1000).
MAX_COORD = 1000
MIN_COORD = 0
# Number of augmented copies to create (1 doubles the original dataset size).
NUM_AUGMENTATION_COPIES = 1


def clip_coord(coord):
    """Clamp a single coordinate into the [MIN_COORD, MAX_COORD] interval."""
    if coord < MIN_COORD:
        return MIN_COORD
    if coord > MAX_COORD:
        return MAX_COORD
    return coord
21
-
22
-
23
def augment_data(data, shift_x, shift_y):
    """
    Shift every bounding box in *data* by the same (shift_x, shift_y) offset.

    One uniform translation is applied to the whole copy so the relative
    layout structure of the tokens is preserved. Returns a new list; input
    items are not mutated (each token gets a shallow copy whose 'bbox' is
    replaced with a freshly built, clipped list).
    """
    shifted_tokens = []

    for token in data:
        new_token = token.copy()
        box = new_token['bbox']  # [x_min, y_min, x_max, y_max]

        new_token['bbox'] = [
            clip_coord(box[0] + shift_x),  # x_min
            clip_coord(box[1] + shift_y),  # y_min
            clip_coord(box[2] + shift_x),  # x_max
            clip_coord(box[3] + shift_y),  # y_max
        ]
        shifted_tokens.append(new_token)

    return shifted_tokens
52
-
53
-
54
def process_dataset():
    """Load the dataset, append NUM_AUGMENTATION_COPIES shifted copies, save.

    Reads INPUT_FILE (a JSON list of token objects), creates uniformly
    shifted copies of every token's bbox (one random shift per copy, so
    relative layout is preserved), then writes original + augmented tokens
    back to the same file.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"Error: Input file '{INPUT_FILE}' not found.")
        print("Please ensure your uploaded JSON file is available and named correctly.")
        return

    print(f"Loading data from {INPUT_FILE}...")
    try:
        with open(INPUT_FILE, 'r') as f:
            original_data = json.load(f)  # expected: a list of token objects
    except json.JSONDecodeError:
        print(f"Error: Failed to decode JSON from '{INPUT_FILE}'. Check file format.")
        return
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return

    print(f"Original dataset size: {len(original_data)} tokens.")

    all_combined_data = original_data.copy()

    for i in range(NUM_AUGMENTATION_COPIES):
        # One random shift per copy: this is the core spatial-jittering step.
        shift_x = random.randint(-MAX_SHIFT, MAX_SHIFT)
        shift_y = random.randint(-MAX_SHIFT, MAX_SHIFT)

        print(f"\nCreating augmented copy #{i + 1} with uniform shift (X: {shift_x}, Y: {shift_y})...")

        all_combined_data.extend(augment_data(original_data, shift_x, shift_y))

    print(f"\nAugmentation complete. Total dataset size: {len(all_combined_data)} tokens.")

    # Persist the combined (original + augmented) data back to the same file.
    print(f"Saving combined data back to {INPUT_FILE}...")
    try:
        with open(INPUT_FILE, 'w') as f:
            json.dump(all_combined_data, f, indent=2)  # indent for readability
        print("Successfully updated the dataset with augmented data.")
    except Exception as e:
        print(f"An error occurred while writing the file: {e}")


if __name__ == "__main__":
    process_dataset()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
train.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import argparse
3
+ import os
4
+ import random
5
+ import torch
6
+ import torch.nn as nn
7
+ from torch.utils.data import Dataset, DataLoader, random_split
8
+ from transformers import LayoutLMv3TokenizerFast, LayoutLMv3Model
9
+ from TorchCRF import CRF
10
+ from torch.optim import AdamW
11
+ from tqdm import tqdm
12
+ from sklearn.metrics import precision_recall_fscore_support
13
+
14
# --- Configuration ---
# Upper bound for normalized bounding-box coordinates.
# NOTE(review): defined but the clamping code elsewhere hardcodes 1000 —
# keep the two in sync (or reference this constant there).
MAX_BBOX_DIMENSION = 1000
# Maximum absolute shift (in the 0-1000 coordinate space) applied per axis
# when jittering a sample during augmentation.
MAX_SHIFT = 30
# Number of augmented copies produced per original sample.
AUGMENTATION_FACTOR = 1
# Hugging Face model id used for both the tokenizer and the backbone weights.
BASE_MODEL_ID = "microsoft/layoutlmv3-base"
19
+
20
+ # -------------------------
21
+ # Step 1: Preprocessing
22
+ # -------------------------
23
def preprocess_labelstudio(input_path, output_path):
    """Convert a Label Studio export into token/BIO-label/bbox records.

    Reads the export at *input_path*, clamps every bbox into the 0-1000
    coordinate space, projects each span annotation onto word-level BIO
    tags (first matching word window only), and writes the processed list
    to *output_path*, which is returned.
    """
    with open(input_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    processed = []
    print(f"🔄 Starting preprocessing of {len(data)} documents...")

    for doc in data:
        words = doc["data"]["original_words"]
        labels = ["O"] * len(words)

        # Clamp every coordinate into [0, 1000] and force min <= max.
        clamped_bboxes = []
        for raw_box in doc["data"]["original_bboxes"]:
            x_min, y_min, x_max, y_max = raw_box
            clamped = [max(0, min(c, 1000)) for c in (x_min, y_min, x_max, y_max)]
            if clamped[0] > clamped[2]:
                clamped[0] = clamped[2]
            if clamped[1] > clamped[3]:
                clamped[1] = clamped[3]
            clamped_bboxes.append(clamped)

        # Project span annotations onto word-level BIO tags.
        for ann in doc.get("annotations", []):
            for res in ann["result"]:
                if "value" not in res or "labels" not in res["value"]:
                    continue
                needle = res["value"]["text"].split()
                tag = res["value"]["labels"][0]
                # Only the first matching word window is tagged.
                for start in range(len(words) - len(needle) + 1):
                    if words[start:start + len(needle)] == needle:
                        labels[start] = f"B-{tag}"
                        for offset in range(1, len(needle)):
                            labels[start + offset] = f"I-{tag}"
                        break

        processed.append({"tokens": words, "labels": labels, "bboxes": clamped_bboxes})

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(processed, f, indent=2, ensure_ascii=False)
    return output_path
65
+
66
+ # -------------------------
67
+ # Step 1.5: Augmentation
68
+ # -------------------------
69
def translate_bbox(bbox, shift_x, shift_y):
    """Shift a [x_min, y_min, x_max, y_max] box and clamp it into [0, 1000]."""
    x_min, y_min, x_max, y_max = bbox
    pairs = ((x_min, shift_x), (y_min, shift_y), (x_max, shift_x), (y_max, shift_y))
    return [min(1000, max(0, value + delta)) for value, delta in pairs]
76
+
77
def augment_sample(sample):
    """Return a shallow copy of *sample* whose bboxes are all translated by
    one random uniform (dx, dy) shift drawn from [-MAX_SHIFT, MAX_SHIFT].

    A single shift per sample keeps the relative layout intact; the input
    sample is not mutated because 'bboxes' is rebuilt as a new list.
    """
    dx = random.randint(-MAX_SHIFT, MAX_SHIFT)
    dy = random.randint(-MAX_SHIFT, MAX_SHIFT)
    jittered = sample.copy()
    jittered["bboxes"] = [translate_bbox(box, dx, dy) for box in sample["bboxes"]]
    return jittered
83
+
84
def augment_and_save_dataset(input_json_path, output_json_path):
    """Read samples, interleave AUGMENTATION_FACTOR jittered copies after
    each original, and write the combined list to *output_json_path*.

    Returns *output_json_path* so callers can chain the pipeline steps.
    """
    with open(input_json_path, 'r', encoding="utf-8") as f:
        samples = json.load(f)

    combined = []
    for sample in samples:
        combined.append(sample)
        combined.extend(augment_sample(sample) for _ in range(AUGMENTATION_FACTOR))

    with open(output_json_path, 'w', encoding="utf-8") as f:
        json.dump(combined, f, indent=2, ensure_ascii=False)
    return output_json_path
95
+
96
+ # -------------------------
97
+ # Step 2: Dataset Class
98
+ # -------------------------
99
class LayoutDataset(Dataset):
    """Token-classification dataset over the preprocessed JSON records.

    Each record holds parallel lists: "tokens" (words), "bboxes"
    (0-1000 scaled [x_min, y_min, x_max, y_max] boxes) and "labels"
    (word-level BIO tags).
    """

    def __init__(self, json_path, tokenizer, label2id, max_len=512):
        # json_path: file produced by the preprocess/augment steps.
        # tokenizer: a fast tokenizer (word_ids() requires the fast variant).
        # label2id: BIO tag -> integer id mapping; must contain "O".
        with open(json_path, "r", encoding="utf-8") as f:
            self.data = json.load(f)
        self.tokenizer = tokenizer
        self.label2id = label2id
        self.max_len = max_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Tokenize one record and align word-level BIO labels to subtokens.

        Fix over the previous version: a word can be split into several
        subtokens, and tagging every piece with the word's "B-…" label
        produced invalid BIO sequences (repeated B- tags inside one
        entity). Now only the first subtoken of a word keeps "B-…";
        continuation subtokens get the matching "I-…" tag.
        """
        item = self.data[idx]
        words, bboxes, labels = item["tokens"], item["bboxes"], item["labels"]
        encodings = self.tokenizer(words, boxes=bboxes, padding="max_length", truncation=True, max_length=self.max_len, return_tensors="pt")
        word_ids = encodings.word_ids(batch_index=0)
        label_ids = []
        previous_word_id = None
        for word_id in word_ids:
            if word_id is None:
                # Special tokens ([CLS]/[SEP]/padding) are tagged "O" because
                # the CRF loss cannot skip positions the way -100 would.
                label_ids.append(self.label2id["O"])
            else:
                label = labels[word_id]
                # Continuation subtoken of the same word: B-X -> I-X.
                if word_id == previous_word_id and label.startswith("B-"):
                    label = "I-" + label[2:]
                # Unknown tags fall back to "O", as before.
                label_ids.append(self.label2id.get(label, self.label2id["O"]))
            previous_word_id = word_id
        encodings["labels"] = torch.tensor(label_ids)
        return {key: val.squeeze(0) for key, val in encodings.items()}
123
+
124
+ # -------------------------
125
+ # Step 3: Model Architecture (Non-Linear Head)
126
+ # -------------------------
127
+
128
class LayoutLMv3CRF(nn.Module):
    """LayoutLMv3 backbone + non-linear MLP emission head + CRF layer."""

    def __init__(self, num_labels):
        super().__init__()
        # Backbone starts from the base pretrained weights only.
        print(f"🔄 Initializing backbone from {BASE_MODEL_ID}...")
        self.layoutlm = LayoutLMv3Model.from_pretrained(BASE_MODEL_ID)

        hidden = self.layoutlm.config.hidden_size

        # Deeper-than-linear head: Linear -> GELU -> LayerNorm -> Dropout -> Linear.
        # LayerNorm adds stability when the head is trained from scratch.
        self.classifier = nn.Sequential(
            nn.Linear(hidden, hidden),
            nn.GELU(),
            nn.LayerNorm(hidden),
            nn.Dropout(0.1),
            nn.Linear(hidden, num_labels),
        )

        self.crf = CRF(num_labels)

    def forward(self, input_ids, bbox, attention_mask, labels=None):
        """With labels: return the mean negative CRF log-likelihood (loss).
        Without labels: return the Viterbi-decoded tag sequences."""
        hidden_states = self.layoutlm(
            input_ids=input_ids, bbox=bbox, attention_mask=attention_mask
        ).last_hidden_state

        # Emission scores from the non-linear head.
        emissions = self.classifier(hidden_states)
        mask = attention_mask.bool()

        if labels is None:
            return self.crf.viterbi_decode(emissions, mask=mask)
        return -self.crf(emissions, labels, mask=mask).mean()
161
+
162
+ # -------------------------
163
+ # Step 4: Training + Evaluation
164
+ # -------------------------
165
def train_one_epoch(model, dataloader, optimizer, device):
    """Run one optimization pass over *dataloader*; return the mean batch loss."""
    model.train()
    running_loss = 0.0
    for batch in tqdm(dataloader, desc="Training"):
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        # "labels" is popped so the remaining keys match the model's inputs.
        targets = batch.pop("labels")
        optimizer.zero_grad()
        loss = model(**batch, labels=targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataloader)
177
+
178
def evaluate(model, dataloader, device, id2label):
    """Run CRF decoding over *dataloader* and return micro-averaged
    (precision, recall, f1) over all non-padded token positions.

    NOTE(review): *id2label* is accepted but never used here; metrics are
    computed directly on integer label ids. Micro-averaging also counts
    the dominant "O" class — confirm this is the intended metric.
    """
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch.pop("labels").cpu().numpy()
            # Without labels the model returns decoded tag-id sequences
            # (one list per batch element) rather than a loss.
            preds = model(**batch)
            for p, l, mask in zip(preds, labels, batch["attention_mask"].cpu().numpy()):
                valid = mask == 1
                l_valid = l[valid].tolist()
                all_labels.extend(l_valid)
                # assumes the decoder emits at least len(l_valid) tags per
                # sequence (decoding only masked-in positions) — TODO confirm
                all_preds.extend(p[:len(l_valid)])
    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average="micro", zero_division=0)
    return precision, recall, f1
193
+
194
+ # -------------------------
195
+ # Step 5: Main Execution
196
+ # -------------------------
197
def main(args):
    """End-to-end pipeline: preprocess -> augment -> train -> evaluate -> save."""
    labels = ["O", "B-QUESTION", "I-QUESTION", "B-OPTION", "I-OPTION", "B-ANSWER", "I-ANSWER", "B-SECTION_HEADING", "I-SECTION_HEADING", "B-PASSAGE", "I-PASSAGE"]
    label2id = {tag: idx for idx, tag in enumerate(labels)}
    id2label = {idx: tag for tag, idx in label2id.items()}

    temp_dir = "temp_intermediate_files"
    os.makedirs(temp_dir, exist_ok=True)

    # Convert the Label Studio export to BIO records, then add jittered copies.
    bio_json = os.path.join(temp_dir, "data_bio.json")
    preprocess_labelstudio(args.input, bio_json)
    aug_json = os.path.join(temp_dir, "data_aug.json")
    final_data_path = augment_and_save_dataset(bio_json, aug_json)

    # Data pipeline: tokenize, then hold out 20% for validation.
    tokenizer = LayoutLMv3TokenizerFast.from_pretrained(BASE_MODEL_ID)
    dataset = LayoutDataset(final_data_path, tokenizer, label2id, max_len=args.max_len)
    val_size = int(0.2 * len(dataset))
    train_dataset, val_dataset = random_split(dataset, [len(dataset) - val_size, val_size])

    train_loader = DataLoader(train_dataset, batch_size=args.batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=args.batch_size)

    # Model + optimizer on the best available device.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = LayoutLMv3CRF(num_labels=len(labels)).to(device)
    optimizer = AdamW(model.parameters(), lr=args.lr)

    # Training loop: report mean loss and validation micro-F1 each epoch.
    for epoch in range(args.epochs):
        loss = train_one_epoch(model, train_loader, optimizer, device)
        p, r, f1 = evaluate(model, val_loader, device, id2label)
        print(f"Epoch {epoch+1} | Loss: {loss:.4f} | F1: {f1:.3f}")

    # Persist the final weights.
    os.makedirs("checkpoints", exist_ok=True)
    torch.save(model.state_dict(), "checkpoints/layoutlmv3_nonlinear_scratch.pth")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", type=str, default="train")  # NOTE(review): currently unused
    parser.add_argument("--input", type=str, required=True)
    parser.add_argument("--batch_size", type=int, default=4)
    parser.add_argument("--epochs", type=int, default=10)
    parser.add_argument("--lr", type=float, default=2e-5)
    parser.add_argument("--max_len", type=int, default=512)
    main(parser.parse_args())