Spaces:

heerjtdev
/

MLP_LayoutLMTrain

Sleeping

App Files Files Community

heerjtdev committed 24 days ago

Commit

7225749

verified ·

1 Parent(s): 27b7a20

Upload LSTM_datset_converter.py

Browse files

Files changed (1) hide show

LSTM_datset_converter.py +103 -0

LSTM_datset_converter.py ADDED Viewed

	@@ -0,0 +1,103 @@

+import json
+import os
+from typing import List, Dict, Any
def _find_token_span(words: List[str], labels: List[str], snippet_tokens: List[str]) -> int:
    """
    Return the start index of ``snippet_tokens`` inside ``words``, or -1 if absent.

    An occurrence whose tokens are all still labelled 'O' is preferred, so that
    several annotations sharing the same text are assigned to successive
    occurrences instead of all overwriting the first one. When every occurrence
    is already labelled, the first occurrence is returned (the later annotation
    then overwrites it).
    """
    span = len(snippet_tokens)
    first_match = -1
    for start in range(len(words) - span + 1):
        if words[start:start + span] != snippet_tokens:
            continue
        if all(label == "O" for label in labels[start:start + span]):
            return start
        if first_match < 0:
            first_match = start
    return first_match


def load_and_align_unified_data(input_json_path: str, output_json_path: str) -> str:
    """
    Convert a Label Studio JSON export into a flat token-level training file.

    Each item of the input list is expected to carry pre-extracted tokens in
    ``data.original_words`` and their boxes in ``data.original_bboxes``. The
    labelled text snippets found in the first annotation's ``result`` list are
    whitespace-tokenized, located in the word list and tagged with the B-I-O
    scheme (``B-<TAG>`` on the first token, ``I-<TAG>`` on the rest). Tokens
    not covered by any snippet keep the label ``"O"``.

    Items lacking words, bboxes or annotations are skipped with a message.
    Results without labels, without text, or with blank text are ignored.

    Args:
        input_json_path: Path to the Label Studio export (a JSON list).
        output_json_path: Path where the unified JSON is written.

    Returns:
        ``output_json_path``, unchanged, for convenient chaining.

    Raises:
        FileNotFoundError: If ``input_json_path`` does not exist.
    """
    if not os.path.exists(input_json_path):
        raise FileNotFoundError(f"Input JSON file not found at: {input_json_path}")
    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The training file is one flat token sequence, so tokens of every
    # document are appended to the same list.
    flat_output: List[Dict[str, Any]] = []
    for item in data:
        # 1. Pre-extracted tokens and bboxes live in the 'data' field.
        item_data = item.get("data") or {}
        words = item_data.get("original_words", [])
        bboxes = item_data.get("original_bboxes", [])
        annotations = item.get("annotations")
        if not words or not bboxes or not annotations:
            print(f"Skipping item {item.get('id', 'N/A')}: Missing words, bboxes, or annotations.")
            continue
        if len(words) != len(bboxes):
            # zip() below truncates to the shorter list; make that visible.
            print(
                f"Warning: item {item.get('id', 'N/A')} has {len(words)} words but "
                f"{len(bboxes)} bboxes; unmatched entries are dropped."
            )

        # Every token starts as 'O' (Outside).
        labels = ["O"] * len(words)

        # 2. Only the first annotation of each item is used.
        results = annotations[0].get("result") or []

        # 3. Alignment: match each labelled snippet against the token list.
        for res in results:
            value = res.get("value") or {}
            tag_names = value.get("labels")
            text_snippet = value.get("text")
            if not tag_names or not isinstance(text_snippet, str):
                continue
            # Simple whitespace split; this must agree with the tokenization
            # that produced 'original_words' for a match to be found.
            snippet_tokens = text_snippet.split()
            if not snippet_tokens:
                # A blank snippet would otherwise "match" at index 0.
                continue
            tag = tag_names[0].upper()  # e.g., 'QUESTION'

            start = _find_token_span(words, labels, snippet_tokens)
            if start < 0:
                continue
            labels[start] = f"B-{tag}"
            for offset in range(1, len(snippet_tokens)):
                labels[start + offset] = f"I-{tag}"

        # 4. Emit one record per token.
        flat_output.extend(
            {"token": word, "label": label, "bbox": bbox}
            for word, label, bbox in zip(words, labels, bboxes)
        )

    # Save the final Unified JSON
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(flat_output, f, indent=2, ensure_ascii=False)
    print(f"✅ Alignment successful. Unified training data saved to: {output_json_path}")
    print(f"Total aligned tokens: {len(flat_output)}")
    return output_json_path
+# ==============================================================================
if __name__ == '__main__':
    # Script entry point: convert one Label Studio export into the unified
    # token-level JSON and print the path to feed to the training script.

    # --- Configuration ---
    # ⚠️ 1. Set the path to your uploaded Label Studio output JSON file
    INPUT_FILE = "project-6-at-2026-01-21-07-10-460e552c.json"
    # 2. Set the path for the output file (This is your Unified JSON Path).
    #    It is placed inside OUTPUT_DIR so the directory created below is
    #    actually the one that receives the result.
    OUTPUT_DIR = "output_data"
    OUTPUT_FILE = os.path.join(OUTPUT_DIR, "unified_training_data_bluuhhhhh.json")
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    try:
        # Run the alignment
        unified_path = load_and_align_unified_data(INPUT_FILE, OUTPUT_FILE)
    except Exception as e:
        print(f"\n❌ An error occurred during alignment: {e}")
        # Exit non-zero so calling shells/pipelines can detect the failure.
        raise SystemExit(1) from e
    print("\nReady for Training! Use this path in your fixed training script:")
    print(f"UNIFIED_DATA_PATH = \"{unified_path}\"")