heerjtdev commited on
Commit
7225749
·
verified ·
1 Parent(s): 27b7a20

Upload LSTM_datset_converter.py

Browse files
Files changed (1) hide show
  1. LSTM_datset_converter.py +103 -0
LSTM_datset_converter.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from typing import List, Dict, Any
4
+
5
+
6
def load_and_align_unified_data(input_json_path: str, output_json_path: str) -> str:
    """
    Load a Label Studio JSON export (with pre-extracted words and bboxes) and
    align the span-level labels to the token list, producing a flat token-level
    (token, label, bbox) training file (the "Unified JSON").

    Args:
        input_json_path: Path to the Label Studio export. Each item is expected
            to carry ``data.original_words`` / ``data.original_bboxes`` and at
            least one entry under ``annotations``.
        output_json_path: Where the aligned Unified JSON is written.

    Returns:
        ``output_json_path``, for convenient chaining into a training script.

    Raises:
        FileNotFoundError: If ``input_json_path`` does not exist.
    """
    if not os.path.exists(input_json_path):
        raise FileNotFoundError(f"Input JSON file not found at: {input_json_path}")

    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The Label Studio output is assumed to be a list where each item is a
    # document/page.
    processed_documents = []

    for item in data:
        # 1. Pre-extracted tokens and bboxes live in the 'data' field. Use
        #    .get on 'data' itself so a malformed item is skipped by the guard
        #    below instead of raising KeyError.
        item_data = item.get("data", {})
        words = item_data.get("original_words", [])
        bboxes = item_data.get("original_bboxes", [])

        if not words or not bboxes or not item.get("annotations"):
            print(f"Skipping item {item.get('id', 'N/A')}: Missing words, bboxes, or annotations.")
            continue

        if len(bboxes) < len(words):
            # zip() below would silently drop the trailing tokens; make that
            # data problem visible instead of hiding it.
            print(f"Warning: item {item.get('id', 'N/A')} has fewer bboxes than words; "
                  f"trailing tokens will be dropped.")

        # Initialize every token to 'O' (Outside).
        labels = ["O"] * len(words)

        # 2. Character-offset annotations come from the first annotation set.
        annotations = item["annotations"][0].get("result", [])

        # 3. Alignment: match each labeled snippet against the token list.
        for res in annotations:
            value = res.get("value", {})
            # Require both keys: a result with 'labels' but no 'text' used to
            # raise KeyError.
            if "labels" not in value or "text" not in value:
                continue

            # Tokenize the snippet with a simple space split (must match the
            # tokenization that produced 'original_words' for alignment to work).
            text_tokens = value["text"].split()
            if not text_tokens:
                # An empty snippet would otherwise match at i=0 (words[0:0] == [])
                # and mislabel the first token.
                continue

            tag = value["labels"][0].upper()  # e.g., 'QUESTION'
            span = len(text_tokens)

            # Find the first index where the token sequence matches the snippet.
            for i in range(len(words) - span + 1):
                if words[i:i + span] == text_tokens:
                    # Apply the B-I-O scheme; range() already guarantees
                    # i + span <= len(labels), so no extra bounds check needed.
                    labels[i] = f"B-{tag}"
                    for j in range(1, span):
                        labels[i + j] = f"I-{tag}"
                    break  # Found the match, move to the next annotation.

        # 4. Construct the token-level output structure for this document.
        document_tokens = [
            {"token": word, "label": label, "bbox": bbox}
            for word, label, bbox in zip(words, labels, bboxes)
        ]
        processed_documents.append(document_tokens)

    # Flatten into a single sequence: the downstream 'load_unified_data'
    # training helper expects a flat list of token dicts.
    flat_output = [token for doc in processed_documents for token in doc]

    # Save the final Unified JSON.
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(flat_output, f, indent=2, ensure_ascii=False)

    print(f"✅ Alignment successful. Unified training data saved to: {output_json_path}")
    print(f"Total aligned tokens: {len(flat_output)}")
    return output_json_path
82
+
83
+
84
# ==============================================================================

if __name__ == '__main__':
    # --- Configuration ---
    # ⚠️ 1. Set the path to your uploaded Label Studio output JSON file.
    INPUT_FILE = "project-6-at-2026-01-21-07-10-460e552c.json"

    # 2. Output path (your Unified JSON path). The file is placed inside
    #    'output_data' so the makedirs below actually serves a purpose —
    #    previously the directory was created but the file was written to
    #    the current working directory.
    os.makedirs("output_data", exist_ok=True)
    OUTPUT_FILE = os.path.join("output_data", "unified_training_data_bluuhhhhh.json")

    try:
        # Run the alignment.
        unified_path = load_and_align_unified_data(INPUT_FILE, OUTPUT_FILE)
        print("\nReady for Training! Use this path in your fixed training script:")
        print(f"UNIFIED_DATA_PATH = \"{unified_path}\"")
    except Exception as e:
        # Top-level script boundary: report the failure instead of crashing
        # with a traceback.
        print(f"\n❌ An error occurred during alignment: {e}")