Spaces:
Sleeping
Sleeping
Upload LSTM_datset_converter.py
Browse files- LSTM_datset_converter.py +103 -0
LSTM_datset_converter.py
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
import os
|
| 3 |
+
from typing import List, Dict, Any
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def _find_token_run(tokens: List[str], run: List[str]) -> int:
    """Return the index of the first occurrence of *run* in *tokens*, or -1.

    An empty *run* never matches; otherwise an empty slice would compare equal
    at index 0 and the caller would label an unrelated token.
    """
    width = len(run)
    if width == 0:
        return -1
    for start in range(len(tokens) - width + 1):
        if tokens[start:start + width] == run:
            return start
    return -1


def _bio_labels(words: List[str], results: List[Dict[str, Any]]) -> List[str]:
    """Build one B-/I-/O label per word from Label Studio result entries.

    Each usable result contributes its first label name (upper-cased) as the
    tag. Results without a "value", without label names, or without a string
    "text" are ignored.
    """
    labels = ["O"] * len(words)

    for res in results:
        value = res.get("value") or {}
        tag_names = value.get("labels")
        snippet = value.get("text")
        if not tag_names or not isinstance(snippet, str):
            continue

        tag = tag_names[0].upper()  # e.g. 'QUESTION'

        # Whitespace split; this only aligns when 'original_words' was
        # produced with a compatible tokenization.
        snippet_tokens = snippet.split()

        # NOTE: matching is by token text, so only the FIRST occurrence of the
        # snippet in the document is labelled, and a later result that matches
        # the same tokens overwrites the earlier labels.
        start = _find_token_run(words, snippet_tokens)
        if start < 0:
            continue

        end = start + len(snippet_tokens)
        labels[start] = f"B-{tag}"
        labels[start + 1:end] = [f"I-{tag}"] * (end - start - 1)

    return labels


def load_and_align_unified_data(input_json_path: str, output_json_path: str) -> str:
    """
    Convert a Label Studio JSON export into a flat token-level training file.

    The input is read as a list of items. For each item the words and bounding
    boxes come from item["data"]["original_words"] and
    item["data"]["original_bboxes"], and the labelled spans come from the
    "result" list of the item's first annotation. Every labelled text snippet
    is whitespace-tokenized and matched against the word list; matching words
    receive B-<TAG>/I-<TAG> labels and all other words stay "O".

    Items with no words, no bboxes, or no annotations are skipped with a
    printed message. All remaining tokens, across all items, are written to
    *output_json_path* as one flat JSON list of
    {"token": ..., "label": ..., "bbox": ...} objects.

    Args:
        input_json_path: Path of the Label Studio export to read.
        output_json_path: Path of the unified JSON file to write.

    Returns:
        *output_json_path*, unchanged.

    Raises:
        FileNotFoundError: If *input_json_path* does not exist.
    """
    if not os.path.exists(input_json_path):
        raise FileNotFoundError(f"Input JSON file not found at: {input_json_path}")

    with open(input_json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # The training side consumes a single flat token sequence, so tokens from
    # every document are appended to the same list.
    flat_output: List[Dict[str, Any]] = []

    for item in data:
        # 1. Pre-extracted tokens and bboxes live in the 'data' field.
        item_data = item["data"]
        words = item_data.get("original_words", [])
        bboxes = item_data.get("original_bboxes", [])

        if not words or not bboxes or not item.get("annotations"):
            print(f"Skipping item {item.get('id', 'N/A')}: Missing words, bboxes, or annotations.")
            continue

        if len(words) != len(bboxes):
            # zip() below stops at the shorter list; make the data loss visible.
            print(
                f"Warning: item {item.get('id', 'N/A')} has {len(words)} words "
                f"but {len(bboxes)} bboxes; unmatched entries are dropped."
            )

        # 2. Only the first annotation of each item is used.
        results = item["annotations"][0].get("result") or []

        # 3. Align labelled snippets to the token list.
        labels = _bio_labels(words, results)

        # 4. Emit one record per (word, label, bbox) triple.
        flat_output.extend(
            {"token": word, "label": label, "bbox": bbox}
            for word, label, bbox in zip(words, labels, bboxes)
        )

    # Save the final Unified JSON
    with open(output_json_path, "w", encoding="utf-8") as f:
        json.dump(flat_output, f, indent=2, ensure_ascii=False)

    print(f"✅ Alignment successful. Unified training data saved to: {output_json_path}")
    print(f"Total aligned tokens: {len(flat_output)}")
    return output_json_path
|
| 82 |
+
|
| 83 |
+
|
| 84 |
+
# ==============================================================================
|
| 85 |
+
|
| 86 |
+
if __name__ == '__main__':
    # Script entry point: convert one Label Studio export into the unified
    # token-level JSON and report where it was written.

    # --- Configuration ---
    # ⚠️ 1. Path to the Label Studio output JSON file to convert.
    INPUT_FILE = "project-6-at-2026-01-21-07-10-460e552c.json"

    # 2. Path of the output file (this is the Unified JSON path).
    OUTPUT_FILE = "unified_training_data_bluuhhhhh.json"

    # NOTE(review): this directory is created, but OUTPUT_FILE above is a bare
    # filename and is not written inside it — confirm whether the output was
    # meant to live under "output_data".
    os.makedirs("output_data", exist_ok=True)

    try:
        # Run the alignment
        unified_path = load_and_align_unified_data(INPUT_FILE, OUTPUT_FILE)
    except Exception as e:
        print(f"\n❌ An error occurred during alignment: {e}")
        # Exit non-zero so shells and pipelines can detect the failure.
        raise SystemExit(1) from e
    else:
        print("\nReady for Training! Use this path in your fixed training script:")
        print(f"UNIFIED_DATA_PATH = \"{unified_path}\"")
|