# NOTE: snapshot of src/data_loader.py (commit d79b7f7, 6,416 bytes) taken from a
# Hugging Face Space; the web-page header and line-number gutter were removed.
# src/data_loader.py
import json
import ast
import numpy as np
from datasets import load_dataset
from difflib import SequenceMatcher
# --- CONFIGURATION ---
# Maps raw field keys found in the dataset's parsed ground-truth JSON to the
# unified NER label classes used for B-/I- tagging (see align_labels below).
# Several source keys can collapse onto one class (e.g. both "date" and
# "invoice_date" become DATE).
LABEL_MAPPING = {
    # Vendor/Company
    "seller": "COMPANY",
    "store_name": "COMPANY",
    # Address
    "store_addr": "ADDRESS",
    # Date
    "date": "DATE",
    "invoice_date": "DATE",
    # Total
    "total": "TOTAL",
    "total_gross_worth": "TOTAL",
    # Receipt Number / Invoice No
    "invoice_no": "INVOICE_NO",
    # Bill To / Client
    "client": "BILL_TO"
}
def safe_parse(content):
    """Best-effort parse of a field that may already be a list, a JSON string,
    or a Python literal string.

    Returns the parsed value, or [] when nothing could be interpreted.
    """
    if isinstance(content, list):
        return content
    if not isinstance(content, str):
        return []
    # Try strict JSON first, then Python literal syntax (single quotes, etc.).
    attempts = (
        (json.loads, (json.JSONDecodeError,)),
        (ast.literal_eval, (ValueError, SyntaxError)),
    )
    for parser, expected_errors in attempts:
        try:
            return parser(content)
        except expected_errors:
            continue
    return []
def normalize_box(box, width, height):
    """Convert an OCR polygon into a 4-value [x0, y0, x1, y1] bbox normalized
    to the 0-1000 coordinate space used by LayoutLM-style models.

    Accepts either:
      * ``[polygon, extra]`` — a 2-element wrapper (e.g. PaddleOCR's
        ``[points, (text, conf)]`` pair), where ``polygon`` is a list of
        ``[x, y]`` points, or
      * a bare 4-point polygon ``[[x, y], [x, y], [x, y], [x, y]]``.

    Returns None for anything it cannot interpret (wrong nesting, malformed
    points, degenerate image dimensions), so callers can filter falsy results.
    """
    # Guard explicitly against zero/negative dimensions instead of letting a
    # ZeroDivisionError fall into a blanket handler as the old code did.
    if not width or not height or width <= 0 or height <= 0:
        return None

    if isinstance(box, list) and len(box) == 2 and isinstance(box[0], list):
        polygon = box[0]  # [polygon, text/confidence] wrapper variant
    elif isinstance(box, list) and len(box) == 4 and isinstance(box[0], list):
        polygon = box     # bare 4-point polygon
    else:
        return None

    def _scale(value, extent):
        # Clamp the scaled coordinate into [0, 1000].
        return int(max(0, min(1000 * (value / extent), 1000)))

    try:
        xs = [point[0] for point in polygon]
        ys = [point[1] for point in polygon]
        return [
            _scale(min(xs), width),
            _scale(min(ys), height),
            _scale(max(xs), width),
            _scale(max(ys), height),
        ]
    except (TypeError, IndexError, ValueError):
        # Malformed points (non-subscriptable, empty polygon, non-numeric).
        return None
def tokenize_and_spread_boxes(words, boxes):
    """
    Splits phrases into individual words and duplicates the bounding box.
    Input: ['Invoice #123'], [BOX_A]
    Output: ['Invoice', '#123'], [BOX_A, BOX_A]
    """
    out_words, out_boxes = [], []
    for phrase, bbox in zip(words, boxes):
        pieces = str(phrase).split()
        out_words.extend(pieces)
        # One copy of the phrase's box per resulting token.
        out_boxes.extend([bbox] * len(pieces))
    return out_words, out_boxes
def align_labels(ocr_words, label_map):
    """Match OCR words to ground-truth values via sliding-window fuzzy matching.

    Args:
        ocr_words: flat list of OCR token strings.
        label_map: mapping of {ground-truth text value: label class},
            e.g. {"INV-001": "INVOICE_NO"}.

    Returns:
        A list of BIO tags ("O", "B-CLASS", "I-CLASS") aligned 1:1 with
        ``ocr_words``.  Later map entries overwrite earlier tags on overlap.
    """
    tags = ["O"] * len(ocr_words)
    for target_text, label_class in label_map.items():
        if not target_text:
            continue
        target_tokens = str(target_text).split()
        if not target_tokens:
            continue
        n_target = len(target_tokens)
        # Sliding window search over the OCR token stream.
        for i in range(len(ocr_words) - n_target + 1):
            window = ocr_words[i : i + n_target]
            match = True
            for j in range(n_target):
                # Clean surrounding punctuation for the comparison.
                w_clean = window[j].strip(".,-:")
                t_clean = target_tokens[j].strip(".,-:")
                # BUG FIX: tokens that strip to "" (e.g. "-" or ":") used to
                # match ANY target, because "" is a substring of every string.
                # Require both sides to be non-empty before the fuzzy test.
                if not w_clean or not t_clean:
                    match = False
                    break
                if w_clean not in t_clean and t_clean not in w_clean:
                    match = False
                    break
            if match:
                tags[i] = f"B-{label_class}"
                for k in range(1, n_target):
                    tags[i + k] = f"I-{label_class}"
    return tags
def load_unified_dataset(split="train", sample_size=None):
    """Load and preprocess 'mychen76/invoices-and-receipts_ocr_v1' into
    LayoutLM-style training examples.

    Args:
        split: dataset split name (default "train").
        sample_size: optional cap on the number of raw examples to read.

    Returns:
        A list of dicts with keys "image" (RGB PIL image), "words",
        "bboxes" (0-1000 normalized), and "ner_tags" (BIO strings).
        Only examples containing at least one labeled entity are kept.
    """
    print(f"🔄 Loading dataset 'mychen76/invoices-and-receipts_ocr_v1' ({split})...")
    dataset = load_dataset("mychen76/invoices-and-receipts_ocr_v1", split=split)
    if sample_size:
        dataset = dataset.select(range(sample_size))

    processed_data = []
    print("⚙️ Processing, Tokenizing, and Aligning...")
    for example in dataset:
        try:
            image = example['image']
            if image.mode != "RGB":
                image = image.convert("RGB")
            width, height = image.size

            # 1. Parse raw OCR. Decode the JSON envelope ONCE (the old code
            #    ran json.loads on the same payload twice per example).
            raw_data = json.loads(example['raw_data'])
            raw_words = safe_parse(raw_data.get('ocr_words'))
            raw_boxes = safe_parse(raw_data.get('ocr_boxes'))
            if not raw_words or not raw_boxes or len(raw_words) != len(raw_boxes):
                continue

            # 2. Normalize boxes first, dropping words whose box is malformed.
            norm_boxes = []
            valid_words = []
            for word, box in zip(raw_words, raw_boxes):
                nb = normalize_box(box, width, height)
                if nb:
                    norm_boxes.append(nb)
                    valid_words.append(word)

            # 3. Split multi-word phrases into tokens, duplicating each box.
            final_words, final_boxes = tokenize_and_spread_boxes(valid_words, norm_boxes)

            # 4. Build {ground-truth value -> label class} from parsed fields.
            parsed_json = json.loads(example['parsed_data'])
            fields = safe_parse(parsed_json.get('json', {}))
            label_value_map = {}
            if isinstance(fields, dict):
                for key, value in fields.items():
                    if key in LABEL_MAPPING and value:
                        label_value_map[value] = LABEL_MAPPING[key]

            # 5. Align OCR tokens with ground-truth values -> BIO tags.
            final_tags = align_labels(final_words, label_value_map)

            # Only keep examples with at least one entity (cleaner training data).
            if len(set(final_tags)) > 1:
                processed_data.append({
                    "image": image,
                    "words": final_words,
                    "bboxes": final_boxes,
                    "ner_tags": final_tags,
                })
        except Exception:
            # Deliberate best-effort: skip malformed examples rather than
            # abort the whole load. Kept broad because upstream data quality
            # varies per example.
            continue

    print(f"✅ Successfully processed {len(processed_data)} examples.")
    return processed_data
if __name__ == "__main__":
    # Smoke test: process a small sample and show what came out.
    data = load_unified_dataset(sample_size=20)
    if data:
        first = data[0]
        print(f"\nSample 0 Words: {first['words'][:10]}...")
        print(f"Sample 0 Tags: {first['ner_tags'][:10]}...")
        # Collect the distinct BIO tags seen across the whole sample.
        unique_tags = {tag for item in data for tag in item['ner_tags']}
        print(f"\nUnique Tags Found in Sample: {unique_tags}")
    else:
        print("No valid examples found in sample.")