File size: 2,543 Bytes
faa3050 d79b7f7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 |
# src/sroie_loader.py
import json
from pathlib import Path
from PIL import Image
def _first_existing_dir(parent, names):
    """Return the first ``parent / name`` that is a directory, else ``None``."""
    for name in names:
        candidate = parent / name
        # is_dir() rather than exists(): a regular file with one of these
        # names must not be mistaken for the image/annotation folder.
        if candidate.is_dir():
            return candidate
    return None


def _normalize_boxes(boxes, width, height):
    """Scale pixel boxes onto a 0-1000 integer grid, clamping to that range.

    Every box is unpacked as ``[x0, y0, x1, y1]``; x values are divided by
    *width* and y values by *height*, multiplied by 1000, clamped to
    ``[0, 1000]`` and truncated with ``int``.

    Raises:
        ValueError / TypeError: a box is not a 4-item sequence of numbers.
        ZeroDivisionError: *width* or *height* is zero.
    """

    def scale(value, extent):
        return int(max(0, min(1000 * (value / extent), 1000)))

    return [
        [scale(x0, width), scale(y0, height), scale(x1, width), scale(y1, height)]
        for x0, y0, x1, y1 in boxes
    ]


def _load_example(img_file, json_path):
    """Build one example dict for *img_file*, or return ``None`` to skip it.

    An example is skipped silently when its annotation file is missing or
    lacks any of the ``words`` / ``bbox`` / ``labels`` keys, and skipped with
    a printed message when the annotation or the image cannot be read or is
    inconsistent.
    """
    name = img_file.stem
    try:
        with open(json_path, encoding="utf8") as f:
            data = json.load(f)
    except FileNotFoundError:
        # Image without an annotation file: nothing to load, not an error.
        return None
    except (OSError, ValueError) as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        # One bad annotation file must not abort the whole dataset load.
        print(f"Skipping {name}: {e}")
        return None

    required_keys = ("words", "bbox", "labels")
    if not isinstance(data, dict) or any(key not in data for key in required_keys):
        return None

    words, boxes, labels = data["words"], data["bbox"], data["labels"]
    try:
        # The three lists are parallel (one box and one tag per word); a
        # length mismatch would silently misalign them downstream.
        if not len(words) == len(boxes) == len(labels):
            raise ValueError(
                f"length mismatch: {len(words)} words, "
                f"{len(boxes)} boxes, {len(labels)} labels"
            )
        # Only the size is needed; the context manager closes the file handle.
        with Image.open(img_file) as img:
            width, height = img.size
        norm_boxes = _normalize_boxes(boxes, width, height)
    except Exception as e:
        # Deliberate best-effort boundary: any unreadable image or malformed
        # box skips this one example (with a message) instead of failing the
        # whole load.
        print(f"Skipping {name}: {e}")
        return None

    return {
        "image_path": str(img_file),
        "words": words,
        "bboxes": norm_boxes,  # normalized to the 0-1000 grid
        "ner_tags": labels,
    }


def load_sroie(path):
    """Load SROIE-style annotations from a local directory.

    For each of the ``train`` and ``test`` sub-directories of *path*, images
    are read from ``images/`` (or ``img/``) and annotations from ``tagged/``
    (or ``box/``).  Every ``.jpg`` / ``.jpeg`` / ``.png`` image that has a
    same-named ``.json`` annotation containing ``words``, ``bbox`` and
    ``labels`` produces one example.  Images are visited in sorted order.

    Args:
        path: Dataset root directory (string or path-like).

    Returns:
        ``{"train": [...], "test": [...]}`` where each example is a dict with
        ``image_path`` (str), ``words``, ``bboxes`` (boxes scaled by the image
        size onto a 0-1000 integer grid) and ``ner_tags``.  A split whose image
        or annotation directory is missing is left as an empty list.
    """
    print(f"🔄 Loading SROIE from local path: {path}")
    root = Path(path)
    image_suffixes = (".jpg", ".jpeg", ".png")
    dataset = {"train": [], "test": []}

    for split in ("train", "test"):
        split_path = root / split
        img_dir = _first_existing_dir(split_path, ("images", "img"))
        ann_dir = _first_existing_dir(split_path, ("tagged", "box"))
        if img_dir is None or ann_dir is None:
            continue

        examples = []
        for img_file in sorted(img_dir.iterdir()):
            if img_file.suffix.lower() not in image_suffixes:
                continue
            example = _load_example(img_file, ann_dir / f"{img_file.stem}.json")
            if example is not None:
                examples.append(example)

        dataset[split] = examples
        print(f" Mapped {len(examples)} paths for {split}")

    return dataset