File size: 4,044 Bytes
1e83c8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import os
import json
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List


class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that also serializes NumPy scalars and arrays.

    The stock encoder rejects NumPy types. This subclass converts them to
    their plain-Python equivalents and defers everything else to the base
    class, which raises ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: The object the base encoder could not serialize.

        Returns:
            ``bool`` for NumPy booleans, ``int`` for any NumPy integer
            width (signed or unsigned), ``float`` for any NumPy floating
            width, and a nested list for ``np.ndarray``.

        Raises:
            TypeError: Propagated from ``json.JSONEncoder.default`` when
                ``obj`` is none of the above.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        # The abstract bases cover every width (int8..int64, uint*, float16..)
        # instead of only the 32/64-bit variants.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


def load_annotations(path: str) -> Dict:
    """Return the JSON content stored at ``path``.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        The decoded JSON document, or an empty dict when ``path`` does
        not exist.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return {}


def save_annotations(path: str, data: Dict) -> None:
    """Merge ``data`` into the stored annotations and write JSON and CSV copies.

    The JSON file sits next to ``path`` with the extension swapped to
    ``.json``. Existing annotations are loaded from that JSON file when it
    exists, otherwise from the CSV file; entries from ``data`` are then
    layered on top and the merged result is written to both files.

    Args:
        path: Target CSV path. If a ``.json`` path is given instead, the
            CSV is written beside it with a ``.csv`` extension so the two
            outputs never land on the same file.
        data: Mapping of image filename to annotation. A dict value may
            carry ``"extracted_text"`` and/or ``"validated_text"``; only
            the keys present overwrite stored values. Any non-dict value
            is stored as the ``"validated_text"``.
    """
    # Swap only the real extension. A substring replace would also rewrite
    # ".csv" inside directory names and would leave json_path == path
    # whenever the path does not contain ".csv" at all.
    stem, ext = os.path.splitext(path)
    json_path = stem + ".json"
    csv_path = stem + ".csv" if ext.lower() == ".json" else path

    # Load existing (prefer JSON, else CSV). Loading is best-effort: an
    # unreadable store is treated as empty rather than aborting the save.
    existing: Dict[str, Dict[str, str]] = {}
    if os.path.exists(json_path):
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                existing = json.load(f)
        except (OSError, ValueError):
            # ValueError covers both malformed JSON and undecodable bytes.
            existing = {}
    elif os.path.exists(csv_path):
        try:
            existing = read_annotations_from_csv(csv_path)
        except Exception:
            # The CSV reader can fail in many parser-specific ways.
            existing = {}
    if not isinstance(existing, dict):
        # A JSON file whose root is a list/scalar cannot be merged into.
        existing = {}

    # Copy and normalize stored entries so every value is a dict holding
    # both text keys; a bare string is kept as the validated text.
    merged: Dict[str, Dict[str, str]] = {}
    for filename, ann in existing.items():
        if isinstance(ann, dict):
            entry = dict(ann)
            entry.setdefault("extracted_text", "")
            entry.setdefault("validated_text", "")
        else:
            entry = {"extracted_text": "", "validated_text": str(ann)}
        merged[filename] = entry

    # Merge incoming data
    for filename, ann in data.items():
        entry = merged.setdefault(
            filename, {"extracted_text": "", "validated_text": ""}
        )
        if isinstance(ann, dict):
            if "extracted_text" in ann:
                entry["extracted_text"] = str(ann["extracted_text"])
            if "validated_text" in ann:
                entry["validated_text"] = str(ann["validated_text"])
        else:
            entry["validated_text"] = str(ann)

    # The JSON file is written first, so its directory must exist already.
    # An empty dirname means the current directory, which needs no creation.
    parent = os.path.dirname(json_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Save JSON (full merged)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2, cls=CustomJSONEncoder)

    # Save CSV (full merged)
    save_annotations_to_csv(csv_path, merged)


def load_annotations_from_csv(csv_file: str, image_folder: str) -> Tuple[Dict, List[str], List[str]]:
    """Load annotations from a CSV file, keeping only rows whose image exists.

    Args:
        csv_file: Path to a CSV file with an ``image_filename`` column and
            optional ``extracted_text`` / ``validated_text`` columns.
        image_folder: Directory that ``image_filename`` values are
            resolved against.

    Returns:
        A tuple ``(annotations, valid_images, missing_images)``.
        ``annotations`` maps each filename found in ``image_folder`` to
        its ``extracted_text`` and ``validated_text``. When the CSV has no
        ``validated_text`` column, the extracted text is used for it.
        ``valid_images`` and ``missing_images`` list the filenames that
        were and were not found, in CSV order. All three are empty when
        ``csv_file`` does not exist.

    Raises:
        ValueError: If the CSV has no ``image_filename`` column.
    """
    if not os.path.exists(csv_file):
        return {}, [], []

    # Read every cell as text and switch off NA inference. Otherwise empty
    # cells and literal texts such as "NA" or "null" come back as NaN and
    # are stringified to 'nan', and numeric-looking filenames lose their
    # string type.
    df = pd.read_csv(
        csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False
    ).fillna('')

    if 'image_filename' not in df.columns:
        raise ValueError("CSV must contain 'image_filename' column.")

    annotations: Dict[str, Dict[str, str]] = {}
    valid_images: List[str] = []
    missing_images: List[str] = []

    for row in df.to_dict(orient='records'):
        filename = str(row['image_filename'])
        if not filename.strip():
            # A blank name would resolve to the folder itself; skip the row.
            continue
        image_path = os.path.join(image_folder, filename)
        if os.path.exists(image_path):
            extracted = row.get('extracted_text', '')
            annotations[filename] = {
                'extracted_text': str(extracted),
                'validated_text': str(row.get('validated_text', extracted)),
            }
            valid_images.append(filename)
        else:
            missing_images.append(filename)

    return annotations, valid_images, missing_images


def read_annotations_from_csv(csv_file: str) -> Dict[str, Dict[str, str]]:
    """Read an existing CSV into a filename -> annotation dict.

    Args:
        csv_file: Path to a CSV file. Rows are keyed by the
            ``image_filename`` column; ``extracted_text`` and
            ``validated_text`` default to ``''`` when the column is absent.

    Returns:
        Mapping of stripped filename to a dict with ``extracted_text`` and
        ``validated_text``. Rows with a blank filename are skipped; when a
        filename repeats, the last row wins.
    """
    # Read every cell as text and switch off NA inference. Otherwise empty
    # cells and literal texts such as "NA" or "null" become NaN and are
    # stringified to 'nan', which also defeats the blank-filename check.
    df = pd.read_csv(
        csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False
    ).fillna('')
    existing: Dict[str, Dict[str, str]] = {}
    for row in df.to_dict(orient='records'):
        filename = str(row.get('image_filename', '')).strip()
        if not filename:
            continue
        existing[filename] = {
            'extracted_text': str(row.get('extracted_text', '')),
            'validated_text': str(row.get('validated_text', '')),
        }
    return existing


def save_annotations_to_csv(csv_file: str, annotations: Dict[str, Dict[str, str]]) -> None:
    """Write annotations to ``csv_file`` as UTF-8 (with BOM) CSV.

    The file always has the columns ``image_filename``, ``extracted_text``
    and ``validated_text``, one row per entry of ``annotations``. Missing
    text keys are written as empty strings. The parent directory is
    created when needed.

    Args:
        csv_file: Destination path; may be a bare filename in the current
            directory.
        annotations: Mapping of image filename to a dict that may contain
            ``extracted_text`` and ``validated_text``.
    """
    columns = ['image_filename', 'extracted_text', 'validated_text']
    rows = [
        {
            'image_filename': filename,
            'extracted_text': str(ann.get('extracted_text', '')),
            'validated_text': str(ann.get('validated_text', '')),
        }
        for filename, ann in annotations.items()
    ]
    # Passing the columns explicitly keeps the header row even when there
    # are no annotations, so the file remains parseable as CSV.
    df = pd.DataFrame(rows, columns=columns)
    # dirname is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when one is named.
    parent = os.path.dirname(csv_file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(csv_file, index=False, encoding='utf-8-sig')