"""Helpers for loading and saving image text annotations as JSON and CSV."""

import os
import json
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List

# Column order shared by the CSV reader and writer.
_CSV_COLUMNS = ['image_filename', 'extracted_text', 'validated_text']


class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy scalars and arrays to built-in types."""

    def default(self, obj):
        """Return a JSON-serializable equivalent of *obj*.

        NumPy booleans, integers, floats and arrays are converted to the
        matching built-in type; anything else is delegated to the base class,
        which raises ``TypeError``.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


def _read_csv_as_text(csv_file: str) -> pd.DataFrame:
    """Read *csv_file* with every cell kept as the literal string in the file.

    Without ``dtype=str`` / ``keep_default_na=False`` pandas turns empty cells
    into NaN (which ``str()`` renders as ``"nan"``) and coerces numeric-looking
    text such as ``007`` into numbers.
    """
    return pd.read_csv(
        csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False
    )


def _ensure_parent_dir(file_path: str) -> None:
    """Create the parent directory of *file_path* if the path names one."""
    parent = os.path.dirname(file_path)
    # dirname() is '' for a bare filename and os.makedirs('') raises.
    if parent:
        os.makedirs(parent, exist_ok=True)


def _as_entry(value) -> Dict[str, str]:
    """Return a private, dict-shaped copy of a stored annotation.

    Dict entries are copied as-is (extra keys are kept); any other value is
    treated as the validated text, mirroring how ``save_annotations`` handles
    non-dict incoming values.
    """
    if isinstance(value, dict):
        return dict(value)
    return {"extracted_text": "", "validated_text": str(value)}


def _load_existing(json_path: str, csv_path: str) -> Dict:
    """Load previously saved annotations, preferring JSON and falling back to CSV.

    Best-effort: an unreadable or malformed file is skipped rather than raised,
    and ``{}`` is returned when neither file yields data.
    """
    if os.path.exists(json_path):
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            loaded = None
        if isinstance(loaded, dict):
            return loaded
    if os.path.exists(csv_path):
        try:
            return read_annotations_from_csv(csv_path)
        except (OSError, ValueError):
            pass
    return {}


def load_annotations(path: str) -> Dict:
    """Return the parsed JSON content of *path*, or ``{}`` if it does not exist."""
    if not os.path.exists(path):
        return {}
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def save_annotations(path: str, data: Dict):
    """Save annotations to both JSON and CSV formats, merging with existing.

    Args:
        path: Target path, normally the ``.csv`` file. The JSON file is the
            same path with its extension replaced by ``.json``. If *path*
            itself ends in ``.json``, the CSV is written next to it with a
            ``.csv`` extension so the two outputs never overwrite each other.
        data: Mapping of filename to annotation. A dict value may carry
            ``extracted_text`` and/or ``validated_text``; only the keys
            present are updated. Any other value is stored as the validated
            text.
    """
    # splitext only touches the final extension, unlike str.replace which
    # would also rewrite '.csv' occurring in directory names.
    root, ext = os.path.splitext(path)
    json_path = root + '.json'
    csv_path = root + '.csv' if ext.lower() == '.json' else path

    existing = _load_existing(json_path, csv_path)

    # Merge incoming data over private copies of the existing entries.
    merged: Dict[str, Dict[str, str]] = {
        name: _as_entry(entry) for name, entry in existing.items()
    }
    for filename, ann in data.items():
        entry = merged.setdefault(
            filename, {"extracted_text": "", "validated_text": ""}
        )
        if isinstance(ann, dict):
            for key in ("extracted_text", "validated_text"):
                if key in ann:
                    entry[key] = str(ann[key])
        else:
            entry["validated_text"] = str(ann)

    # Save JSON (full merged); the directory must exist before opening.
    _ensure_parent_dir(json_path)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2, cls=CustomJSONEncoder)

    # Save CSV (full merged)
    save_annotations_to_csv(csv_path, merged)


def load_annotations_from_csv(csv_file: str, image_folder: str) -> Tuple[Dict, List[str], List[str]]:
    """Load annotations from a CSV, split by whether each image exists on disk.

    Args:
        csv_file: CSV with an ``image_filename`` column and optional
            ``extracted_text`` / ``validated_text`` columns.
        image_folder: Folder that each filename is joined to for the
            existence check.

    Returns:
        ``(annotations, valid_images, missing_images)``. ``annotations`` holds
        only rows whose image exists; when the ``validated_text`` column is
        absent it defaults to the extracted text. Rows with a blank filename
        are skipped. All three are empty when *csv_file* does not exist.

    Raises:
        ValueError: If the CSV has no ``image_filename`` column.
    """
    if not os.path.exists(csv_file):
        return {}, [], []

    df = _read_csv_as_text(csv_file)
    if 'image_filename' not in df.columns:
        raise ValueError("CSV must contain 'image_filename' column.")

    annotations: Dict[str, Dict[str, str]] = {}
    valid_images: List[str] = []
    missing_images: List[str] = []

    for row in df.to_dict(orient='records'):
        filename = row['image_filename']
        # A blank name would join to the folder itself and pass the check.
        if not filename.strip():
            continue
        if os.path.exists(os.path.join(image_folder, filename)):
            extracted = str(row.get('extracted_text', ''))
            annotations[filename] = {
                'extracted_text': extracted,
                'validated_text': str(row.get('validated_text', extracted)),
            }
            valid_images.append(filename)
        else:
            missing_images.append(filename)

    return annotations, valid_images, missing_images


def read_annotations_from_csv(csv_file: str) -> Dict[str, Dict[str, str]]:
    """Read existing CSV into a filename->annotation dict.

    Filenames are stripped of surrounding whitespace and rows with a blank
    filename are skipped. Missing text columns yield empty strings.
    """
    df = _read_csv_as_text(csv_file)
    existing: Dict[str, Dict[str, str]] = {}
    for row in df.to_dict(orient='records'):
        filename = str(row.get('image_filename', '')).strip()
        if not filename:
            continue
        existing[filename] = {
            'extracted_text': str(row.get('extracted_text', '')),
            'validated_text': str(row.get('validated_text', '')),
        }
    return existing


def save_annotations_to_csv(csv_file: str, annotations: Dict[str, Dict[str, str]]):
    """Write *annotations* to *csv_file* as UTF-8 (with BOM), one row per file.

    The header row is always written, even when *annotations* is empty, so the
    file can be read back by the loaders in this module. The parent directory
    is created when the path names one.
    """
    rows = [
        {
            'image_filename': filename,
            'extracted_text': str(ann.get('extracted_text', '')),
            'validated_text': str(ann.get('validated_text', '')),
        }
        for filename, ann in annotations.items()
    ]
    # Explicit columns keep the header present for an empty row list.
    df = pd.DataFrame(rows, columns=_CSV_COLUMNS)
    _ensure_parent_dir(csv_file)
    df.to_csv(csv_file, index=False, encoding='utf-8-sig')