Spaces:

Sk4467
/

OCR-annotation

Sleeping

File size: 4,044 Bytes

1e83c8a

import os
import json
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List


class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to native Python types.

    The stock encoder rejects NumPy scalars and arrays. This subclass
    converts them to the closest built-in type so they can be serialized.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``.

        Handles every NumPy boolean, integer and floating scalar (not just
        the 32/64-bit variants) as well as ``np.ndarray``. Any other type is
        delegated to the base class, which raises ``TypeError``.
        """
        # np.bool_ is not a subclass of np.integer, so it needs its own check.
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)


def load_annotations(path: str) -> Dict:
    """Load the JSON annotation data stored at ``path``.

    Returns an empty dict when nothing exists at ``path``. Otherwise the
    file is read as UTF-8 and parsed as JSON; I/O and parse errors
    propagate to the caller.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.loads(handle.read())
    return {}


def save_annotations(path: str, data: Dict):
    """Merge ``data`` into the stored annotations and save them as JSON and CSV.

    Args:
        path: Annotation file path. The JSON file is ``<base>.json`` where
            ``<base>`` is ``path`` without its extension. The CSV file is
            ``path`` itself, unless ``path`` ends in ``.json``, in which
            case the CSV is written to ``<base>.csv`` so the two outputs
            never share a file.
        data: Mapping of image filename to an annotation. A dict value may
            carry ``extracted_text`` and/or ``validated_text``; only the keys
            present overwrite the stored values. Any other value is stored
            as the ``validated_text``.

    Existing annotations are read from the JSON file when it exists,
    otherwise from the CSV file. A file that cannot be read or parsed is
    treated as empty (best effort), so saving never fails because of a
    damaged previous file.
    """
    # Swap only the extension: str.replace('.csv', '.json') would also rewrite
    # '.csv' inside directory names, and would leave a non-.csv path unchanged
    # so that the CSV write below clobbered the JSON just written.
    base, ext = os.path.splitext(path)
    json_path = base + ".json"
    csv_path = base + ".csv" if ext.lower() == ".json" else path

    # Load existing (prefer JSON, else CSV)
    existing = {}
    if os.path.exists(json_path):
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                existing = json.load(f)
        except (OSError, ValueError):
            # Unreadable or malformed JSON: start from an empty store.
            existing = {}
    elif os.path.exists(csv_path):
        try:
            existing = read_annotations_from_csv(csv_path)
        except (OSError, ValueError):
            # Unreadable or malformed CSV: start from an empty store.
            existing = {}
    if not isinstance(existing, dict):
        # A JSON root that is not an object cannot be merged into.
        existing = {}

    # Copy entries so the loaded data is not mutated, and coerce non-dict
    # entries the same way incoming non-dict values are handled below.
    merged: Dict[str, Dict[str, str]] = {}
    for filename, ann in existing.items():
        if isinstance(ann, dict):
            merged[filename] = dict(ann)
        else:
            merged[filename] = {"extracted_text": "", "validated_text": str(ann)}

    # Merge incoming data
    for filename, ann in data.items():
        entry = merged.setdefault(
            filename, {"extracted_text": "", "validated_text": ""}
        )
        if isinstance(ann, dict):
            if "extracted_text" in ann:
                entry["extracted_text"] = str(ann.get("extracted_text", ""))
            if "validated_text" in ann:
                entry["validated_text"] = str(ann.get("validated_text", ""))
        else:
            entry["validated_text"] = str(ann)

    # The JSON file is written first, so its directory must already exist.
    # dirname is '' for a bare filename, which os.makedirs rejects.
    parent = os.path.dirname(json_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Save JSON (full merged)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2, cls=CustomJSONEncoder)

    # Save CSV (full merged)
    save_annotations_to_csv(csv_path, merged)


def load_annotations_from_csv(csv_file: str, image_folder: str) -> Tuple[Dict, List[str], List[str]]:
    """Load annotations from a CSV and split rows by image availability.

    Args:
        csv_file: Path of the CSV file; it must have an ``image_filename``
            column. ``extracted_text`` and ``validated_text`` are optional.
        image_folder: Folder in which each ``image_filename`` is looked up.

    Returns:
        A tuple ``(annotations, valid_images, missing_images)``.
        ``annotations`` maps each filename found in ``image_folder`` to its
        ``extracted_text`` / ``validated_text``. ``valid_images`` and
        ``missing_images`` list the filenames that were and were not found,
        in CSV order. When the CSV has no ``validated_text`` column the
        extracted text is used in its place. Rows with a blank filename are
        skipped. A non-existent ``csv_file`` yields ``({}, [], [])``.

    Raises:
        ValueError: If the CSV has no ``image_filename`` column.
    """
    if not os.path.exists(csv_file):
        return {}, [], []

    # Read every cell as text and disable NA detection. By default pandas
    # turns empty cells (and text such as "NA" or "null") into NaN, which
    # str() renders as "nan", and parses "007" as the number 7.
    df = pd.read_csv(
        csv_file,
        encoding='utf-8-sig',
        dtype=str,
        keep_default_na=False,
    ).fillna('')

    if 'image_filename' not in df.columns:
        raise ValueError("CSV must contain 'image_filename' column.")

    annotations: Dict[str, Dict[str, str]] = {}
    valid_images: List[str] = []
    missing_images: List[str] = []

    for row in df.to_dict('records'):
        filename = str(row['image_filename']).strip()
        if not filename:
            # Joining '' onto the folder would point at the folder itself.
            continue
        extracted = str(row.get('extracted_text', ''))
        if os.path.exists(os.path.join(image_folder, filename)):
            annotations[filename] = {
                'extracted_text': extracted,
                'validated_text': str(row.get('validated_text', extracted)),
            }
            valid_images.append(filename)
        else:
            missing_images.append(filename)

    return annotations, valid_images, missing_images


def read_annotations_from_csv(csv_file: str) -> Dict[str, Dict[str, str]]:
    """Read an annotation CSV into a ``filename -> annotation`` dict.

    Each value holds ``extracted_text`` and ``validated_text`` as strings;
    a column missing from the CSV yields ``''``. Filenames are stripped of
    surrounding whitespace and rows with a blank filename are skipped.
    Errors raised by ``pandas.read_csv`` (missing or empty file, malformed
    content) propagate to the caller.
    """
    # Read every cell as text and disable NA detection. By default pandas
    # turns empty cells (and text such as "NA" or "null") into NaN, which
    # str() renders as "nan", and parses "007" as the number 7.
    df = pd.read_csv(
        csv_file,
        encoding='utf-8-sig',
        dtype=str,
        keep_default_na=False,
    ).fillna('')

    existing: Dict[str, Dict[str, str]] = {}
    for row in df.to_dict('records'):
        filename = str(row.get('image_filename', '')).strip()
        if not filename:
            continue
        existing[filename] = {
            'extracted_text': str(row.get('extracted_text', '')),
            'validated_text': str(row.get('validated_text', '')),
        }
    return existing


def save_annotations_to_csv(csv_file: str, annotations: Dict[str, Dict[str, str]]):
    """Write ``annotations`` to ``csv_file``, one row per image.

    The CSV has the columns ``image_filename``, ``extracted_text`` and
    ``validated_text`` and is encoded as UTF-8 with a BOM. The header row
    is written even when ``annotations`` is empty. The parent directory of
    ``csv_file`` is created if it does not exist.
    """
    columns = ['image_filename', 'extracted_text', 'validated_text']
    rows = [
        {
            'image_filename': filename,
            'extracted_text': str(ann.get('extracted_text', '')),
            'validated_text': str(ann.get('validated_text', '')),
        }
        for filename, ann in annotations.items()
    ]
    # Explicit columns keep the header when there are no rows; without them
    # an empty mapping produces a column-less file.
    df = pd.DataFrame(rows, columns=columns)

    # dirname is '' for a bare filename, and os.makedirs('') raises.
    parent = os.path.dirname(csv_file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(csv_file, index=False, encoding='utf-8-sig')