# Sk4467's picture
# Upload 108 files
# 1e83c8a verified
import os
import json
import pandas as pd
import numpy as np
from typing import Tuple, Dict, List
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to native Python equivalents.

    NumPy scalars and arrays are not serializable by the stock encoder; this
    subclass maps them onto ``bool``, ``int``, ``float`` and ``list`` so they
    can be passed straight to ``json.dump`` via ``cls=CustomJSONEncoder``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Args:
            obj: A value the base encoder could not serialize on its own.

        Returns:
            A native ``bool``, ``int``, ``float`` or ``list`` for NumPy
            booleans, integers, floats and arrays respectively.

        Raises:
            TypeError: Propagated from the base class for any other type.
        """
        # np.bool_ does not derive from np.integer, so it needs its own branch.
        if isinstance(obj, np.bool_):
            return bool(obj)
        # The abstract bases cover every width (int8..uint64, float16..float64),
        # not just the 32/64-bit variants.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)
def load_annotations(path: str) -> Dict:
    """Read a JSON annotations file.

    Args:
        path: Location of the JSON file.

    Returns:
        The parsed JSON content, or an empty dict when ``path`` does not exist.
    """
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            return json.load(handle)
    return {}
def save_annotations(path: str, data: Dict):
    """Merge ``data`` into the stored annotations and write JSON and CSV copies.

    The JSON file sits next to ``path`` with the extension swapped for
    ``.json``. Previously stored annotations are loaded from the JSON file
    when it is readable, otherwise from the CSV file; incoming entries are
    layered on top and both files are rewritten with the merged result.

    Args:
        path: Location of the CSV file. If a ``.json`` path is passed instead,
            the CSV is written beside it with a ``.csv`` extension so the two
            outputs never overwrite each other.
        data: Mapping of image filename to either a dict with optional
            ``extracted_text`` / ``validated_text`` keys, or a bare value that
            is stored (stringified) as the validated text.
    """
    # Swap only the real extension; a plain str.replace would also rewrite
    # ".csv" inside directory names and leave extension-less paths unchanged,
    # making the CSV write clobber the JSON file.
    stem, ext = os.path.splitext(path)
    json_path = stem + ".json"
    csv_path = stem + ".csv" if ext.lower() == ".json" else path

    # Load existing annotations (prefer JSON, fall back to CSV).
    existing = None
    if os.path.exists(json_path):
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                loaded = json.load(f)
        except (OSError, ValueError):
            # Unreadable or malformed JSON: try the CSV copy instead.
            loaded = None
        if isinstance(loaded, dict):
            existing = loaded
    if existing is None and os.path.exists(csv_path):
        try:
            existing = read_annotations_from_csv(csv_path)
        except Exception:
            # Deliberately best-effort: CSV parsing can fail in many ways and
            # a broken history file must not block saving new annotations.
            existing = None

    # Normalize stored entries so every value is a dict with both text keys;
    # extra keys already present in the stored JSON are preserved.
    merged: Dict[str, Dict[str, str]] = {}
    for filename, ann in (existing or {}).items():
        if isinstance(ann, dict):
            entry = dict(ann)
            entry.setdefault("extracted_text", "")
            entry.setdefault("validated_text", "")
        else:
            entry = {"extracted_text": "", "validated_text": str(ann)}
        merged[filename] = entry

    # Merge incoming data; only keys actually supplied overwrite stored text.
    for filename, ann in data.items():
        entry = merged.setdefault(
            filename, {"extracted_text": "", "validated_text": ""}
        )
        if isinstance(ann, dict):
            for key in ("extracted_text", "validated_text"):
                if key in ann:
                    entry[key] = str(ann[key])
        else:
            entry["validated_text"] = str(ann)

    # The JSON file is written first, so its directory must exist already.
    parent = os.path.dirname(json_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Save JSON (full merged)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2, cls=CustomJSONEncoder)

    # Save CSV (full merged)
    save_annotations_to_csv(csv_path, merged)
def load_annotations_from_csv(csv_file: str, image_folder: str) -> Tuple[Dict, List[str], List[str]]:
    """Load annotations from a CSV and split rows by whether the image exists.

    Args:
        csv_file: Path to a CSV with an ``image_filename`` column and optional
            ``extracted_text`` / ``validated_text`` columns.
        image_folder: Directory the filenames are resolved against.

    Returns:
        A tuple ``(annotations, valid_images, missing_images)``.
        ``annotations`` maps each filename found in ``image_folder`` to its
        ``extracted_text`` and ``validated_text``; when the CSV has no
        ``validated_text`` column the extracted text is used for both. The two
        lists hold the filenames that were and were not found on disk. All
        three are empty when ``csv_file`` does not exist.

    Raises:
        ValueError: If the CSV lacks an ``image_filename`` column.
    """
    if not os.path.exists(csv_file):
        return {}, [], []

    # Read every cell verbatim as text. With pandas' defaults an empty cell
    # becomes NaN (later stringified to "nan"), literal text such as "NA" or
    # "null" is dropped, and numeric-looking text like "007" loses its zeros.
    df = pd.read_csv(
        csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False
    ).fillna('')
    if 'image_filename' not in df.columns:
        raise ValueError("CSV must contain 'image_filename' column.")

    annotations: Dict[str, Dict[str, str]] = {}
    valid_images: List[str] = []
    missing_images: List[str] = []
    for record in df.to_dict('records'):
        filename = record['image_filename'].strip()
        if not filename:
            # A blank name would resolve to the folder itself and be accepted.
            continue
        if os.path.exists(os.path.join(image_folder, filename)):
            extracted = record.get('extracted_text', '')
            annotations[filename] = {
                'extracted_text': extracted,
                'validated_text': record.get('validated_text', extracted),
            }
            valid_images.append(filename)
        else:
            missing_images.append(filename)
    return annotations, valid_images, missing_images
def read_annotations_from_csv(csv_file: str) -> Dict[str, Dict[str, str]]:
    """Read an annotations CSV into a filename -> annotation dict.

    Args:
        csv_file: Path to a CSV with ``image_filename`` and optional
            ``extracted_text`` / ``validated_text`` columns.

    Returns:
        Mapping of each non-blank, whitespace-stripped filename to a dict with
        ``extracted_text`` and ``validated_text``; absent columns and empty
        cells yield empty strings.
    """
    # Read every cell verbatim as text so empty cells stay "" instead of
    # turning into the string "nan", and values such as "NA" or "007" are kept.
    df = pd.read_csv(
        csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False
    ).fillna('')
    existing: Dict[str, Dict[str, str]] = {}
    for record in df.to_dict('records'):
        filename = record.get('image_filename', '').strip()
        if not filename:
            continue
        existing[filename] = {
            'extracted_text': record.get('extracted_text', ''),
            'validated_text': record.get('validated_text', ''),
        }
    return existing
def save_annotations_to_csv(csv_file: str, annotations: Dict[str, Dict[str, str]]):
    """Write annotations to a CSV file, one row per image.

    The file always has the columns ``image_filename``, ``extracted_text`` and
    ``validated_text`` and is encoded as UTF-8 with a BOM.

    Args:
        csv_file: Destination path; missing parent directories are created.
        annotations: Mapping of filename to a dict holding ``extracted_text``
            and ``validated_text``. Missing or ``None`` values are written as
            empty cells.
    """
    columns = ['image_filename', 'extracted_text', 'validated_text']

    def as_text(value) -> str:
        # None would otherwise be written as the literal text "None".
        return '' if value is None else str(value)

    rows = [
        {
            'image_filename': filename,
            'extracted_text': as_text(ann.get('extracted_text')),
            'validated_text': as_text(ann.get('validated_text')),
        }
        for filename, ann in annotations.items()
    ]
    # Passing explicit columns keeps the header row even when there are no
    # annotations; a header-less file cannot be read back with pd.read_csv.
    df = pd.DataFrame(rows, columns=columns)

    # os.makedirs('') raises, so skip it for a bare filename in the cwd.
    parent = os.path.dirname(csv_file)
    if parent:
        os.makedirs(parent, exist_ok=True)
    df.to_csv(csv_file, index=False, encoding='utf-8-sig')