Spaces:

Sk4467
/

OCR-annotation

Sleeping

App Files Files Community

OCR-annotation / backend /app /services /annotations.py

Sk4467

Upload 108 files

1e83c8a verified 4 months ago

raw

history blame contribute delete

4.04 kB

	import os
	import json
	import pandas as pd
	import numpy as np
	from typing import Tuple, Dict, List


	class CustomJSONEncoder(json.JSONEncoder):
	    """JSON encoder that converts NumPy values into native Python values.

	    The standard encoder rejects NumPy scalars and arrays with ``TypeError``.
	    This subclass converts them before serialization:

	    * any NumPy boolean scalar  -> ``bool``
	    * any NumPy integer scalar  -> ``int`` (signed or unsigned, any width)
	    * any NumPy floating scalar -> ``float`` (any width)
	    * ``numpy.ndarray``         -> nested ``list`` via ``tolist()``

	    Anything else is delegated to ``json.JSONEncoder.default``, which raises
	    ``TypeError`` for unsupported objects.
	    """

	    def default(self, obj):
	        """Return a JSON-serializable equivalent of ``obj``.

	        Raises:
	            TypeError: if ``obj`` is not a NumPy value handled here and the
	                base encoder cannot serialize it either.
	        """
	        # np.bool_ is not part of the np.integer hierarchy, so test it on its own.
	        if isinstance(obj, np.bool_):
	            return bool(obj)
	        # Abstract bases cover every width (int8..int64, uint8..uint64, float16..).
	        if isinstance(obj, np.integer):
	            return int(obj)
	        if isinstance(obj, np.floating):
	            return float(obj)
	        if isinstance(obj, np.ndarray):
	            return obj.tolist()
	        return super().default(obj)


	def load_annotations(path: str) -> Dict:
	    """Return the parsed JSON stored at ``path``.

	    When nothing exists at ``path`` an empty dict is returned instead of
	    raising. The file is read as UTF-8.
	    """
	    if os.path.exists(path):
	        with open(path, "r", encoding="utf-8") as handle:
	            return json.load(handle)
	    return {}


	def save_annotations(path: str, data: Dict):
	    """Merge ``data`` into the stored annotations and write JSON and CSV copies.

	    Args:
	        path: Location of the CSV copy. The JSON copy lives next to it: a
	            trailing ``.csv`` extension (any letter case) is swapped for
	            ``.json``; for any other path ``.json`` is appended, so the two
	            files can never be the same file.
	        data: Mapping of image filename to either a dict that may carry
	            ``extracted_text`` and/or ``validated_text``, or a plain value
	            that is stored as the ``validated_text``.

	    Existing annotations are loaded best-effort: the JSON copy is preferred,
	    and the CSV copy is used when the JSON copy is absent, unreadable, or does
	    not hold a mapping. If neither can be read the merge starts from empty.
	    Only the keys present in an incoming dict overwrite stored values.
	    """
	    # Swap only the real extension. A plain str.replace would also rewrite
	    # ".csv" inside directory names, and would leave json_path == path for
	    # non-.csv paths so that the CSV write clobbers the JSON just written.
	    root, ext = os.path.splitext(path)
	    json_path = root + ".json" if ext.lower() == ".csv" else path + ".json"

	    # Load existing (prefer JSON, else CSV)
	    loaded = None
	    if os.path.exists(json_path):
	        try:
	            with open(json_path, "r", encoding="utf-8") as f:
	                loaded = json.load(f)
	        except (OSError, ValueError):
	            # Unreadable or malformed JSON (JSONDecodeError and
	            # UnicodeDecodeError are ValueErrors): fall back to the CSV copy.
	            loaded = None
	    if not isinstance(loaded, dict) and os.path.exists(path):
	        try:
	            loaded = read_annotations_from_csv(path)
	        except Exception:
	            # Deliberate best-effort: a broken CSV must not block saving.
	            loaded = None
	    existing = loaded if isinstance(loaded, dict) else {}

	    # Copy stored entries so the merge never mutates the loaded structure.
	    # A stored non-dict entry is treated like an incoming non-dict value.
	    merged: Dict[str, Dict[str, str]] = {}
	    for filename, ann in existing.items():
	        if isinstance(ann, dict):
	            merged[filename] = dict(ann)
	        else:
	            merged[filename] = {"extracted_text": "", "validated_text": str(ann)}

	    # Merge incoming data
	    for filename, ann in (data or {}).items():
	        entry = merged.setdefault(
	            filename, {"extracted_text": "", "validated_text": ""}
	        )
	        if isinstance(ann, dict):
	            for key in ("extracted_text", "validated_text"):
	                if key in ann:
	                    entry[key] = str(ann[key])
	        else:
	            entry["validated_text"] = str(ann)

	    # The JSON copy is written first, so its directory must exist already.
	    parent_dir = os.path.dirname(json_path)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)

	    # Save JSON (full merged)
	    with open(json_path, "w", encoding="utf-8") as f:
	        json.dump(merged, f, ensure_ascii=False, indent=2, cls=CustomJSONEncoder)

	    # Save CSV (full merged)
	    save_annotations_to_csv(path, merged)


	def load_annotations_from_csv(csv_file: str, image_folder: str) -> Tuple[Dict, List[str], List[str]]:
	    """Load annotations from a CSV and split rows by whether the image exists.

	    Args:
	        csv_file: CSV with an ``image_filename`` column and optional
	            ``extracted_text`` / ``validated_text`` columns.
	        image_folder: Directory that the filenames are resolved against.

	    Returns:
	        ``(annotations, valid_images, missing_images)``. ``annotations`` maps
	        each filename whose image exists to its two text fields; when the CSV
	        has no ``validated_text`` column the extracted text is used for it.
	        ``valid_images`` and ``missing_images`` list filenames in CSV order.
	        All three are empty when ``csv_file`` does not exist.

	    Raises:
	        ValueError: if the CSV has no ``image_filename`` column.
	    """
	    if not os.path.exists(csv_file):
	        return {}, [], []

	    # Read every cell as literal text. With pandas' default inference an empty
	    # cell becomes NaN (later stringified to "nan") and numeric-looking OCR
	    # text loses its formatting ("007" -> 7).
	    df = pd.read_csv(
	        csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False
	    ).fillna('')

	    if 'image_filename' not in df.columns:
	        raise ValueError("CSV must contain 'image_filename' column.")

	    annotations: Dict[str, Dict[str, str]] = {}
	    valid_images: List[str] = []
	    missing_images: List[str] = []

	    for row in df.to_dict('records'):
	        filename = row['image_filename']
	        # A blank filename would resolve to image_folder itself and be
	        # reported as an existing image; such rows carry no usable key.
	        if not filename.strip():
	            continue
	        if os.path.exists(os.path.join(image_folder, filename)):
	            extracted = row.get('extracted_text', '')
	            annotations[filename] = {
	                'extracted_text': extracted,
	                'validated_text': row.get('validated_text', extracted),
	            }
	            valid_images.append(filename)
	        else:
	            missing_images.append(filename)

	    return annotations, valid_images, missing_images


	def read_annotations_from_csv(csv_file: str) -> Dict[str, Dict[str, str]]:
	    """Read an annotations CSV into a ``filename -> annotation`` dict.

	    Args:
	        csv_file: CSV with ``image_filename`` and optional ``extracted_text``
	            / ``validated_text`` columns.

	    Returns:
	        Mapping of stripped filename to a dict holding ``extracted_text`` and
	        ``validated_text`` (empty strings for absent columns or empty cells).
	        Rows with a blank filename are skipped; for duplicate filenames the
	        last row wins.
	    """
	    # Read every cell as literal text. With pandas' default inference an empty
	    # cell becomes NaN, whose str() is the truthy "nan": blank filenames would
	    # be stored under the key "nan" and empty texts would turn into "nan".
	    df = pd.read_csv(
	        csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False
	    ).fillna('')

	    existing: Dict[str, Dict[str, str]] = {}
	    for row in df.to_dict('records'):
	        filename = row.get('image_filename', '').strip()
	        if not filename:
	            continue
	        existing[filename] = {
	            'extracted_text': row.get('extracted_text', ''),
	            'validated_text': row.get('validated_text', ''),
	        }
	    return existing


	def save_annotations_to_csv(csv_file: str, annotations: Dict[str, Dict[str, str]]):
	    """Write ``annotations`` to ``csv_file`` as UTF-8 (with BOM) CSV.

	    Args:
	        csv_file: Destination path. Its parent directory is created when the
	            path has one; a bare filename is written to the current directory.
	        annotations: Mapping of image filename to a dict that may carry
	            ``extracted_text`` and ``validated_text``; absent keys are written
	            as empty strings.

	    The header row ``image_filename,extracted_text,validated_text`` is always
	    written, even for an empty mapping, so the file can be parsed again.
	    """
	    columns = ['image_filename', 'extracted_text', 'validated_text']
	    rows = [
	        {
	            'image_filename': filename,
	            'extracted_text': str(ann.get('extracted_text', '')),
	            'validated_text': str(ann.get('validated_text', '')),
	        }
	        for filename, ann in annotations.items()
	    ]
	    # Explicit columns keep the header when there are no rows; a column-less
	    # frame would produce a file that pd.read_csv rejects as empty.
	    df = pd.DataFrame(rows, columns=columns)

	    # os.makedirs('') raises, so only create a directory when the path has one.
	    parent_dir = os.path.dirname(csv_file)
	    if parent_dir:
	        os.makedirs(parent_dir, exist_ok=True)
	    df.to_csv(csv_file, index=False, encoding='utf-8-sig')