Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import pandas as pd | |
| import numpy as np | |
| from typing import Tuple, Dict, List | |
class CustomJSONEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy values to native Python types.

    Handles every NumPy integer and floating scalar width (not only the
    32/64-bit ones), NumPy booleans, and arrays. Anything else is passed to
    ``json.JSONEncoder.default``, which raises ``TypeError``.
    """

    def default(self, obj):
        """Return a JSON-serializable stand-in for *obj*."""
        # np.bool_ is not a subclass of np.integer, so it needs its own check.
        if isinstance(obj, np.bool_):
            return bool(obj)
        # The abstract bases cover int8..int64, uint8..uint64, float16..float64.
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            # tolist() converts the elements to native Python scalars as well.
            return obj.tolist()
        return super().default(obj)
def load_annotations(path: str) -> Dict:
    """Return the parsed JSON stored at *path*, or ``{}`` when no such file exists."""
    if os.path.exists(path):
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
        return payload
    # Nothing saved yet: callers get an empty mapping rather than an error.
    return {}
def save_annotations(path: str, data: Dict):
    """Merge *data* into the stored annotations and write them as JSON and CSV.

    Args:
        path: Location of the CSV file. The JSON file is written next to it
            with the same base name and a ``.json`` suffix. If *path* itself
            ends in ``.json``, the CSV goes to the sibling ``.csv`` file so
            the two outputs never overwrite each other.
        data: Mapping of image filename to either a dict that may carry
            ``extracted_text`` and/or ``validated_text``, or any other value,
            which is stored as the validated text.

    Existing annotations are read from the JSON file when it exists,
    otherwise from the CSV. An unreadable existing file is treated as empty
    (best effort), so the incoming data is still saved.
    """
    # Swap only the final suffix; str.replace would also rewrite a ".csv"
    # that appears inside a directory name or earlier in the file name.
    root, ext = os.path.splitext(path)
    json_path = root + '.json'
    csv_path = root + '.csv' if ext.lower() == '.json' else path

    # Load existing (prefer JSON, else CSV)
    existing = {}
    if os.path.exists(json_path):
        try:
            with open(json_path, "r", encoding="utf-8") as f:
                existing = json.load(f)
        except (OSError, ValueError):
            # Unreadable or malformed JSON (JSONDecodeError and
            # UnicodeDecodeError are ValueErrors): start from scratch.
            existing = {}
    elif os.path.exists(csv_path):
        try:
            existing = read_annotations_from_csv(csv_path)
        except Exception:
            # Deliberately broad: any CSV parsing problem means "no usable
            # previous annotations", not a failed save.
            existing = {}

    # Copy the existing entries so the merge never mutates loaded objects,
    # and lift plain (non-dict) values into the standard two-field shape so
    # the item assignments below cannot fail on them.
    merged: Dict[str, Dict[str, str]] = {}
    if isinstance(existing, dict):
        for filename, ann in existing.items():
            if isinstance(ann, dict):
                merged[filename] = dict(ann)
            else:
                merged[filename] = {"extracted_text": "", "validated_text": str(ann)}

    # Merge incoming data: only the fields actually supplied are overwritten.
    for filename, ann in data.items():
        entry = merged.setdefault(filename, {"extracted_text": "", "validated_text": ""})
        if isinstance(ann, dict):
            for key in ("extracted_text", "validated_text"):
                if key in ann:
                    entry[key] = str(ann[key])
        else:
            entry["validated_text"] = str(ann)

    # The JSON is written first, so its directory has to exist already.
    parent = os.path.dirname(json_path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # Save JSON (full merged)
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(merged, f, ensure_ascii=False, indent=2, cls=CustomJSONEncoder)

    # Save CSV (full merged)
    save_annotations_to_csv(csv_path, merged)
def load_annotations_from_csv(csv_file: str, image_folder: str) -> Tuple[Dict, List[str], List[str]]:
    """Load annotations from a CSV, split by whether each image file exists.

    Args:
        csv_file: CSV with an ``image_filename`` column and optional
            ``extracted_text`` / ``validated_text`` columns.
        image_folder: Directory the filenames are resolved against.

    Returns:
        ``(annotations, valid_images, missing_images)``. ``annotations`` maps
        each filename found in *image_folder* to its two text fields;
        ``valid_images`` and ``missing_images`` list the filenames that were
        and were not found, in file order. All three are empty when
        *csv_file* does not exist.

    Raises:
        ValueError: If the CSV has no ``image_filename`` column.
    """
    if not os.path.exists(csv_file):
        return {}, [], []

    # Read every cell as literal text. With pandas' defaults a blank cell, or
    # text such as "NA"/"null", becomes NaN and str() then yields "nan", and
    # a numeric-looking filename is parsed as a number.
    df = pd.read_csv(csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False)
    if 'image_filename' not in df.columns:
        raise ValueError("CSV must contain 'image_filename' column.")

    annotations: Dict[str, Dict[str, str]] = {}
    valid_images: List[str] = []
    missing_images: List[str] = []

    for record in df.to_dict('records'):
        filename = str(record['image_filename'])
        if not filename.strip():
            # A blank name would resolve to image_folder itself and be
            # reported as an existing image.
            continue
        if os.path.exists(os.path.join(image_folder, filename)):
            extracted = str(record.get('extracted_text', ''))
            annotations[filename] = {
                'extracted_text': extracted,
                # Falls back to the extracted text only when the CSV has no
                # validated_text column at all.
                'validated_text': str(record.get('validated_text', extracted)),
            }
            valid_images.append(filename)
        else:
            missing_images.append(filename)

    return annotations, valid_images, missing_images
def read_annotations_from_csv(csv_file: str) -> Dict[str, Dict[str, str]]:
    """Read an annotation CSV into a filename -> annotation dict.

    Rows whose ``image_filename`` is blank are skipped. Missing
    ``extracted_text`` / ``validated_text`` columns yield empty strings.
    """
    # Read every cell as literal text. With pandas' defaults a blank cell
    # becomes NaN, which str() turns into the string "nan" (so blank
    # filenames were kept under the key "nan"), and numeric-looking values
    # such as "007" are parsed as numbers.
    df = pd.read_csv(csv_file, encoding='utf-8-sig', dtype=str, keep_default_na=False)

    existing: Dict[str, Dict[str, str]] = {}
    for record in df.to_dict('records'):
        filename = str(record.get('image_filename', '')).strip()
        if not filename:
            continue
        existing[filename] = {
            'extracted_text': str(record.get('extracted_text', '')),
            'validated_text': str(record.get('validated_text', '')),
        }
    return existing
def save_annotations_to_csv(csv_file: str, annotations: Dict[str, Dict[str, str]]):
    """Write *annotations* to *csv_file*, one row per image.

    The file always has the columns ``image_filename``, ``extracted_text``
    and ``validated_text`` and is encoded as UTF-8 with a BOM. Missing parent
    directories are created.
    """
    columns = ['image_filename', 'extracted_text', 'validated_text']
    rows = [
        {
            'image_filename': filename,
            'extracted_text': str(ann.get('extracted_text', '')),
            'validated_text': str(ann.get('validated_text', '')),
        }
        for filename, ann in annotations.items()
    ]
    # Pin the columns so an empty mapping still produces a header row;
    # without it the file is header-less and pd.read_csv cannot parse it.
    df = pd.DataFrame(rows, columns=columns)

    # dirname is '' for a bare filename, and os.makedirs('') raises
    # FileNotFoundError, so only create a directory when there is one.
    parent = os.path.dirname(csv_file)
    if parent:
        os.makedirs(parent, exist_ok=True)

    df.to_csv(csv_file, index=False, encoding='utf-8-sig')