# histOSM / annotation_tab / annotation_logic.py
# muk42's picture
# debug annotations csv
# 4937aea
import os
import pandas as pd
import threading
import gradio as gr
from config import OUTPUT_DIR
# ==== CONFIG ====
# Folder that holds the blob images offered for annotation; created on import
# if it does not exist yet.
IMAGE_FOLDER = os.path.join(OUTPUT_DIR, "blobs")
os.makedirs(IMAGE_FOLDER, exist_ok=True)

# Path of the annotation CSV for the active tile; stays None until a tile is
# selected through switch_tile_csv() or refresh_image_list().
CSV_FILE = None

# In-memory annotation table: one row per blob with its human transcription.
df_annotations = pd.DataFrame(columns=["blob_id", "human_ocr"])

# ==== STATE ====
# File names of the "_margin" images present in IMAGE_FOLDER at import time.
all_images = [
    name
    for name in os.listdir(IMAGE_FOLDER)
    if '_margin' in name and name.lower().endswith(('.png', '.jpg', '.jpeg'))
]
# Same images as full paths; this is the list the navigation helpers index.
all_images_paths = [os.path.join(IMAGE_FOLDER, name) for name in all_images]

# Position of the currently displayed image within all_images_paths.
current_index = 0
# ==== TILE CSV ====
def switch_tile_csv(selected_tile):
    """Point the module at the annotation CSV that belongs to ``selected_tile``.

    The CSV path is built from the base name of ``selected_tile["tile_path"]``
    (extension removed) as ``annotations_<tile_name>.csv`` inside
    ``OUTPUT_DIR``. An existing file is loaded into ``df_annotations``. A
    missing file is NOT created here; instead the in-memory table is reset to
    an empty one so rows of a previously selected tile cannot be written into
    this tile's CSV by the next save.

    Args:
        selected_tile: Mapping with a ``"tile_path"`` entry naming the tile file.

    Returns:
        The path of the tile's annotation CSV (the file may not exist yet).
    """
    global CSV_FILE, df_annotations

    tile_filename = os.path.basename(selected_tile["tile_path"])
    tile_name, _ = os.path.splitext(tile_filename)
    CSV_FILE = os.path.join(OUTPUT_DIR, f"annotations_{tile_name}.csv")

    if os.path.exists(CSV_FILE):
        # keep_default_na=False keeps blank cells as "" and keeps literal
        # transcriptions such as "NA" or "None" as text; the default parser
        # would turn them into NaN, which later stringifies as "nan".
        df_annotations = pd.read_csv(
            CSV_FILE,
            dtype={"blob_id": str, "human_ocr": str},
            keep_default_na=False,
        )
    else:
        # Only the in-memory table is reset; no file is created on disk.
        df_annotations = pd.DataFrame(columns=["blob_id", "human_ocr"])
    return CSV_FILE
# ==== HELPERS ====
def get_progress_text():
    """Return the "Image X of N" label, or a placeholder when no images are loaded."""
    total = len(all_images_paths)
    if total == 0:
        return "No images loaded"
    position = current_index + 1  # shown 1-based
    return f"Image {position} of {total}"
def get_current_image_path():
    """Return the path stored at ``current_index``, or None if the index is out of range."""
    in_range = 0 <= current_index < len(all_images_paths)
    return all_images_paths[current_index] if in_range else None
def get_annotation_for_image(image_path):
    """Return the stored transcription for ``image_path``.

    The blob id is the file's base name with every ``"_margin"`` occurrence
    removed. When several rows share that id, the last one is used.

    Args:
        image_path: Path of a ``_margin`` image; a falsy value yields ``""``.

    Returns:
        The ``human_ocr`` text of the matching row, or ``""`` when there is no
        matching row or the stored cell is missing (NaN/None).
    """
    if not image_path:
        return ""
    blob_id = os.path.basename(image_path).replace("_margin", "")
    matches = df_annotations.loc[df_annotations["blob_id"] == blob_id, "human_ocr"]
    if matches.empty:
        return ""
    value = matches.iloc[-1]
    # A blank cell read back from CSV is NaN; str(NaN) would show up as the
    # literal text "nan" in the annotation box.
    if pd.isna(value):
        return ""
    return str(value)
def is_annotated_or_deleted(image_path):
    """Return True if image has an annotation or is deleted.

    The last row whose ``blob_id`` matches the image (base name with
    ``"_margin"`` removed) decides the result. Any non-blank text counts,
    which includes the ``"DELETED"`` marker. A missing cell (NaN/None) or
    whitespace-only text counts as not processed.

    Args:
        image_path: Path of a ``_margin`` image.

    Returns:
        bool: True when a non-blank ``human_ocr`` value is stored for the image.
    """
    blob_id = os.path.basename(image_path).replace("_margin", "")
    matches = df_annotations.loc[df_annotations["blob_id"] == blob_id, "human_ocr"]
    if matches.empty:
        return False
    value = matches.iloc[-1]
    # A blank cell comes back from CSV as NaN; without this check it would
    # stringify to "nan" and be mistaken for a real annotation.
    if pd.isna(value):
        return False
    return str(value).strip() != ""  # includes 'DELETED' as counted
def is_deleted(image_path):
    """Return True when the last stored entry for the image is the "DELETED" marker."""
    blob_id = os.path.basename(image_path).replace("_margin", "")
    matching = df_annotations[df_annotations["blob_id"] == blob_id]
    if matching.empty:
        return False
    latest = matching["human_ocr"].values[-1]
    return str(latest).strip() == "DELETED"
def all_processed():
    """Return True if all images are either annotated or deleted.

    An empty image list also yields True.
    """
    for path in all_images_paths:
        if not is_annotated_or_deleted(path):
            return False
    return True
def find_next_unprocessed_index(start):
    """Return the next image index that is neither annotated nor deleted.

    The search begins just after ``start``, wraps around the end of the list
    and checks ``start`` itself last. Returns None when every image is
    processed or the list is empty.
    """
    total = len(all_images_paths)
    for step in range(1, total + 1):
        candidate = (start + step) % total
        if not is_annotated_or_deleted(all_images_paths[candidate]):
            return candidate
    return None
# ==== CORE FUNCTIONS ====
def save_annotation(user_text):
    """Save the current annotation for the active image.

    The text is stripped (a falsy value is stored as ``""``) and written to
    the ``human_ocr`` column for the current image's blob id (base name with
    ``"_margin"`` removed). A new row is appended when the id is not present
    yet. The whole table is then written to ``CSV_FILE``.

    Args:
        user_text: Transcription entered by the user; may be None or empty.

    Returns:
        None. Does nothing when no image is currently selected.
    """
    global df_annotations

    img_path = get_current_image_path()
    if not img_path:
        return

    blob_id = os.path.basename(img_path).replace("_margin", "")
    text_value = user_text.strip() if user_text else ""

    matches = df_annotations["blob_id"] == blob_id
    if matches.any():
        # Update every row with this id: the lookup helpers read the LAST
        # match, so touching only the first one would make a save invisible
        # whenever the table contains duplicate ids.
        df_annotations.loc[matches, "human_ocr"] = text_value
    else:
        df_annotations = pd.concat(
            [df_annotations, pd.DataFrame([{"blob_id": blob_id, "human_ocr": text_value}])],
            ignore_index=True,
        )

    # No tile selected yet means there is no file to write to; to_csv(None)
    # would only build a string and discard it.
    if CSV_FILE is not None:
        df_annotations.to_csv(CSV_FILE, index=False)
def save_and_next(user_text):
    """Save the text for the current image and move to the next unprocessed one.

    Returns a 5-tuple: image path, its stored annotation, a ``gr.update`` for
    the status message, the image path again, and the progress text. When
    every image is processed, the index goes back to 0 and the message
    "All images annotated." is shown.
    """
    global current_index

    if get_current_image_path() is None:
        return (
            None,
            "",
            gr.update(visible=True, value="No images available."),
            "No image loaded",
            "No images loaded",
        )

    save_annotation(user_text)

    if all_processed():
        current_index = 0
        notice = gr.update(visible=True, value="All images annotated.")
    else:
        upcoming = find_next_unprocessed_index(current_index)
        current_index = 0 if upcoming is None else upcoming
        notice = gr.update(visible=False)

    shown_path = get_current_image_path()
    stored_text = get_annotation_for_image(shown_path)
    return shown_path, stored_text, notice, shown_path, get_progress_text()
def previous_image():
    """Step back one image, wrapping from the first image to the last.

    Returns the same 5-tuple shape as the other navigation callbacks: image
    path, stored annotation, a ``gr.update`` hiding the status message, the
    image path again, and the progress text.
    """
    global current_index

    total = len(all_images_paths)
    if total == 0:
        return (
            None,
            "",
            gr.update(visible=True, value="No images available."),
            "No image loaded",
            "No images loaded",
        )

    current_index = (current_index - 1) % total
    shown_path = get_current_image_path()
    stored_text = get_annotation_for_image(shown_path)
    return shown_path, stored_text, gr.update(visible=False), shown_path, get_progress_text()
def delete_and_next():
    """Mark current image as DELETED and move to next image.

    Deleting is stored as the text ``"DELETED"`` in the ``human_ocr`` column,
    so this is exactly "save the text DELETED, then advance". The work is
    delegated to ``save_and_next`` instead of repeating its save and
    navigation steps, which keeps both callbacks writing the table and
    picking the next image the same way.

    Returns:
        The 5-tuple produced by ``save_and_next``: image path, stored
        annotation, a ``gr.update`` for the status message, the image path
        again, and the progress text. The "No images available." tuple is
        returned when no image is selected.
    """
    return save_and_next("DELETED")
def save_and_exit(user_text):
    """Save the current annotation (if an image is selected) and end the process.

    The exit is scheduled on a timer one second after this call, so the
    return value is produced before ``os._exit(0)`` fires.
    """
    has_image = get_current_image_path() is not None
    if has_image:
        save_annotation(user_text)

    def _terminate():
        # Hard exit of the whole process with status 0.
        os._exit(0)

    threading.Timer(1, _terminate).start()
    return None, "", gr.update(visible=True, value="Session closed."), "", get_progress_text()
def refresh_image_list(selected_tile):
    """Reload the image list and prepare the annotation CSV for ``selected_tile``.

    The CSV path is ``annotations_<tile_name>.csv`` inside ``OUTPUT_DIR``,
    where ``tile_name`` is the base name of ``selected_tile["tile_path"]``
    without its extension. An existing CSV is loaded; otherwise an empty one
    with the ``blob_id`` / ``human_ocr`` header is created. The image list is
    rebuilt from every ``_margin`` image in ``IMAGE_FOLDER`` (this function
    does not filter the folder by tile) and the index is reset to 0.

    Args:
        selected_tile: Mapping with a ``"tile_path"`` entry naming the tile file.

    Returns:
        A 5-tuple: first image path, its stored annotation, a ``gr.update``
        for the status message, the image path again, and the progress text.
        When no images are found, the "No images available." placeholders are
        returned instead.
    """
    # all_images is declared global so the module-level list stays in sync
    # with all_images_paths instead of being shadowed by a local.
    global all_images, all_images_paths, current_index, df_annotations, CSV_FILE

    tile_filename = os.path.basename(selected_tile["tile_path"])
    tile_name, _ = os.path.splitext(tile_filename)
    CSV_FILE = os.path.join(OUTPUT_DIR, f"annotations_{tile_name}.csv")

    if os.path.exists(CSV_FILE):
        # keep_default_na=False keeps blank cells as "" and keeps literal
        # transcriptions such as "NA" or "None" as text; the default parser
        # would turn them into NaN, which later stringifies as "nan".
        df_annotations = pd.read_csv(
            CSV_FILE,
            dtype={"blob_id": str, "human_ocr": str},
            keep_default_na=False,
        )
    else:
        # Start this tile with an empty table and write its header to disk.
        df_annotations = pd.DataFrame(columns=["blob_id", "human_ocr"])
        df_annotations.to_csv(CSV_FILE, index=False)

    # Reload images
    all_images = [
        f for f in os.listdir(IMAGE_FOLDER)
        if f.lower().endswith(('.png', '.jpg', '.jpeg')) and '_margin' in f
    ]
    all_images_paths = [os.path.join(IMAGE_FOLDER, f) for f in all_images]
    current_index = 0

    if not all_images_paths:
        return None, "", gr.update(visible=True, value="No images available."), "No image loaded", "No images loaded"

    # Return first image and annotation
    img_path = get_current_image_path()
    annotation = get_annotation_for_image(img_path)
    return img_path, annotation, gr.update(visible=False), img_path, get_progress_text()