Spaces:

muk42
/

histOSM

Running on Zero

App Files Files Community

histOSM / annotation_tab /annotation_logic.py

muk42

debug annotations csv

4937aea 4 months ago

raw

history blame contribute delete

7.74 kB

	import os
	import pandas as pd
	import threading
	import gradio as gr
	from config import OUTPUT_DIR

	# ==== CONFIG ====
	# Folder scanned for the images to annotate; created at import time so the
	# directory listing below cannot fail on a fresh OUTPUT_DIR.
	IMAGE_FOLDER = os.path.join(OUTPUT_DIR, "blobs")
	os.makedirs(IMAGE_FOLDER, exist_ok=True)

	# Path of the annotations CSV in use; None until a tile has been selected.
	CSV_FILE = None
	# In-memory annotation table: one row per blob_id with its human_ocr text.
	df_annotations = pd.DataFrame(columns=["blob_id", "human_ocr"])

	# ==== STATE ====
	# Only image files whose name contains "_margin" are listed.
	# os.listdir() returns entries in arbitrary order, so the names are sorted
	# to keep the navigation order (and "Image i of n") stable between runs.
	all_images = sorted(
	    f for f in os.listdir(IMAGE_FOLDER)
	    if f.lower().endswith(('.png', '.jpg', '.jpeg')) and '_margin' in f
	)
	all_images_paths = [os.path.join(IMAGE_FOLDER, f) for f in all_images]
	# Index into all_images_paths of the image currently being shown.
	current_index = 0

	# ==== TILE CSV ====
	def switch_tile_csv(selected_tile):
	    """Point the module at the annotations CSV that belongs to a tile.

	    Sets ``CSV_FILE`` to ``annotations_<tile name>.csv`` inside ``OUTPUT_DIR``
	    and replaces ``df_annotations`` with the content of that file when it
	    exists, or with an empty table when it does not. The file itself is
	    never created here.

	    Args:
	        selected_tile: Mapping with a ``"tile_path"`` entry; only the base
	            name of that path, without its extension, is used.

	    Returns:
	        The path now stored in ``CSV_FILE``.
	    """
	    global CSV_FILE, df_annotations

	    tile_filename = os.path.basename(selected_tile["tile_path"])
	    tile_name, _ = os.path.splitext(tile_filename)
	    CSV_FILE = os.path.join(OUTPUT_DIR, f"annotations_{tile_name}.csv")

	    if os.path.exists(CSV_FILE):
	        # keep_default_na=False keeps blank cells as "" and text such as
	        # "NA" or "null" as literal strings instead of turning them into NaN.
	        df_annotations = pd.read_csv(
	            CSV_FILE,
	            dtype={"blob_id": str, "human_ocr": str},
	            keep_default_na=False,
	        )
	    else:
	        # Start from an empty table so rows loaded for a previous tile are
	        # not carried over and later written into this tile's CSV.
	        df_annotations = pd.DataFrame(columns=["blob_id", "human_ocr"])

	    return CSV_FILE

	# ==== HELPERS ====
	def get_progress_text():
	    """Return the progress label for the current position in the image list.

	    Returns:
	        ``"Image <i> of <n>"`` with a 1-based position, or
	        ``"No images loaded"`` when the image list is empty.
	    """
	    total = len(all_images_paths)
	    if total == 0:
	        return "No images loaded"
	    position = current_index + 1
	    return f"Image {position} of {total}"

	def get_current_image_path():
	    """Return the path at ``current_index``, or None when the index is out of range."""
	    out_of_range = current_index < 0 or current_index >= len(all_images_paths)
	    if out_of_range:
	        return None
	    return all_images_paths[current_index]

	def get_annotation_for_image(image_path):
	    """Return the stored ``human_ocr`` text for an image.

	    The blob id is the image's base name with ``"_margin"`` removed. When
	    several rows share that id, the last one is used.

	    Args:
	        image_path: Path of the image whose annotation is wanted.

	    Returns:
	        The annotation as a string, or ``""`` when there is no row for the
	        image or the stored value is missing.
	    """
	    blob_id = os.path.basename(image_path).replace("_margin", "")
	    matches = df_annotations.loc[df_annotations["blob_id"] == blob_id, "human_ocr"]
	    if matches.empty:
	        return ""

	    latest = matches.iloc[-1]
	    # A blank annotation written to CSV is read back as NaN; str() would
	    # turn that into the literal text "nan", so report it as empty instead.
	    if pd.isna(latest):
	        return ""
	    return str(latest)

	def is_annotated_or_deleted(image_path):
	    """Return True if the image has a non-blank annotation.

	    The ``"DELETED"`` marker is a non-blank value, so deleted images count
	    as processed too. When several rows share the blob id, the last one
	    decides.

	    Args:
	        image_path: Path of the image to check.

	    Returns:
	        True when the stored ``human_ocr`` value is present and not only
	        whitespace; False when there is no row or the value is blank/missing.
	    """
	    blob_id = os.path.basename(image_path).replace("_margin", "")
	    matches = df_annotations.loc[df_annotations["blob_id"] == blob_id, "human_ocr"]
	    if matches.empty:
	        return False

	    latest = matches.iloc[-1]
	    # A blank annotation read back from CSV is NaN; without this check
	    # str(NaN) == "nan" would make the image look annotated.
	    if pd.isna(latest):
	        return False
	    return str(latest).strip() != ""  # includes 'DELETED' as counted

	def is_deleted(image_path):
	    """Return True if the last stored value for the image is ``"DELETED"``.

	    The blob id is the image's base name with ``"_margin"`` removed; the
	    stored value is compared after stripping surrounding whitespace.
	    """
	    blob_id = os.path.basename(image_path).replace("_margin", "")
	    stored = df_annotations.loc[df_annotations["blob_id"] == blob_id, "human_ocr"]
	    if stored.empty:
	        return False
	    marker = str(stored.iloc[-1]).strip()
	    return marker == "DELETED"

	def all_processed():
	    """Report whether every listed image is annotated or deleted.

	    Returns True for an empty image list, since nothing is left to process.
	    """
	    for path in all_images_paths:
	        if not is_annotated_or_deleted(path):
	            return False
	    return True

	def find_next_unprocessed_index(start):
	    """Find the first unprocessed image after ``start``, wrapping around.

	    The search visits every index once, beginning at ``start + 1`` and
	    ending at ``start`` itself.

	    Args:
	        start: Index the search starts after.

	    Returns:
	        The index of the first image that is neither annotated nor deleted,
	        or None when there is none (including an empty image list).
	    """
	    total = len(all_images_paths)
	    for step in range(1, total + 1):
	        candidate = (start + step) % total
	        if is_annotated_or_deleted(all_images_paths[candidate]):
	            continue
	        return candidate
	    return None

	# ==== CORE FUNCTIONS ====
	def save_annotation(user_text):
	    """Store ``user_text`` as the annotation of the current image.

	    The text is stripped of surrounding whitespace (a falsy value is stored
	    as ``""``), written into ``df_annotations`` and the table is saved to
	    ``CSV_FILE``. Does nothing when there is no current image.

	    Args:
	        user_text: Annotation entered by the user; may be None or empty.

	    Returns:
	        None.
	    """
	    global df_annotations
	    img_path = get_current_image_path()
	    if not img_path:
	        return

	    blob_id = os.path.basename(img_path).replace("_margin", "")
	    text_value = user_text.strip() if user_text else ""

	    matches = df_annotations["blob_id"] == blob_id
	    if matches.any():
	        # Update every row of this blob: the readers use the LAST matching
	        # row, so touching only the first one would hide the edit whenever
	        # the CSV contains duplicate blob_id rows.
	        df_annotations.loc[matches, "human_ocr"] = text_value
	    else:
	        df_annotations = pd.concat(
	            [df_annotations, pd.DataFrame([{"blob_id": blob_id, "human_ocr": text_value}])],
	            ignore_index=True,
	        )

	    # Without a selected tile there is no file to write to; to_csv(None)
	    # would only build and discard a string.
	    if CSV_FILE is not None:
	        df_annotations.to_csv(CSV_FILE, index=False)

	def save_and_next(user_text):
	    """Save the current annotation and advance to the next unprocessed image.

	    When every image is processed after saving, the position resets to the
	    first image and an "All images annotated." message is made visible.

	    Args:
	        user_text: Annotation text for the image currently shown.

	    Returns:
	        A 5-tuple: image path, its stored annotation, a ``gr.update`` for the
	        status message, the image path again, and the progress text. With no
	        current image the tuple carries placeholder values instead.
	    """
	    global current_index
	    if get_current_image_path() is None:
	        return None, "", gr.update(visible=True, value="No images available."), "No image loaded", "No images loaded"

	    save_annotation(user_text)

	    if all_processed():
	        current_index = 0
	        status = gr.update(visible=True, value="All images annotated.")
	    else:
	        candidate = find_next_unprocessed_index(current_index)
	        current_index = 0 if candidate is None else candidate
	        status = gr.update(visible=False)

	    shown = get_current_image_path()
	    stored_text = get_annotation_for_image(shown)
	    return shown, stored_text, status, shown, get_progress_text()

	def previous_image():
	    """Step back one image, wrapping from the first image to the last.

	    Returns:
	        A 5-tuple: image path, its stored annotation, a ``gr.update`` hiding
	        the status message, the image path again, and the progress text.
	        With an empty image list the tuple carries placeholder values and a
	        visible "No images available." message.
	    """
	    global current_index
	    total = len(all_images_paths)
	    if total == 0:
	        return None, "", gr.update(visible=True, value="No images available."), "No image loaded", "No images loaded"

	    current_index = (current_index - 1) % total
	    shown = get_current_image_path()
	    stored_text = get_annotation_for_image(shown)
	    return shown, stored_text, gr.update(visible=False), shown, get_progress_text()

	def delete_and_next():
	    """Mark the current image as DELETED and move to the next unprocessed one.

	    The ``"DELETED"`` marker is stored as the image's annotation and the
	    table is written to the CSV. When every image is processed afterwards,
	    the position resets to the first image and an "All images annotated."
	    message is made visible.

	    Returns:
	        A 5-tuple: image path, its stored annotation, a ``gr.update`` for the
	        status message, the image path again, and the progress text. With no
	        current image the tuple carries placeholder values instead.
	    """
	    global current_index
	    img_path = get_current_image_path()
	    if not img_path:
	        return None, "", gr.update(visible=True, value="No images available."), "No image loaded", "No images loaded"

	    # Storing the marker is the same insert-or-update + CSV write that
	    # save_annotation performs, so reuse it instead of duplicating it.
	    save_annotation("DELETED")

	    if all_processed():
	        current_index = 0
	        img_path = get_current_image_path()
	        annotation = get_annotation_for_image(img_path)
	        return img_path, annotation, gr.update(visible=True, value="All images annotated."), img_path, get_progress_text()

	    next_idx = find_next_unprocessed_index(current_index)
	    current_index = next_idx if next_idx is not None else 0
	    img_path = get_current_image_path()
	    annotation = get_annotation_for_image(img_path)
	    return img_path, annotation, gr.update(visible=False), img_path, get_progress_text()

	def save_and_exit(user_text):
	    """Save the current annotation, then terminate the process shortly after.

	    Args:
	        user_text: Annotation text for the image currently shown; only saved
	            when there is a current image.

	    Returns:
	        A 5-tuple with no image, empty annotation, a visible
	        "Session closed." message, an empty string, and the progress text.
	    """
	    if get_current_image_path() is not None:
	        save_annotation(user_text)

	    # Hard-exit one second later from a timer thread; the delay presumably
	    # lets this return value reach the client first (TODO confirm).
	    shutdown_timer = threading.Timer(1, os._exit, args=(0,))
	    shutdown_timer.start()

	    closed_notice = gr.update(visible=True, value="Session closed.")
	    return None, "", closed_notice, "", get_progress_text()

	def refresh_image_list(selected_tile):
	    """Reload images for the current tile and prepare the annotation CSV.

	    Sets ``CSV_FILE`` to ``annotations_<tile name>.csv`` inside ``OUTPUT_DIR``,
	    loads it into ``df_annotations`` (creating an empty, header-only file
	    when it does not exist), re-scans ``IMAGE_FOLDER`` for ``_margin`` images
	    and resets the position to the first image.

	    Args:
	        selected_tile: Mapping with a ``"tile_path"`` entry; only the base
	            name of that path, without its extension, is used.

	    Returns:
	        A 5-tuple: image path, its stored annotation, a ``gr.update`` for the
	        status message, the image path again, and the progress text. When no
	        images are found the tuple carries placeholder values and a visible
	        "No images available." message.
	    """
	    global all_images_paths, current_index, df_annotations, CSV_FILE

	    # Track current tile and derive its CSV path
	    tile_filename = os.path.basename(selected_tile["tile_path"])
	    tile_name, _ = os.path.splitext(tile_filename)
	    CSV_FILE = os.path.join(OUTPUT_DIR, f"annotations_{tile_name}.csv")

	    if os.path.exists(CSV_FILE):
	        # keep_default_na=False keeps blank cells as "" and text such as
	        # "NA" or "null" as literal strings instead of turning them into NaN,
	        # which would otherwise surface as the annotation text "nan".
	        df_annotations = pd.read_csv(
	            CSV_FILE,
	            dtype={"blob_id": str, "human_ocr": str},
	            keep_default_na=False,
	        )
	    else:
	        # Initialize an empty CSV for this tile
	        df_annotations = pd.DataFrame(columns=["blob_id", "human_ocr"])
	        df_annotations.to_csv(CSV_FILE, index=False)

	    # Reload images. os.listdir() returns entries in arbitrary order, so
	    # sort to keep the navigation order stable between refreshes.
	    image_names = sorted(
	        f for f in os.listdir(IMAGE_FOLDER)
	        if f.lower().endswith(('.png', '.jpg', '.jpeg')) and '_margin' in f
	    )
	    all_images_paths = [os.path.join(IMAGE_FOLDER, f) for f in image_names]
	    current_index = 0

	    if not all_images_paths:
	        return None, "", gr.update(visible=True, value="No images available."), "No image loaded", "No images loaded"

	    # Return first image and annotation
	    img_path = get_current_image_path()
	    annotation = get_annotation_for_image(img_path)
	    return img_path, annotation, gr.update(visible=False), img_path, get_progress_text()