Spaces:

RockMi
/

onit-text-analysis

Sleeping

onit-text-analysis / src /utils /annotations_preprocessing.py

Michela

Upload data and app

e62e0c5 12 months ago

3.36 kB

	"""
	This script matches the text annotations created on the original OCR files to the cleaned version of the text.
	The annotations were created with Recogito https://recogito.pelagios.org/.

	Code by Michela Vignoli. Parts of this code were developed with assistance from GPT-4 and GPT-3 (free version).
	"""

	## Import packages ##

	import pandas as pd
	import os
	import re
	from typing import Union


	## Import annotations from Recogito ##

	# Directory that holds the Recogito CSV export (trailing slash included).
	path_1 = 'source/path/'
	# File name of the Recogito annotation export inside path_1.
	filename_1 = "jiggvn0g5pgx34.csv"

	# Function to reformat the labels
	def reformat_labels(label_str):
	    """
	    Turns a pipe-separated label string into a comma-separated list of quoted labels.

	    Parameters:
	    - label_str: The raw label cell, e.g. "person|place". The value is converted
	      with str() first, so a missing cell (float NaN) is treated as the text "nan".

	    Returns:
	    - str: Every label wrapped in single quotes and joined with ', ',
	      e.g. "'person', 'place'".
	    """
	    # str.split takes a literal separator, not a regex, so the pipe must not be escaped.
	    labels = str(label_str).split('|')
	    return ', '.join(f"'{label}'" for label in labels)

	# Load the annotation export and keep only the columns used further down.
	annotation_columns = ["UUID", "FILE", "QUOTE_TRANSCRIPTION", "ANCHOR", "COMMENTS", "TAGS"]
	annotations_csv = os.path.join(path_1, filename_1)
	df1 = pd.read_csv(annotations_csv)[annotation_columns]

	# Rewrite every TAGS cell with reformat_labels
	df1['TAGS'] = df1['TAGS'].apply(reformat_labels)


	## Extract page numbers from merged OCR text file ##

	# Load the whole merged OCR text into memory as one string
	merged_text_path = 'source/path/Z255430508_clean_merged.txt'
	with open(merged_text_path, mode='r', encoding='utf-8') as file1:
	    text_content1 = file1.read()


	# Function to find a number in the preceding character sequence
	# Matches "page" immediately followed by digits and captures the digits.
	_PAGE_NUMBER_RE = re.compile(r'page(\d+)')


	def _coerce_non_negative_int(value, name):
	    """Return value as a plain int, or raise ValueError if it is not a non-negative whole number."""
	    # Whole-valued floats (e.g. 42.0 coming out of a float64 column) are accepted as integers.
	    if isinstance(value, float) and value.is_integer():
	        value = int(value)
	    # Integer-like objects expose __index__; fractional floats, NaN and strings do not.
	    try:
	        value = value.__index__()
	    except (AttributeError, TypeError):
	        raise ValueError(f"{name} must be a non-negative integer") from None
	    if value < 0:
	        raise ValueError(f"{name} must be a non-negative integer")
	    return value


	def find_number_before_position(text: str, position: int, search_length: int = 10000) -> str:
	    """
	    Finds the number of the 'page<digits>' marker closest before the given position.

	    For position 0 there is no preceding text, so the first marker within the
	    following search_length characters is returned instead.

	    Parameters:
	    - text (str): The full text to search within.
	    - position (int): The position in the text to search around. Whole-valued
	      floats and other integer-like values are accepted and converted to int.
	    - search_length (int): The maximum number of characters to scan before
	      (or, for position 0, after) the position.

	    Returns:
	    - str: The digits of the selected marker, or "Check!" if the scanned
	      window contains no marker.

	    Raises:
	    - ValueError: If text is not a string, or if position or search_length
	      is not a non-negative integer.
	    """
	    if not isinstance(text, str):
	        raise ValueError("text must be a string")
	    position = _coerce_non_negative_int(position, "position")
	    search_length = _coerce_non_negative_int(search_length, "search_length")

	    if position == 0:
	        # Nothing precedes the start of the text: look forward and take the first marker.
	        window = text[:search_length]
	        pick = 0
	    else:
	        # Look backward and take the marker nearest to the position (the last one found).
	        window = text[max(0, position - search_length):position]
	        pick = -1

	    matches = _PAGE_NUMBER_RE.findall(window)
	    return matches[pick] if matches else "Check!"

	# Look up the page number for every annotation.
	# The first run of digits in ANCHOR is used as the position in the merged text
	# (presumably the annotation's character offset -- confirm against the export format).
	anchor_positions = pd.to_numeric(df1['ANCHOR'].str.extract(r'(\d+)')[0], errors='coerce')


	def _page_for_anchor(anchor_position):
	    """Return the page number for one anchor position, or "Check!" if the anchor had no digits."""
	    # errors='coerce' turns unparsable anchors into NaN, which also makes the whole
	    # column float; flag those rows and hand every other position over as a plain int.
	    if pd.isna(anchor_position):
	        return "Check!"
	    return find_number_before_position(text_content1, int(anchor_position))


	df1['PAGE'] = anchor_positions.apply(_page_for_anchor)


	## Annotation analysis ##

	# Split each TAGS cell on ', ' and flatten the per-row lists into a single list.
	# A comprehension is used instead of Series.sum(), which concatenates the lists
	# pairwise and yields 0 rather than an empty list when there are no rows.
	all_labels = [label for cell in df1['TAGS'].str.split(', ') for label in cell]

	# Count the occurrences of each label
	label_counts = pd.Series(all_labels).value_counts()

	print(label_counts)