Spaces:

RockMi
/

onit-text-analysis

Sleeping

onit-text-analysis / src /preprocessing /clean_books.py

Michela

Upload data and app

e62e0c5 12 months ago

3.22 kB

	"""
	This script cleans the OCR files, so that we have uniform documents with the same pre-processing applied to each of
	them. For every book, a new document is created so that the original file is always available for cross-checking etc.

	Code adapted from Travelogues project, by Jan Rörden. Source: https://github.com/travelogues/scripts/blob/master/groundtruth/

	"""

	import os
	import re
	import string
	import unicodedata
	from tqdm import tqdm


	# Directory the original OCR text files are read from
	books_original_dir = "source/path/"
	# Directory the cleaned copies are written to
	output_dir = "output/path/"

	# Create the output directory (and any missing parents) before processing;
	# exist_ok=True makes this a no-op when the directory is already there.
	os.makedirs(output_dir, exist_ok=True)

	def remove_accents(input_str):
	    """Return *input_str* with accents, umlauts and other diacritics removed.

	    The string is first normalised to NFKD, which splits an accented
	    character into its base character followed by combining marks; the
	    combining marks are then dropped and the remaining characters are
	    joined back together.
	    """
	    decomposed = unicodedata.normalize('NFKD', input_str)
	    # unicodedata.combining() returns 0 for characters that are not combining marks
	    base_chars = filter(lambda ch: unicodedata.combining(ch) == 0, decomposed)
	    return ''.join(base_chars)

	# Patterns are compiled once here rather than on every line of every book.
	_LONG_S_RE = re.compile(r'[ſß]')
	_DISALLOWED_RE = re.compile(r'[^a-zA-Z0-9\s' + re.escape(string.punctuation) + ']')
	_LETTER_RE = re.compile(r'[a-zA-Z]')
	# A page whose first kept line starts with one of these prefixes is replaced
	# by the empty-page marker (presumably a failed download rather than OCR
	# text -- confirm against the source data).
	_NON_TEXT_PREFIXES = ('statuscode', '<html>')


	def _clean_line(line):
	    """Return one raw OCR line reduced to ASCII letters, digits, whitespace and punctuation."""
	    # Replace long s and ß with normal s
	    text = _LONG_S_RE.sub('s', line)
	    # Remove accents and umlauts
	    text = remove_accents(text)
	    # Drop every character that is not an ASCII letter, digit, whitespace or punctuation
	    text = _DISALLOWED_RE.sub('', text)
	    # Strip trailing whitespace, including the line break
	    return text.rstrip()


	def _flush_page(page_lines, cleaned_lines):
	    """Append a finished page to *cleaned_lines*, or "<empty page>" if it has no usable text."""
	    if not page_lines or page_lines[0].startswith(_NON_TEXT_PREFIXES):
	        cleaned_lines.append("<empty page>")
	    else:
	        cleaned_lines.extend(page_lines)


	for fname in tqdm(sorted(os.listdir(books_original_dir))):
	    # Process only .txt files
	    if not fname.endswith('.txt'):
	        continue

	    # File name without the '.txt' extension, used to name the output file
	    current_book_id = fname[:-len('.txt')]

	    cleaned_lines = []
	    page_lines = []
	    # True while the current page has not been closed by a blank line.
	    # It starts as True so that an empty input file still yields one
	    # "<empty page>" entry.
	    page_open = True

	    with open(os.path.join(books_original_dir, fname), 'r', encoding='utf-8') as f:
	        for line in f:
	            clean_line = _clean_line(line)

	            # A blank line marks a page break. This test has to come BEFORE
	            # the noise filter below: a blank line has length 0, so the
	            # filter would skip it and no page break would ever be seen.
	            if clean_line == "":
	                _flush_page(page_lines, cleaned_lines)
	                page_lines = []
	                page_open = False
	                continue

	            # Any non-blank line (kept or not) belongs to a page that still
	            # needs to be flushed.
	            page_open = True

	            # Skip noise: lines shorter than 3 characters, purely numeric
	            # lines, and lines without a single ASCII letter.
	            if len(clean_line) < 3 or clean_line.isdigit() or not _LETTER_RE.search(clean_line):
	                continue

	            page_lines.append(clean_line)

	    # Handle the last page only if the file did not end with a blank line;
	    # otherwise that page has already been flushed above.
	    if page_open:
	        _flush_page(page_lines, cleaned_lines)

	    # Save the cleaned text to a new file, one kept line per output line,
	    # so the original file stays untouched.
	    cleaned_file_path = os.path.join(output_dir, f"{current_book_id}_cleaned.txt")
	    with open(cleaned_file_path, 'w', encoding='utf-8') as cleaned_file:
	        cleaned_file.write('\n'.join(cleaned_lines))