from pdfminer.high_level import extract_text
import glob
import re


def extract_text_from_pdf(pdf_path):
    """Extract the full text of a single PDF with pdfminer.six."""
    text = extract_text(pdf_path)
    return text


# Gather every Bentham PDF in the working directory and extract its text.
# pdfminer's extract_text accepts a file path directly, so the files do
# not need to be opened by hand.
bentham_texts = []
bentham_pdfs = glob.glob('./Bentham*.pdf')
for pdf in bentham_pdfs:
    print(pdf)
    text = extract_text_from_pdf(pdf)
    bentham_texts.append(text)

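# Illustrative sanity check (not part of the original pipeline): confirm
# each PDF yielded a non-trivial amount of text before joining them.
for path, text in zip(bentham_pdfs, bentham_texts):
    print(path, len(text), 'characters')
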
# Join the per-PDF texts and cache them to disk so extraction only has
# to run once.
bentham_text_string = ' '.join(bentham_texts)
with open('bentham_text.txt', 'w') as f:
    f.write(bentham_text_string)

# Reload the cached text; later steps can rerun from here without
# re-extracting the PDFs.
with open('bentham_text.txt', 'r') as f:
    bentham_text_string = f.read()

def clean_text(text):
    """Strip Online Library of Liberty boilerplate and formatting noise."""
    # Remove section markers such as '§ 1.'
    cleaned_text = re.sub(r'§\s*\d+\.', '', text)
    # Remove the multi-line 'PLL v... (generated ...)' page footers,
    # together with the URL and running-header lines around them.
    cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)
    # Remove any URLs and running headers that survived the passes above.
    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)
    cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text)
    cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)
    # Strip control characters, but keep newlines (\x0A) so the
    # line-anchored patterns below still have lines to match against.
    cleaned_text = re.sub(r'[\x00-\x09\x0B-\x1F\x7F-\x9F]', '', cleaned_text)
    # Remove escape sequences that appear literally in the extracted text.
    cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)
    # Blank lines, bare page numbers, and navigation links.
    patterns_to_remove = [
        r'^\s*$',
        r'^\s*\d+\s*$',
        r'\[Back to Table of Contents\]',
    ]
    for pattern in patterns_to_remove:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)
    return cleaned_text

# Clean the whole text in one pass so the multi-line footer patterns can
# match across line breaks (applied line by line they would never fire),
# then keep only the non-empty lines.
cleaned_text = clean_text(bentham_text_string)
cleaned_lines = [line for line in cleaned_text.split('\n') if line.strip()]

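# Rough before/after check on the cleaning pass; the exact counts will
# vary with the source PDFs.
print(f'{len(bentham_text_string)} characters before cleaning, '
      f'{sum(len(line) for line in cleaned_lines)} after')
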
def split_into_chunks(text, chunk_size=100):
    """
    Split the text into chunks of approximately `chunk_size` words.

    Args:
        text (str): The input text to split.
        chunk_size (int): The desired chunk size in words.

    Returns:
        list of str: A list of text chunks.
    """
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_into_chunks(' '.join(cleaned_lines), 100)

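# Illustrative check: every chunk except possibly the last should hold
# exactly 100 words.
print(len(chunks), 'chunks;', len(chunks[0].split()), 'words in the first')
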
from datasets import Dataset

# Wrap the chunks in a Hugging Face Dataset with a single 'text' column
# and serialize it to disk in Arrow format.
data = {'text': chunks}
new_dataset = Dataset.from_dict(data)
new_dataset.save_to_disk('./bentham_chunked')
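
# Minimal reload check, assuming the save above succeeded: the dataset
# comes back from the same directory via datasets.load_from_disk.
from datasets import load_from_disk

reloaded = load_from_disk('./bentham_chunked')
print(reloaded)                    # Dataset({features: ['text'], num_rows: ...})
print(reloaded[0]['text'][:80])    # first 80 characters of the first chunk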