from pdfminer.high_level import extract_text
import glob
import re


def extract_text_from_pdf(pdf_path):
    """Extract the full text of a single PDF with pdfminer.six."""
    text = extract_text(pdf_path)
    return text


# Gather every Bentham PDF in the working directory and extract its text.
# pdfminer's extract_text accepts a file path directly, so the files do
# not need to be opened by hand.
bentham_texts = []
bentham_pdfs = glob.glob('./Bentham*.pdf')
for pdf in bentham_pdfs:
    print(pdf)
    text = extract_text_from_pdf(pdf)
    bentham_texts.append(text)

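# Illustrative sanity check (not part of the original pipeline): confirm
# each PDF yielded a non-trivial amount of text before joining them.
for path, text in zip(bentham_pdfs, bentham_texts):
    print(path, len(text), 'characters')
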
# Join the per-PDF texts and cache them to disk so extraction only has
# to run once.
bentham_text_string = ' '.join(bentham_texts)
with open('bentham_text.txt', 'w') as f:
    f.write(bentham_text_string)

# Reload the cached text; later steps can rerun from here without
# re-extracting the PDFs.
with open('bentham_text.txt', 'r') as f:
    bentham_text_string = f.read()

def clean_text(text):
    """Strip Online Library of Liberty boilerplate and formatting noise."""
    # Remove section markers such as '§ 1.'
    cleaned_text = re.sub(r'§\s*\d+\.', '', text)
    # Remove the multi-line 'PLL v... (generated ...)' page footers,
    # together with the URL and running-header lines around them.
    cleaned_text = re.sub(r'\n*PLL v[0-9.]+ \(generated.*?\)\n+.*?\n+http.*?\n.*?Online Library of Liberty:.*?\n', '', cleaned_text, flags=re.DOTALL)
    cleaned_text = re.sub(r'\n*\s*PLL v[0-9.]+ \(generated.*?\)\s*\n', '', cleaned_text, flags=re.DOTALL)
    # Remove any URLs and running headers that survived the passes above.
    cleaned_text = re.sub(r'https?://\S+', '', cleaned_text)
    cleaned_text = re.sub(r'Online Library of Liberty:.*?\n', '', cleaned_text)
    cleaned_text = re.sub(r'\n\nPLL v[0-9.]+ \(generated.*?\)\n\n.*?\n\nhttp.*?\n', '', cleaned_text, flags=re.DOTALL)
    # Strip control characters, but keep newlines (\x0A) so the
    # line-anchored patterns below still have lines to match against.
    cleaned_text = re.sub(r'[\x00-\x09\x0B-\x1F\x7F-\x9F]', '', cleaned_text)
    # Remove escape sequences that appear literally in the extracted text.
    cleaned_text = re.sub(r'\\[ntr]', '', cleaned_text)
    # Blank lines, bare page numbers, and navigation links.
    patterns_to_remove = [
        r'^\s*$',
        r'^\s*\d+\s*$',
        r'\[Back to Table of Contents\]',
    ]
    for pattern in patterns_to_remove:
        cleaned_text = re.sub(pattern, '', cleaned_text, flags=re.MULTILINE)
    return cleaned_text

# Clean the whole text in one pass so the multi-line footer patterns can
# match across line breaks (applied line by line they would never fire),
# then keep only the non-empty lines.
cleaned_text = clean_text(bentham_text_string)
cleaned_lines = [line for line in cleaned_text.split('\n') if line.strip()]

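# Rough before/after check on the cleaning pass; the exact counts will
# vary with the source PDFs.
print(f'{len(bentham_text_string)} characters before cleaning, '
      f'{sum(len(line) for line in cleaned_lines)} after')
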
def split_into_chunks(text, chunk_size=100):
    """
    Split the text into chunks of approximately `chunk_size` words.

    Args:
        text (str): The input text to split.
        chunk_size (int): The desired chunk size in words.

    Returns:
        list of str: A list of text chunks.
    """
    words = text.split()
    chunks = [' '.join(words[i:i + chunk_size]) for i in range(0, len(words), chunk_size)]
    return chunks

chunks = split_into_chunks(' '.join(cleaned_lines), 100)

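# Illustrative check: every chunk except possibly the last should hold
# exactly 100 words.
print(len(chunks), 'chunks;', len(chunks[0].split()), 'words in the first')
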
from datasets import Dataset

# Wrap the chunks in a Hugging Face Dataset with a single 'text' column
# and serialize it to disk in Arrow format.
data = {'text': chunks}
new_dataset = Dataset.from_dict(data)
new_dataset.save_to_disk('./bentham_chunked')
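
# Minimal reload check, assuming the save above succeeded: the dataset
# comes back from the same directory via datasets.load_from_disk.
from datasets import load_from_disk

reloaded = load_from_disk('./bentham_chunked')
print(reloaded)                    # Dataset({features: ['text'], num_rows: ...})
print(reloaded[0]['text'][:80])    # first 80 characters of the first chunk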