import re
from typing import Iterator
def is_page_number(line: str) -> bool:
    """Return True if *line* holds nothing but digits.

    Leading and trailing whitespace (including the trailing newline kept by
    ``readlines()``) is ignored. Blank or whitespace-only lines return False
    because ``str.isdigit()`` is False for the empty string.

    Args:
        line: A single line of text, with or without its line terminator.

    Returns:
        True when the stripped line consists solely of digit characters.
    """
    return line.strip().isdigit()
# Read the raw text; the file handle is only needed for the read itself.
with open("./finn_wake.txt", "r", encoding="utf-8") as file:
    lines = file.readlines()

# Drop lines that contain only digits (page numbers), then rebuild one string.
# Each kept line still carries its own newline, so joining on '' is enough.
filtered_lines = [line for line in lines if not is_page_number(line)]
text = ''.join(filtered_lines)

# Third-party imports used by the dataset-building steps that follow.
from datasets import Dataset
import pandas as pd
def split_paragraph_into_smaller_parts(
    paragraph: str, max_length: int = 100
) -> Iterator[str]:
    """Split a paragraph into smaller parts with a maximum length in words.

    The paragraph is tokenised with ``str.split()``, so any run of whitespace
    separates words and each part is re-joined with single spaces.

    Args:
        paragraph: Text to split. Empty or whitespace-only text yields nothing.
        max_length: Maximum number of words per part; must be at least 1.

    Returns:
        An iterator over the parts, in order. Every part except possibly the
        last contains exactly ``max_length`` words.

    Raises:
        ValueError: If ``max_length`` is less than 1.
    """
    # Validate eagerly: a zero step would only fail later inside range(), and
    # a negative step would silently produce no parts at all.
    if max_length < 1:
        raise ValueError(f"max_length must be at least 1, got {max_length}")

    words = paragraph.split()
    return (
        ' '.join(words[start:start + max_length])
        for start in range(0, len(words), max_length)
    )
# Treat every newline-separated line of the cleaned text as one paragraph.
paragraphs = text.split('\n')

# Break each non-blank paragraph into chunks of at most 100 words.
split_paragraphs = []
for paragraph in paragraphs:
    if paragraph.strip():
        split_paragraphs.extend(
            split_paragraph_into_smaller_parts(paragraph, max_length=100)
        )

# One chunk per row in a single 'text' column.
df = pd.DataFrame(split_paragraphs, columns=['text'])
dataset = Dataset.from_pandas(df)

# Persist both representations: a CSV without the index column, and the
# on-disk dataset directory written by save_to_disk.
df.to_csv('finn_wake.csv', index=False)
dataset.save_to_disk('finn_wake_dataset')