| | import re |
| |
|
| |
|
def clean_corpus(chat_export_file):
    """Turn a WhatsApp chat export into message text for chatterbot training.

    The export is first stripped of per-message metadata, then of the
    lines that are not part of the conversation.

    Args:
        chat_export_file (str): The name of the chat export file

    Returns:
        tuple: The conversation messages left after both cleaning steps
    """
    return remove_non_message_text(remove_chat_metadata(chat_export_file))
| |
|
| |
|
def remove_chat_metadata(chat_export_file):
    """Remove WhatsApp chat metadata.

    WhatsApp chat exports come with metadata about each message:

     date    time    username  message
    ---------------------------------------
    8/26/22, 17:47 - Jane Doe: Message text

    This function removes all the metadata up to the text of each message.
    Text that does not match the metadata pattern is left unchanged.

    Args:
        chat_export_file (str): The name of the chat export file

    Returns:
        tuple: The text of each message in the conversation, one item per
            line of the file (a trailing newline yields a final empty item)
    """
    date_time = r"(\d+\/\d+\/\d+,\s\d+:\d+)"
    dash_whitespace = r"\s-\s"
    username = r"([\w\s]+)"
    metadata_end = r":\s"
    pattern = date_time + dash_whitespace + username + metadata_end

    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale encoding, so names and messages with non-ASCII characters
    # are read the same way on every system.
    with open(chat_export_file, "r", encoding="utf-8") as corpus_file:
        content = corpus_file.read()
    cleaned_corpus = re.sub(pattern, "", content)
    # Keep split("\n") rather than splitlines(): the empty item produced
    # by a trailing newline is part of the returned tuple.
    return tuple(cleaned_corpus.split("\n"))
| |
|
| |
|
def remove_non_message_text(export_text_lines):
    """Drop the lines of a chat export that are not conversation messages.

    The first and the last line are discarded (the standardized WhatsApp
    intro line and the empty line at the end of the file), and so is every
    line that is exactly the placeholder WhatsApp writes in place of a
    media message.

    Args:
        export_text_lines (tuple): All lines from the export file

    Returns:
        tuple: Messages that are a relevant part of the conversation
    """
    media_placeholders = ("<Media omitted>",)

    # Slice off the intro line at the front and the empty line at the back.
    conversation = export_text_lines[1:-1]

    kept = [line for line in conversation if line not in media_placeholders]
    return tuple(kept)
| |
|