Spaces:

acmc
/

whatsapp-chats-finetuning-formatter

Running

App Files Community

ACMC committed on Sep 24, 2024

Commit

05e7fc2

1 Parent(s): ac08d51

Better error handling

Browse files

Files changed (2) hide show

app.py +25 -21
utils.py +98 -48

app.py CHANGED Viewed

@@ -19,27 +19,31 @@ logger.setLevel(logging.INFO)
 def convert_to_dataset(files, do_spelling_correction, progress, whatsapp_name, datetime_dayfirst, message_line_format):
     modified_dataset = None
     for file in progress.tqdm(files, desc="Processing files"):
-        if modified_dataset is None:
-            # First file
-            modified_dataset = process_chat_file(
-                file,
-                do_spelling_correction=do_spelling_correction,
-                whatsapp_name=whatsapp_name,
-                datetime_dayfirst=datetime_dayfirst,
-                message_line_format=message_line_format,
-            )
-        else:
-            # Concatenate the datasets
-            this_file_dataset = process_chat_file(
-                file,
-                do_spelling_correction=do_spelling_correction,
-                whatsapp_name=whatsapp_name,
-                datetime_dayfirst=datetime_dayfirst,
-                message_line_format=message_line_format,
-            )
-            modified_dataset = datasets.concatenate_datasets(
-                [modified_dataset, this_file_dataset]
-            )
     return modified_dataset

 def convert_to_dataset(files, do_spelling_correction, progress, whatsapp_name, datetime_dayfirst, message_line_format):
     modified_dataset = None
     for file in progress.tqdm(files, desc="Processing files"):
+        try:
+            if modified_dataset is None:
+                # First file
+                modified_dataset = process_chat_file(
+                    file,
+                    do_spelling_correction=do_spelling_correction,
+                    whatsapp_name=whatsapp_name,
+                    datetime_dayfirst=datetime_dayfirst,
+                    message_line_format=message_line_format,
+                )
+            else:
+                # Concatenate the datasets
+                this_file_dataset = process_chat_file(
+                    file,
+                    do_spelling_correction=do_spelling_correction,
+                    whatsapp_name=whatsapp_name,
+                    datetime_dayfirst=datetime_dayfirst,
+                    message_line_format=message_line_format,
+                )
+                modified_dataset = datasets.concatenate_datasets(
+                    [modified_dataset, this_file_dataset]
+                )
+        except Exception as e:
+            logger.error(f"Error processing file {file}: {e}")
+            raise gr.Error(f"Error processing file {file}: {e}")
     return modified_dataset

utils.py CHANGED Viewed

@@ -91,7 +91,7 @@ def spell_check_conversation_spacy(conversation):
     return conversation
-def remove_whatapp_annotations(conversation):
     """
     Removes the following annotations from the messages:
     - <This message was edited>
@@ -238,45 +238,79 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
             logger.exception(example["text"])
             raise e
-    ds = (
-        datasets.load_dataset("text", data_files=[file])["train"]
-        .filter(
             # Has to begin by date, time, contact name, and contain at least a ':' symbol
             lambda x: re.match(
                 r"^\d{1,2}/\d{1,2}/\d{1,2},\s\d{2}:\d{2}\s-\s.+:", x["text"]
             )
         )
-        .map(process_line, remove_columns=["text"])
-    )
-    # Filter out messages that just say '<Media omitted>'
-    ds = ds.filter(lambda x: x["message"] != "<Media omitted>")
-    groups = group_messages(iter(ds))
-    # Generate the dataset
-    conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
-    # Filter out conversations with less than 5 messages
-    conversations_ds = conversations_ds.filter(
-        lambda x: len(x["conversations"]) >= MIN_MESSAGES_THRESHOLD
-    )
-    conversations_ds_without_whatsapp_annotations = conversations_ds.map(
-        remove_whatapp_annotations,
-        num_proc=os.cpu_count() - 1,
-    )
     if do_spelling_correction:
-        spell_checked_conversations_ds = (
-            conversations_ds_without_whatsapp_annotations.map(spell_check_conversation)
-        )
     else:
         spell_checked_conversations_ds = conversations_ds_without_whatsapp_annotations
     if do_reordering:
-        reordered_conversations_ds = spell_checked_conversations_ds.map(
-            swap_messages_if_needed_in_conversation
-        )
     else:
         reordered_conversations_ds = spell_checked_conversations_ds
@@ -287,14 +321,22 @@ def process_chat_file(file, do_spelling_correction, whatsapp_name, datetime_dayf
                 message["contact_name"] = "Other"
         return conversation
-    changed_contact_name_ds = reordered_conversations_ds.map(
-        rewrite_contact_name
-    )  # , num_proc=os.cpu_count() - 1)
-    # Filter out conversations with only one contact
-    changed_contact_name_ds = changed_contact_name_ds.filter(
-        lambda x: len(set([msg["contact_name"] for msg in x["conversations"]])) > 1
-    )
     return changed_contact_name_ds
@@ -381,17 +423,25 @@ def transform_conversations_dataset_into_training_examples(
             flattened_examples[key] = [d[key] for d in processed_examples]
         return flattened_examples
-    processed_examples = conversations_ds.map(
-        process_examples,
-        remove_columns=["conversations"],
-        # num_proc=os.cpu_count() - 1,
-        batched=True,
-    )
-    examples_filtered_by_length = processed_examples.filter(
-        lambda x: all(
-            [len(m["content"]) < MAX_CHARACTERS_PER_MESSAGE for m in x["messages"]]
         )
-    )
     return examples_filtered_by_length

     return conversation
+def remove_whatsapp_annotations(conversation):
     """
     Removes the following annotations from the messages:
     - <This message was edited>
             logger.exception(example["text"])
             raise e
+    try:
+        ds = datasets.load_dataset("text", data_files=[file])["train"]
+    except Exception as e:
+        logger.exception(f"Error while loading file {file}")
+        raise Exception(f"Error while loading file {file}") from e
+    try:
+        ds = ds.filter(
             # Has to begin by date, time, contact name, and contain at least a ':' symbol
             lambda x: re.match(
                 r"^\d{1,2}/\d{1,2}/\d{1,2},\s\d{2}:\d{2}\s-\s.+:", x["text"]
             )
         )
+    except Exception as e:
+        logger.exception(f"Error filtering the lines in file {file} so they match the expected format")
+        raise Exception(f"Error filtering the lines in file {file} so they match the expected format") from e
+    try:
+        ds = ds.map(process_line, remove_columns=["text"])
+    except Exception as e:
+        logger.exception(f"Error mapping the lines in file {file} to the expected format")
+        raise Exception(f"Error mapping the lines in file {file} to the expected format") from e
+    try:
+        # Filter out messages that just say '<Media omitted>'
+        ds = ds.filter(lambda x: x["message"] != "<Media omitted>")
+    except Exception as e:
+        logger.exception(f"Error filtering out messages that say '<Media omitted>' in file {file}")
+        raise Exception(f"Error filtering out messages that say '<Media omitted>' in file {file}") from e
+    try:
+        groups = group_messages(iter(ds))
+        # Generate the dataset
+        conversations_ds = datasets.Dataset.from_dict({"conversations": groups})
+    except Exception as e:
+        logger.exception(f"Error grouping the messages in file {file}")
+        raise Exception(f"Error grouping the messages in file {file}") from e
+    try:
+        # Filter out conversations with less than 5 messages
+        conversations_ds = conversations_ds.filter(
+            lambda x: len(x["conversations"]) >= MIN_MESSAGES_THRESHOLD
+        )
+    except Exception as e:
+        logger.exception(f"Error filtering out conversations with less than {MIN_MESSAGES_THRESHOLD} messages in file {file}")
+        raise Exception(f"Error filtering out conversations with less than {MIN_MESSAGES_THRESHOLD} messages in file {file}") from e
+    try:
+        conversations_ds_without_whatsapp_annotations = conversations_ds.map(
+            remove_whatsapp_annotations,
+            num_proc=os.cpu_count() - 1,
+        )
+    except Exception as e:
+        logger.exception(f"Error removing WhatsApp annotations in file {file}")
+        raise Exception(f"Error removing WhatsApp annotations in file {file}") from e
     if do_spelling_correction:
+        try:
+            spell_checked_conversations_ds = (
+                conversations_ds_without_whatsapp_annotations.map(spell_check_conversation)
+            )
+        except Exception as e:
+            logger.exception(f"Error spell checking the conversations in file {file}")
+            raise Exception(f"Error spell checking the conversations in file {file}") from e
     else:
         spell_checked_conversations_ds = conversations_ds_without_whatsapp_annotations
     if do_reordering:
+        try:
+            reordered_conversations_ds = spell_checked_conversations_ds.map(
+                swap_messages_if_needed_in_conversation
+            )
+        except Exception as e:
+            logger.exception(f"Error reordering the conversations in file {file}")
+            raise Exception(f"Error reordering the conversations in file {file}") from e
     else:
         reordered_conversations_ds = spell_checked_conversations_ds
                 message["contact_name"] = "Other"
         return conversation
+    try:
+        changed_contact_name_ds = reordered_conversations_ds.map(
+            rewrite_contact_name
+        )  # , num_proc=os.cpu_count() - 1)
+    except Exception as e:
+        logger.exception(f"Error changing your other contact's names in file {file}")
+        raise Exception(f"Error changing your other contact's names in file {file}") from e
+    try:
+        # Filter out conversations with only one contact
+        changed_contact_name_ds = changed_contact_name_ds.filter(
+            lambda x: len(set([msg["contact_name"] for msg in x["conversations"]])) > 1
+        )
+    except Exception as e:
+        logger.exception(f"Error filtering out conversations with only one contact in file {file}")
+        raise Exception(f"Error filtering out conversations with only one contact in file {file}") from e
     return changed_contact_name_ds
             flattened_examples[key] = [d[key] for d in processed_examples]
         return flattened_examples
+    try:
+        processed_examples = conversations_ds.map(
+            process_examples,
+            remove_columns=["conversations"],
+            # num_proc=os.cpu_count() - 1,
+            batched=True,
         )
+    except Exception as e:
+        logger.exception("Error transforming the conversations dataset into training examples")
+        raise Exception("Error transforming the conversations dataset into training examples") from e
+    try:
+        examples_filtered_by_length = processed_examples.filter(
+            lambda x: all(
+                [len(m["content"]) < MAX_CHARACTERS_PER_MESSAGE for m in x["messages"]]
+            )
+        )
+    except Exception as e:
+        logger.exception("Error filtering out examples with messages longer than the maximum allowed")
+        raise Exception("Error filtering out examples with messages longer than the maximum allowed") from e
     return examples_filtered_by_length