Spaces:

Copopopopo
/

Golden_ERS

Build error

App Files Files Community

Copopopopo committed on Jan 13, 2025

Commit

dd32e59

verified ·

1 Parent(s): 37e280c

Update HF_processor.py

Browse files

Files changed (1) hide show

HF_processor.py +33 -27

HF_processor.py CHANGED Viewed

@@ -97,38 +97,44 @@ class FMEADataPipeline:
         new_rows = []
         columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
         clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)']  # Columns to clean bullet points
-        # Enhanced regex for bullet points
         bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*'
         for _, row in self.fmea.iterrows():
             cell_value = row[columns[0]]
-            if isinstance(cell_value, str) and '\n' in cell_value:
-                # Split based on the first column, and align splits with the other columns
-                points = cell_value.split('\n')
-                for idx, point in enumerate(points):
-                    new_row = row.copy()
-                    for column in columns:
-                        column_values = row[column].split('\n') if isinstance(row[column], str) else [row[column]]
-                        if idx < len(column_values):
-                            new_value = column_values[idx]
-                            # Remove bullet points if column is in clean_columns
-                            if column in clean_columns:
-                                new_value = re.sub(bullet_pattern, '', new_value).strip()
-                            new_row[column] = new_value
-                        else:
-                            new_row[column] = np.nan  # Fill with NaN if the split is not aligned
-                    new_rows.append(new_row)
             else:
-                # Clean up bullet points for non-split rows in clean_columns
-                for column in clean_columns:
-                    if isinstance(row[column], str):
-                        row[column] = re.sub(bullet_pattern, '', row[column]).strip()
                 new_rows.append(row)
         self.new_fmea = pd.DataFrame(new_rows)
         self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
         return self.new_fmea

         new_rows = []
         columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
         clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)']  # Columns to clean bullet points
+        # Enhanced regex for bullet points or numbered lists
         bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*'
+        split_pattern = r'(?<=\d[)\.\-•])\s+'  # Split after numbers followed by `)` or `.` or `-`
         for _, row in self.fmea.iterrows():
             cell_value = row[columns[0]]
+            if isinstance(cell_value, str):
+                # Handle newline-separated or bullet/numbered lists
+                if '\n' in cell_value or re.search(split_pattern, cell_value):
+                    # Split the input into points
+                    points = re.split(r'\n|(?<=\d[)\.\-•])\s+', cell_value)
+                    for idx, point in enumerate(points):
+                        new_row = row.copy()
+                        for column in columns:
+                            column_values = re.split(r'\n|(?<=\d[)\.\-•])\s+', row[column]) if isinstance(row[column], str) else [row[column]]
+                            if idx < len(column_values):
+                                new_value = column_values[idx]
+                                # Remove bullet points if column is in clean_columns
+                                if column in clean_columns:
+                                    new_value = re.sub(bullet_pattern, '', new_value).strip()
+                                new_row[column] = new_value
+                            else:
+                                new_row[column] = np.nan  # Fill with NaN if the split is not aligned
+                        new_rows.append(new_row)
+                else:
+                    # Clean up bullet points for non-split rows in clean_columns
+                    for column in clean_columns:
+                        if isinstance(row[column], str):
+                            row[column] = re.sub(bullet_pattern, '', row[column]).strip()
+                    new_rows.append(row)
             else:
+                # If the value is not a string, add the row without modification
                 new_rows.append(row)
         self.new_fmea = pd.DataFrame(new_rows)
         self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
         return self.new_fmea