Copopopopo committed on
Commit
dd32e59
·
verified ·
1 Parent(s): 37e280c

Update HF_processor.py

Browse files
Files changed (1) hide show
  1. HF_processor.py +33 -27
HF_processor.py CHANGED
@@ -97,38 +97,44 @@ class FMEADataPipeline:
97
  new_rows = []
98
  columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
99
  clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)'] # Columns to clean bullet points
100
-
101
- # Enhanced regex for bullet points
102
  bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*'
103
-
 
104
  for _, row in self.fmea.iterrows():
105
  cell_value = row[columns[0]]
106
-
107
- if isinstance(cell_value, str) and '\n' in cell_value:
108
- # Split based on the first column, and align splits with the other columns
109
- points = cell_value.split('\n')
110
- for idx, point in enumerate(points):
111
- new_row = row.copy()
112
- for column in columns:
113
- column_values = row[column].split('\n') if isinstance(row[column], str) else [row[column]]
114
- if idx < len(column_values):
115
- new_value = column_values[idx]
116
-
117
- # Remove bullet points if column is in clean_columns
118
- if column in clean_columns:
119
- new_value = re.sub(bullet_pattern, '', new_value).strip()
120
-
121
- new_row[column] = new_value
122
- else:
123
- new_row[column] = np.nan # Fill with NaN if the split is not aligned
124
- new_rows.append(new_row)
 
 
 
 
 
 
 
 
125
  else:
126
- # Clean up bullet points for non-split rows in clean_columns
127
- for column in clean_columns:
128
- if isinstance(row[column], str):
129
- row[column] = re.sub(bullet_pattern, '', row[column]).strip()
130
  new_rows.append(row)
131
-
132
  self.new_fmea = pd.DataFrame(new_rows)
133
  self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
134
  return self.new_fmea
 
97
  new_rows = []
98
  columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
99
  clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)'] # Columns to clean bullet points
100
+
101
+ # Enhanced regex for bullet points or numbered lists
102
  bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*'
103
+ split_pattern = r'(?<=\d[)\.\-•])\s+' # Split on whitespace that follows a digit and one of `)`, `.`, `-` or `•`
104
+
105
  for _, row in self.fmea.iterrows():
106
  cell_value = row[columns[0]]
107
+
108
+ if isinstance(cell_value, str):
109
+ # Handle newline-separated or bullet/numbered lists
110
+ if '\n' in cell_value or re.search(split_pattern, cell_value):
111
+ # Split the input into points
112
+ points = re.split(r'\n|(?<=\d[)\.\-•])\s+', cell_value)
113
+ for idx, point in enumerate(points):
114
+ new_row = row.copy()
115
+ for column in columns:
116
+ column_values = re.split(r'\n|(?<=\d[)\.\-•])\s+', row[column]) if isinstance(row[column], str) else [row[column]]
117
+ if idx < len(column_values):
118
+ new_value = column_values[idx]
119
+
120
+ # Remove bullet points if column is in clean_columns
121
+ if column in clean_columns:
122
+ new_value = re.sub(bullet_pattern, '', new_value).strip()
123
+
124
+ new_row[column] = new_value
125
+ else:
126
+ new_row[column] = np.nan # Fill with NaN if the split is not aligned
127
+ new_rows.append(new_row)
128
+ else:
129
+ # Clean up bullet points for non-split rows in clean_columns
130
+ for column in clean_columns:
131
+ if isinstance(row[column], str):
132
+ row[column] = re.sub(bullet_pattern, '', row[column]).strip()
133
+ new_rows.append(row)
134
  else:
135
+ # If the value is not a string, add the row without modification
 
 
 
136
  new_rows.append(row)
137
+
138
  self.new_fmea = pd.DataFrame(new_rows)
139
  self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
140
  return self.new_fmea