Copopopopo committed on
Commit
af1260e
·
verified ·
1 Parent(s): 3823ff7

Update HF_processor.py

Browse files
Files changed (1) hide show
  1. HF_processor.py +43 -6
HF_processor.py CHANGED
@@ -131,17 +131,15 @@ class FMEADataPipeline:
131
  lambda x: mapping_dict_code.get(x.replace("_secondary", "")) if x else None
132
  )
133
 
134
- # Construct the description column with catalog code source
135
  # Construct the description column with catalog code source
136
  self.fmea[f"{name}_description"] = merged_m2.apply(
137
  lambda x: (
138
- f"*{x.replace('_secondary', '').split(' ; ')[0]} ({catalog_code_dict.get(x.replace('_secondary', ''), 'Unknown')})"
139
  if "_secondary" in x else
140
- f"*{mapping_dict_short_text.get(mapping_dict_code.get(x), x).split(' ; ')[0]}"
141
- if " ; " in mapping_dict_short_text.get(mapping_dict_code.get(x), x) else
142
  mapping_dict_short_text.get(mapping_dict_code.get(x), x)
143
  )
144
  )
 
145
 
146
 
147
  def column_arranger(self):
@@ -179,7 +177,7 @@ class FMEADataPipeline:
179
  bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*' # To clean bullets for specific columns
180
 
181
  for _, row in self.fmea.iterrows():
182
- cell_value = row[columns[0]]
183
 
184
  if isinstance(cell_value, str):
185
  # Split on newline characters (\n)
@@ -206,4 +204,43 @@ class FMEADataPipeline:
206
  # Create a new DataFrame with processed rows
207
  self.new_fmea = pd.DataFrame(new_rows)
208
  self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
209
- return self.new_fmea
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
  lambda x: mapping_dict_code.get(x.replace("_secondary", "")) if x else None
132
  )
133
 
 
134
  # Construct the description column with catalog code source
135
  self.fmea[f"{name}_description"] = merged_m2.apply(
136
  lambda x: (
137
+ f"{x.replace('_secondary', '')} ({catalog_code_dict.get(x.replace('_secondary', ''), 'Unknown')})"
138
  if "_secondary" in x else
 
 
139
  mapping_dict_short_text.get(mapping_dict_code.get(x), x)
140
  )
141
  )
142
+ return self.fmea
143
 
144
 
145
  def column_arranger(self):
 
177
  bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*' # To clean bullets for specific columns
178
 
179
  for _, row in self.fmea.iterrows():
180
+ cell_value = row.loc[columns[0]]
181
 
182
  if isinstance(cell_value, str):
183
  # Split on newline characters (\n)
 
204
  # Create a new DataFrame with processed rows
205
  self.new_fmea = pd.DataFrame(new_rows)
206
  self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
207
+ return self.new_fmea
208
+
209
def process_and_split_excel_2(self):
    """Expand multi-line task cells of ``self.fmea`` into one row per entry.

    For every row of ``self.fmea`` the five task columns ('Proposed Task',
    'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)') are split on
    newline characters, blank pieces are discarded, and a leading bullet
    such as ``1.``, ``2)`` or ``a)`` is stripped from each piece. The row
    is then emitted once per entry of the longest column; shorter columns
    repeat their last entry so that every emitted row is fully populated.
    All other columns are copied unchanged from the source row.

    A cell that is not a string, is an empty string, or contains nothing
    but whitespace/blank lines is carried over as-is as a single entry.
    (Previously a whitespace-only string produced an empty entry list,
    which raised ``IndexError`` during padding or dropped the row.)

    Side effects:
        Stores the result in ``self.new_fmea`` and writes it to
        'processed_fmea.xlsx' (relative path, no index column).

    Returns:
        The expanded DataFrame (the same object as ``self.new_fmea``).
    """
    columns_to_split = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
    # Columns whose entries get their leading bullet removed
    clean_columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
    # Compiled once: a numbered bullet ("1.", "2)", "3-", "4•") or a lettered one ("a)")
    bullet_pattern = re.compile(r'^\s*(?:\d+[\)\.\-•]\s*|[a-zA-Z]\))')

    separated_rows = []

    for _, row in self.fmea.iterrows():
        split_values = []

        for col in columns_to_split:
            cell = row[col]
            values = []
            if isinstance(cell, str) and cell:
                values = [item.strip() for item in cell.split('\n') if item.strip()]
                if col in clean_columns:
                    values = [bullet_pattern.sub('', v).strip() for v in values]
            if not values:
                # Non-string, empty, or whitespace-only cell: keep the value as is
                # so the row always has at least one entry per column.
                values = [cell]
            split_values.append(values)

        # The longest column decides how many rows this source row expands to
        max_length = max(len(values) for values in split_values)

        # Create new rows for each split value; shorter columns repeat their last value
        for i in range(max_length):
            new_row = row.copy()
            for col, values in zip(columns_to_split, split_values):
                new_row[col] = values[i] if i < len(values) else values[-1]
            separated_rows.append(new_row)

    # Create a new DataFrame with processed rows
    self.new_fmea = pd.DataFrame(separated_rows)
    self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
    return self.new_fmea