Copopopopo committed on
Commit
b86f23a
·
verified ·
1 Parent(s): dd32e59

Fix an issue in the process_and_split_excel function: cells are now split on newline characters (\n) instead of on numbered/bulleted list markers.

Browse files
Files changed (1) hide show
  1. HF_processor.py +25 -35
HF_processor.py CHANGED
@@ -30,7 +30,6 @@ class FMEADataPipeline:
30
  self.damage = pd.read_json(self.catalog_profile['damage'],orient='split')
31
  self.cause = pd.read_json(self.catalog_profile['cause'],orient='split')
32
 
33
-
34
  def build_connector(self):
35
  self.code_group = self.cp[self.cp['Catalog profile']==self.catalog_code][['Catalog','Code group']]
36
  self.fmea_code = {'fmea code': ['Component','Failure Mode','Failure Mechanism','Failure Cause'],
@@ -39,7 +38,6 @@ class FMEADataPipeline:
39
  self.code_group = pd.merge(self.code_group,self.fmea_code,how='left',on='Catalog')
40
  self.fmea['Catalog Profile (SAP)'] = self.catalog_code
41
 
42
-
43
  def column_matcher(self):
44
  for code,sap in zip(self.fmea_code['fmea code'],[self.object_part,self.symptom,self.damage,self.cause]):
45
 
@@ -97,44 +95,36 @@ class FMEADataPipeline:
97
  new_rows = []
98
  columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
99
  clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)'] # Columns to clean bullet points
100
-
101
- # Enhanced regex for bullet points or numbered lists
102
- bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*'
103
- split_pattern = r'(?<=\d[)\.\-•])\s+' # Split after numbers followed by `)` or `.` or `-`
104
-
105
  for _, row in self.fmea.iterrows():
106
- cell_value = row[columns[0]]
107
-
108
  if isinstance(cell_value, str):
109
- # Handle newline-separated or bullet/numbered lists
110
- if '\n' in cell_value or re.search(split_pattern, cell_value):
111
- # Split the input into points
112
- points = re.split(r'\n|(?<=\d[)\.\-•])\s+', cell_value)
113
- for idx, point in enumerate(points):
114
- new_row = row.copy()
115
- for column in columns:
116
- column_values = re.split(r'\n|(?<=\d[)\.\-•])\s+', row[column]) if isinstance(row[column], str) else [row[column]]
117
- if idx < len(column_values):
118
- new_value = column_values[idx]
119
-
120
- # Remove bullet points if column is in clean_columns
121
- if column in clean_columns:
122
- new_value = re.sub(bullet_pattern, '', new_value).strip()
123
-
124
- new_row[column] = new_value
125
- else:
126
- new_row[column] = np.nan # Fill with NaN if the split is not aligned
127
- new_rows.append(new_row)
128
- else:
129
- # Clean up bullet points for non-split rows in clean_columns
130
- for column in clean_columns:
131
- if isinstance(row[column], str):
132
- row[column] = re.sub(bullet_pattern, '', row[column]).strip()
133
- new_rows.append(row)
134
  else:
135
  # If the value is not a string, add the row without modification
136
  new_rows.append(row)
137
-
 
138
  self.new_fmea = pd.DataFrame(new_rows)
139
  self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
140
  return self.new_fmea
 
30
  self.damage = pd.read_json(self.catalog_profile['damage'],orient='split')
31
  self.cause = pd.read_json(self.catalog_profile['cause'],orient='split')
32
 
 
33
  def build_connector(self):
34
  self.code_group = self.cp[self.cp['Catalog profile']==self.catalog_code][['Catalog','Code group']]
35
  self.fmea_code = {'fmea code': ['Component','Failure Mode','Failure Mechanism','Failure Cause'],
 
38
  self.code_group = pd.merge(self.code_group,self.fmea_code,how='left',on='Catalog')
39
  self.fmea['Catalog Profile (SAP)'] = self.catalog_code
40
 
 
41
  def column_matcher(self):
42
  for code,sap in zip(self.fmea_code['fmea code'],[self.object_part,self.symptom,self.damage,self.cause]):
43
 
 
95
  new_rows = []
96
  columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
97
  clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)'] # Columns to clean bullet points
98
+
99
+ # Regex to remove bullet points or numbering in the clean columns
100
+ bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*' # To clean bullets for specific columns
101
+
 
102
  for _, row in self.fmea.iterrows():
103
+ cell_value = row[columns[0]] # 'Proposed Task'
104
+
105
  if isinstance(cell_value, str):
106
+ # Split on newline characters (\n)
107
+ points = [point.strip() for point in cell_value.split('\n') if point.strip()]
108
+ for idx, point in enumerate(points):
109
+ new_row = row.copy()
110
+ for column in columns:
111
+ column_value = row[column]
112
+ if isinstance(column_value, str):
113
+ # Split column by newline and align them
114
+ column_points = [p.strip() for p in column_value.split('\n') if p.strip()]
115
+ new_value = column_points[idx] if idx < len(column_points) else np.nan
116
+ # Clean bullet points for specific columns
117
+ if column in clean_columns:
118
+ new_value = re.sub(bullet_pattern, '', new_value).strip() if isinstance(new_value, str) else new_value
119
+ new_row[column] = new_value
120
+ else:
121
+ new_row[column] = np.nan if idx > 0 else column_value
122
+ new_rows.append(new_row)
 
 
 
 
 
 
 
 
123
  else:
124
  # If the value is not a string, add the row without modification
125
  new_rows.append(row)
126
+
127
+ # Create a new DataFrame with processed rows
128
  self.new_fmea = pd.DataFrame(new_rows)
129
  self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
130
  return self.new_fmea