Spaces:
Build error
Build error
Update HF_processor.py
Browse files- HF_processor.py +43 -6
HF_processor.py
CHANGED
|
@@ -131,17 +131,15 @@ class FMEADataPipeline:
|
|
| 131 |
lambda x: mapping_dict_code.get(x.replace("_secondary", "")) if x else None
|
| 132 |
)
|
| 133 |
|
| 134 |
-
# Construct the description column with catalog code source
|
| 135 |
# Construct the description column with catalog code source
|
| 136 |
self.fmea[f"{name}_description"] = merged_m2.apply(
|
| 137 |
lambda x: (
|
| 138 |
-
f"
|
| 139 |
if "_secondary" in x else
|
| 140 |
-
f"*{mapping_dict_short_text.get(mapping_dict_code.get(x), x).split(' ; ')[0]}"
|
| 141 |
-
if " ; " in mapping_dict_short_text.get(mapping_dict_code.get(x), x) else
|
| 142 |
mapping_dict_short_text.get(mapping_dict_code.get(x), x)
|
| 143 |
)
|
| 144 |
)
|
|
|
|
| 145 |
|
| 146 |
|
| 147 |
def column_arranger(self):
|
|
@@ -179,7 +177,7 @@ class FMEADataPipeline:
|
|
| 179 |
bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*' # To clean bullets for specific columns
|
| 180 |
|
| 181 |
for _, row in self.fmea.iterrows():
|
| 182 |
-
cell_value = row[columns[0]]
|
| 183 |
|
| 184 |
if isinstance(cell_value, str):
|
| 185 |
# Split on newline characters (\n)
|
|
@@ -206,4 +204,43 @@ class FMEADataPipeline:
|
|
| 206 |
# Create a new DataFrame with processed rows
|
| 207 |
self.new_fmea = pd.DataFrame(new_rows)
|
| 208 |
self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
|
| 209 |
-
return self.new_fmea
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
lambda x: mapping_dict_code.get(x.replace("_secondary", "")) if x else None
|
| 132 |
)
|
| 133 |
|
|
|
|
| 134 |
# Construct the description column with catalog code source
|
| 135 |
self.fmea[f"{name}_description"] = merged_m2.apply(
|
| 136 |
lambda x: (
|
| 137 |
+
f"{x.replace('_secondary', '')} ({catalog_code_dict.get(x.replace('_secondary', ''), 'Unknown')})"
|
| 138 |
if "_secondary" in x else
|
|
|
|
|
|
|
| 139 |
mapping_dict_short_text.get(mapping_dict_code.get(x), x)
|
| 140 |
)
|
| 141 |
)
|
| 142 |
+
return self.fmea
|
| 143 |
|
| 144 |
|
| 145 |
def column_arranger(self):
|
|
|
|
| 177 |
bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*' # To clean bullets for specific columns
|
| 178 |
|
| 179 |
for _, row in self.fmea.iterrows():
|
| 180 |
+
cell_value = row.loc[columns[0]]
|
| 181 |
|
| 182 |
if isinstance(cell_value, str):
|
| 183 |
# Split on newline characters (\n)
|
|
|
|
| 204 |
# Create a new DataFrame with processed rows
|
| 205 |
self.new_fmea = pd.DataFrame(new_rows)
|
| 206 |
self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
|
| 207 |
+
return self.new_fmea
|
| 208 |
+
|
| 209 |
+
def process_and_split_excel_2(self):
    """Expand multi-line task cells of ``self.fmea`` into one row per line.

    For every row, the cells in the task columns ('Proposed Task',
    'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)') are split on
    newlines, blank lines are dropped, and a leading bullet such as
    ``1.``, ``2)`` or ``a)`` is stripped from each item. The row is then
    emitted once per item; columns with fewer items than the longest one
    repeat their last item so every emitted row is fully populated. All
    other columns are copied unchanged.

    Cells that are not strings, or that contain no non-blank line (empty
    or whitespace-only strings), are kept as a single unchanged value, so
    such a row is neither dropped nor a cause of an ``IndexError`` while
    padding.

    Side effects:
        Stores the result on ``self.new_fmea`` and writes it to
        'processed_fmea.xlsx' in the current working directory.

    Returns:
        pandas.DataFrame: the expanded frame (also ``self.new_fmea``).

    Raises:
        KeyError: if ``self.fmea`` has rows but lacks one of the task columns.
    """
    columns_to_split = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
    # Columns whose items also get leading bullets removed. Currently the
    # same set as columns_to_split, kept separate so the two can diverge.
    clean_columns = set(columns_to_split)
    # Compiled once here instead of being re-parsed for every cell.
    bullet_re = re.compile(r'^\s*(?:\d+[\)\.\-•]\s*|[a-zA-Z]\))')

    separated_rows = []

    for _, row in self.fmea.iterrows():
        split_values = []

        for col in columns_to_split:
            cell = row[col]
            values = []
            if isinstance(cell, str):
                values = [part.strip() for part in cell.split('\n') if part.strip()]
                if col in clean_columns:
                    values = [bullet_re.sub('', item).strip() for item in values]
            if not values:
                # Non-string, empty, or whitespace-only cell: keep it as a
                # single value so the padding below always has a last item.
                values = [cell]
            split_values.append(values)

        # Number of output rows for this input row.
        max_length = max(len(values) for values in split_values)

        # Pad shorter columns by repeating their last value.
        for values in split_values:
            shortfall = max_length - len(values)
            if shortfall:
                values.extend([values[-1]] * shortfall)

        # Emit one copy of the row per split position.
        for position in range(max_length):
            new_row = row.copy()
            for col, values in zip(columns_to_split, split_values):
                new_row[col] = values[position]
            separated_rows.append(new_row)

    if separated_rows:
        self.new_fmea = pd.DataFrame(separated_rows)
    else:
        # No input rows: keep the original column layout rather than
        # producing a frame with no columns at all.
        self.new_fmea = self.fmea.iloc[0:0].copy()

    self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
    return self.new_fmea
|