import pandas as pd
import numpy as np
import json
import re
from fuzzywuzzy import process
class FMEADataPipeline:
    """Pipeline that maps free-text FMEA entries onto SAP catalog codes.

    The pipeline loads catalog-profile tables from JSON payloads, splits
    multi-valued FMEA cells into separate rows, fuzzy-matches the text
    against SAP short texts, and writes the processed sheet to Excel.
    """

    def __init__(self, catalog_profile, fmea, catalog_code, threshold):
        # Constructor inputs.
        self.catalog_profile = catalog_profile  # dict of JSON strings read with orient='split'
        self.fmea = fmea                        # FMEA worksheet as a DataFrame
        self.catalog_code = catalog_code        # SAP catalog profile code used for filtering
        self.threshold = threshold              # minimum fuzzy-match score to accept a hit
        # Catalog tables, populated by read_catalog_profile().
        self.json_cp = None
        self.cp = None
        self.object_part = None
        self.symptom = None
        self.damage = None
        self.cause = None
        # Working state filled in by later pipeline steps.
        self.ERS_filename = None
        self.code_group = None
        self.fmea_code = {}
        self.new_fmea = None
def read_catalog_profile(self):
self.cp = pd.read_json(self.catalog_profile['cp'],orient='split')
self.object_part = pd.read_json(self.catalog_profile['object part'],orient='split')
self.symptom = pd.read_json(self.catalog_profile['symptom'],orient='split')
self.damage = pd.read_json(self.catalog_profile['damage'],orient='split')
self.cause = pd.read_json(self.catalog_profile['cause'],orient='split')
def build_connector(self):
self.code_group = self.cp[self.cp['Catalog profile']==self.catalog_code][['Catalog','Code group']]
self.fmea_code = {'fmea code': ['Component','Failure Mode','Failure Mechanism','Failure Cause'],
'Catalog':['B-Object Part','D-Symptom','C-Damage','5-Cause']}
self.fmea_code = pd.DataFrame(self.fmea_code)
self.code_group = pd.merge(self.code_group,self.fmea_code,how='left',on='Catalog')
self.fmea['Catalog Profile (SAP)'] = self.catalog_code
def split_columns_rows(self):
columns_to_split = ['Failure Mode', 'Failure Mechanism', 'Failure Cause','Failure Effect']
separated_rows = []
# Iterate over each row in the DataFrame
for _, row in self.fmea.iterrows():
# Process each column to split by various numbering formats (e.g., '1.', '1)', '1-', etc.)
split_values = []
for col in columns_to_split:
# Split by numbering patterns and strip whitespace
if isinstance(row[col], str) and row[col]:
values = [item.strip() for item in re.split(r'\d+[\)\.-]\s*|[a-zA-Z]+[\)\.-]\s*', row[col]) if item.strip()]
else:
values = [row[col]] # Keep non-string values as is
split_values.append(values)
# Check the maximum length of split values across columns to split
max_length = max(len(values) for values in split_values)
# Ensure all split columns have equal length by repeating the last value if needed
for i, values in enumerate(split_values):
if len(values) < max_length:
split_values[i] = values + [values[-1]] * (max_length - len(values))
# Create new rows for each split value
for i in range(max_length):
new_row = row.copy()
for col, values in zip(columns_to_split, split_values):
new_row[col] = values[i]
separated_rows.append(new_row)
# rewrite previous fmea
self.fmea = pd.DataFrame(separated_rows)
def bracket_remover(self):
columns = ['Component','Failure Mode', 'Failure Mechanism', 'Failure Cause','Failure Effect']
for col in columns:
if col in self.fmea.columns:
self.fmea[col] = self.fmea[col].str.replace(r"\s*\(.*?\)", "", regex=True).str.strip()
def column_matcher_21Jan(self):
for code, sap in zip(self.fmea_code['fmea code'], [self.object_part, self.symptom, self.damage, self.cause]):
# Find the matching code group for the current FMEA code
matching_code_group = self.code_group[self.code_group['fmea code'] == code]['Code group']
if matching_code_group.empty:
continue # Skip if no matching code group is found
# Get the first matching code group value
matching_code_group_value = matching_code_group.values[0]
# Extract the catalog profile and its group for prioritization
catalog_profile = self.code_group[self.code_group['fmea code'] == code]['Catalog'].values[0]
catalog_group = catalog_profile[1] # Second character of the catalog code
# Filter SAP table for each priority level
# 1. Catalog Profile
profile_sap = sap[sap['Code group'] == catalog_profile]
s_profile = profile_sap['Short text'].tolist()
m_profile = self.fmea[code].apply(lambda x: process.extract(x, s_profile, limit=1))
m2_profile = m_profile.apply(lambda x: ', '.join([i[0] for i in x if i[1] >= self.threshold]))
# 2. Catalog Group (excluding the catalog profile)
group_sap = sap[(sap['Code group'].str[1] == catalog_group) & (sap['Code group'] != catalog_profile)]
s_group = group_sap['Short text'].tolist()
m_group = self.fmea[code].apply(lambda x: process.extract(x, s_group, limit=1))
m2_group = m_group.apply(lambda x: ', '.join([i[0] for i in x if i[1] >= self.threshold]))
# 3. Entire SAP catalog
s_all = sap['Short text'].tolist()
m_all = self.fmea[code].apply(lambda x: process.extract(x, s_all, limit=1))
m2_all = m_all.apply(lambda x: ''.join([i[0] for i in x if i[1] >= self.threshold]))
# Add "_secondary" flag to m2_all values
m2_all_flagged = m2_all.apply(lambda x: f"{x}_secondary" if x else x)
# Merge prioritized matches: Profile > Group > All
merged_m2 = m2_profile.combine(m2_group, lambda x, y: x if x else y)
merged_m2 = merged_m2.combine(m2_all_flagged, lambda x, y: x if x else y)
# Create mapping dictionaries
mapping_dict_code = sap.set_index('Short text')['Code'].to_dict()
mapping_dict_short_text = sap.set_index('Code')['Short text'].to_dict()
catalog_code_dict = sap.set_index('Short text')['Code group'].to_dict()
# Create the new column name based on catalog
name = catalog_profile
# Apply the mapping for the catalog column
self.fmea[name] = merged_m2.apply(
lambda x: mapping_dict_code.get(x.replace("_secondary", "")) if x else None
)
# Construct the description column with catalog code source
self.fmea[f"{name}_description"] = merged_m2.apply(
lambda x: (
f"{x.replace('_secondary', '')} ({catalog_code_dict.get(x.replace('_secondary', ''), 'Unknown')})"
if "_secondary" in x else
mapping_dict_short_text.get(mapping_dict_code.get(x), x)
)
)
return self.fmea
def column_arranger(self):
catalog_profile = self.fmea.pop('Catalog Profile (SAP)')
b_object_part = self.fmea.pop('B-Object Part')
d_symptom = self.fmea.pop('D-Symptom')
c_damage = self.fmea.pop('C-Damage')
cause_5 = self.fmea.pop('5-Cause')
b_object_part_desc = self.fmea.pop('B-Object Part_description')
d_symptom_desc = self.fmea.pop('D-Symptom_description')
c_damage_desc = self.fmea.pop('C-Damage_description')
cause_5_desc = self.fmea.pop('5-Cause_description')
self.fmea.insert(1,catalog_profile.name,catalog_profile)
self.fmea.insert(3,b_object_part.name,b_object_part)
self.fmea.insert(4,b_object_part_desc.name,b_object_part_desc)
self.fmea.insert(6,d_symptom.name,d_symptom)
self.fmea.insert(7,d_symptom_desc.name,d_symptom_desc)
self.fmea.insert(9,c_damage.name,c_damage)
self.fmea.insert(10,c_damage_desc.name,c_damage_desc)
self.fmea.insert(12,cause_5.name,cause_5)
self.fmea.insert(13,cause_5_desc.name,cause_5_desc)
print('Completed column arranger function')
self.fmea.to_excel('processed_fmea.xlsx', index=False)
return self.fmea
def process_and_split_excel(self):
new_rows = []
columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)'] # Columns to clean bullet points
# Regex to remove bullet points or numbering in the clean columns
bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*' # To clean bullets for specific columns
for _, row in self.fmea.iterrows():
cell_value = row.loc[columns[0]]
if isinstance(cell_value, str):
# Split on newline characters (\n)
points = [point.strip() for point in cell_value.split('\n') if point.strip()]
for idx, point in enumerate(points):
new_row = row.copy()
for column in columns:
column_value = row[column]
if isinstance(column_value, str):
# Split column by newline and align them
column_points = [p.strip() for p in column_value.split('\n') if p.strip()]
new_value = column_points[idx] if idx < len(column_points) else np.nan
# Clean bullet points for specific columns
if column in clean_columns:
new_value = re.sub(bullet_pattern, '', new_value).strip() if isinstance(new_value, str) else new_value
new_row[column] = new_value
else:
new_row[column] = np.nan if idx > 0 else column_value
new_rows.append(new_row)
else:
# If the value is not a string, add the row without modification
new_rows.append(row)
# Create a new DataFrame with processed rows
self.new_fmea = pd.DataFrame(new_rows)
self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
return self.new_fmea
def process_and_split_excel_2(self):
columns_to_split = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
clean_columns = ['Proposed Task', 'Task Type','Frequency', 'Action Party', 'TA (Y/N)'] # Columns to clean bullet points
bullet_pattern = r'^\s*(?:\d+[\)\.\-•]\s*|[a-zA-Z]\))' # Regex to clean bullets
separated_rows = []
for _, row in self.fmea.iterrows():
split_values = []
for col in columns_to_split:
if isinstance(row[col], str) and row[col]:
values = [item.strip() for item in row[col].split('\n') if item.strip()]
if col in clean_columns:
values = [re.sub(bullet_pattern, '', v).strip() for v in values]
else:
values = [row[col]] # Keep non-string values as is
split_values.append(values)
# Determine the maximum number of splits across all columns
max_length = max(len(values) for values in split_values)
# Ensure all columns have the same number of values by repeating the last value
for i, values in enumerate(split_values):
if len(values) < max_length:
split_values[i] = values + [values[-1]] * (max_length - len(values))
# Create new rows for each split value
for i in range(max_length):
new_row = row.copy()
for col, values in zip(columns_to_split, split_values):
new_row[col] = values[i]
separated_rows.append(new_row)
# Create a new DataFrame with processed rows
self.new_fmea = pd.DataFrame(separated_rows)
self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
return self.new_fmea