Spaces:

Copopopopo
/

Golden_ERS

Build error

App Files Files Community

Copopopopo committed on Jan 15, 2025

Commit

b86f23a

verified ·

1 Parent(s): dd32e59

Fix an issue with the process_and_split_excel function: cell values are now split on newline characters (\n) instead of on numbered/bulleted list markers.

Browse files

Files changed (1) hide show

HF_processor.py +25 -35

HF_processor.py CHANGED Viewed

@@ -30,7 +30,6 @@ class FMEADataPipeline:
         self.damage = pd.read_json(self.catalog_profile['damage'],orient='split')
         self.cause = pd.read_json(self.catalog_profile['cause'],orient='split')
     def build_connector(self):
         self.code_group = self.cp[self.cp['Catalog profile']==self.catalog_code][['Catalog','Code group']]
         self.fmea_code = {'fmea code': ['Component','Failure Mode','Failure Mechanism','Failure Cause'],
@@ -39,7 +38,6 @@ class FMEADataPipeline:
         self.code_group = pd.merge(self.code_group,self.fmea_code,how='left',on='Catalog')
         self.fmea['Catalog Profile (SAP)'] = self.catalog_code
     def column_matcher(self):
         for code,sap in zip(self.fmea_code['fmea code'],[self.object_part,self.symptom,self.damage,self.cause]):
@@ -97,44 +95,36 @@ class FMEADataPipeline:
         new_rows = []
         columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
         clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)']  # Columns to clean bullet points
-        # Enhanced regex for bullet points or numbered lists
-        bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*'
-        split_pattern = r'(?<=\d[)\.\-•])\s+'  # Split after numbers followed by `)` or `.` or `-`
         for _, row in self.fmea.iterrows():
-            cell_value = row[columns[0]]
             if isinstance(cell_value, str):
-                # Handle newline-separated or bullet/numbered lists
-                if '\n' in cell_value or re.search(split_pattern, cell_value):
-                    # Split the input into points
-                    points = re.split(r'\n|(?<=\d[)\.\-•])\s+', cell_value)
-                    for idx, point in enumerate(points):
-                        new_row = row.copy()
-                        for column in columns:
-                            column_values = re.split(r'\n|(?<=\d[)\.\-•])\s+', row[column]) if isinstance(row[column], str) else [row[column]]
-                            if idx < len(column_values):
-                                new_value = column_values[idx]
-                                # Remove bullet points if column is in clean_columns
-                                if column in clean_columns:
-                                    new_value = re.sub(bullet_pattern, '', new_value).strip()
-                                new_row[column] = new_value
-                            else:
-                                new_row[column] = np.nan  # Fill with NaN if the split is not aligned
-                        new_rows.append(new_row)
-                else:
-                    # Clean up bullet points for non-split rows in clean_columns
-                    for column in clean_columns:
-                        if isinstance(row[column], str):
-                            row[column] = re.sub(bullet_pattern, '', row[column]).strip()
-                    new_rows.append(row)
             else:
                 # If the value is not a string, add the row without modification
                 new_rows.append(row)
         self.new_fmea = pd.DataFrame(new_rows)
         self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
         return self.new_fmea

         self.damage = pd.read_json(self.catalog_profile['damage'],orient='split')
         self.cause = pd.read_json(self.catalog_profile['cause'],orient='split')
     def build_connector(self):
         self.code_group = self.cp[self.cp['Catalog profile']==self.catalog_code][['Catalog','Code group']]
         self.fmea_code = {'fmea code': ['Component','Failure Mode','Failure Mechanism','Failure Cause'],
         self.code_group = pd.merge(self.code_group,self.fmea_code,how='left',on='Catalog')
         self.fmea['Catalog Profile (SAP)'] = self.catalog_code
     def column_matcher(self):
         for code,sap in zip(self.fmea_code['fmea code'],[self.object_part,self.symptom,self.damage,self.cause]):
         new_rows = []
         columns = ['Proposed Task', 'Task Type', 'Frequency', 'Action Party', 'TA (Y/N)']
         clean_columns = ['Frequency', 'Action Party', 'TA (Y/N)']  # Columns to clean bullet points
+        # Regex to remove bullet points or numbering in the clean columns
+        bullet_pattern = r'^\s*[\da-zA-Z]+[)\.\-•]?\s*'  # To clean bullets for specific columns
         for _, row in self.fmea.iterrows():
+            cell_value = row[columns[0]]  # 'Proposed Task'
             if isinstance(cell_value, str):
+                # Split on newline characters (\n)
+                points = [point.strip() for point in cell_value.split('\n') if point.strip()]
+                for idx, point in enumerate(points):
+                    new_row = row.copy()
+                    for column in columns:
+                        column_value = row[column]
+                        if isinstance(column_value, str):
+                            # Split column by newline and align them
+                            column_points = [p.strip() for p in column_value.split('\n') if p.strip()]
+                            new_value = column_points[idx] if idx < len(column_points) else np.nan
+                            # Clean bullet points for specific columns
+                            if column in clean_columns:
+                                new_value = re.sub(bullet_pattern, '', new_value).strip() if isinstance(new_value, str) else new_value
+                            new_row[column] = new_value
+                        else:
+                            new_row[column] = np.nan if idx > 0 else column_value
+                    new_rows.append(new_row)
             else:
                 # If the value is not a string, add the row without modification
                 new_rows.append(row)
+        # Create a new DataFrame with processed rows
         self.new_fmea = pd.DataFrame(new_rows)
         self.new_fmea.to_excel('processed_fmea.xlsx', index=False)
         return self.new_fmea