Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import re | |
| import openpyxl | |
| def resolve_sheet_name(excel_path: str, expected_name: str) -> str: | |
| wb = openpyxl.load_workbook(excel_path, read_only=True) | |
| expected_norm = expected_name.strip().lower() | |
| for sheet in wb.sheetnames: | |
| if sheet.strip().lower() == expected_norm: | |
| return sheet | |
| raise ValueError( | |
| f"Sheet '{expected_name}' not found. Available sheets: {wb.sheetnames}" | |
| ) | |
| def extract_filtered_pasals_as_expected_format_v2(excel_path, regulation_code): | |
| sheet_name = resolve_sheet_name(excel_path, "Obligations") | |
| df_excel = pd.read_excel( | |
| excel_path, | |
| sheet_name=sheet_name, | |
| usecols="P:Q", | |
| skiprows=1, | |
| header=None, | |
| engine="openpyxl" | |
| ) | |
| df_excel.columns = ["Kewajiban", "Sanksi"] | |
| combined_data = [] | |
| for text in df_excel["Kewajiban"].dropna(): | |
| combined_data.append(("kewajiban", str(text))) | |
| for text in df_excel["Sanksi"].dropna(): | |
| combined_data.append(("sanksi", str(text))) | |
| extracted_rows = [] | |
| current_article, current_block, current_type = None, [], None | |
| capture = False | |
| for tipe, line in combined_data: | |
| line_stripped = line.strip() | |
| pattern_variasi = re.compile( | |
| r">+\s*@\s*(" + re.escape(regulation_code) + r")", | |
| re.IGNORECASE | |
| ) | |
| line_stripped = pattern_variasi.sub( | |
| f">>>@{regulation_code}", | |
| line_stripped | |
| ) | |
| if f">>>@{regulation_code}" in line_stripped: | |
| if current_article and current_block: | |
| extracted_rows.append({ | |
| "articles": current_article, | |
| "descriptions": "\n".join(current_block).strip(), | |
| "type": current_type | |
| }) | |
| current_type = tipe | |
| current_block = [line_stripped] | |
| pasal_match = re.search( | |
| r'(Pasal\s+\d+[A-Za-z]*)', | |
| line_stripped, | |
| re.IGNORECASE | |
| ) | |
| current_article = ( | |
| pasal_match.group(1).strip().lower() | |
| if pasal_match else "pasal ?" | |
| ) | |
| capture = True | |
| elif ">>>" in line_stripped and "@" in line_stripped: | |
| if current_article and current_block: | |
| extracted_rows.append({ | |
| "articles": current_article, | |
| "descriptions": "\n".join(current_block).strip(), | |
| "type": current_type | |
| }) | |
| current_article, current_block, current_type = None, [], None | |
| capture = False | |
| elif capture: | |
| current_block.append(line_stripped) | |
| if current_article and current_block: | |
| extracted_rows.append({ | |
| "articles": current_article, | |
| "descriptions": "\n".join(current_block).strip(), | |
| "type": current_type | |
| }) | |
| return pd.DataFrame(extracted_rows) | |