import pandas as pd import re import openpyxl def resolve_sheet_name(excel_path: str, expected_name: str) -> str: wb = openpyxl.load_workbook(excel_path, read_only=True) expected_norm = expected_name.strip().lower() for sheet in wb.sheetnames: if sheet.strip().lower() == expected_norm: return sheet raise ValueError( f"Sheet '{expected_name}' not found. Available sheets: {wb.sheetnames}" ) def extract_filtered_pasals_as_expected_format_v2(excel_path, regulation_code): sheet_name = resolve_sheet_name(excel_path, "Obligations") df_excel = pd.read_excel( excel_path, sheet_name=sheet_name, usecols="P:Q", skiprows=1, header=None, engine="openpyxl" ) df_excel.columns = ["Kewajiban", "Sanksi"] combined_data = [] for text in df_excel["Kewajiban"].dropna(): combined_data.append(("kewajiban", str(text))) for text in df_excel["Sanksi"].dropna(): combined_data.append(("sanksi", str(text))) extracted_rows = [] current_article, current_block, current_type = None, [], None capture = False for tipe, line in combined_data: line_stripped = line.strip() pattern_variasi = re.compile( r">+\s*@\s*(" + re.escape(regulation_code) + r")", re.IGNORECASE ) line_stripped = pattern_variasi.sub( f">>>@{regulation_code}", line_stripped ) if f">>>@{regulation_code}" in line_stripped: if current_article and current_block: extracted_rows.append({ "articles": current_article, "descriptions": "\n".join(current_block).strip(), "type": current_type }) current_type = tipe current_block = [line_stripped] pasal_match = re.search( r'(Pasal\s+\d+[A-Za-z]*)', line_stripped, re.IGNORECASE ) current_article = ( pasal_match.group(1).strip().lower() if pasal_match else "pasal ?" ) capture = True elif ">>>" in line_stripped and "@" in line_stripped: if current_article and current_block: extracted_rows.append({ "articles": current_article, "descriptions": "\n".join(current_block).strip(), "type": current_type }) current_article, current_block, current_type = None, [], None capture = False elif capture: current_block.append(line_stripped) if current_article and current_block: extracted_rows.append({ "articles": current_article, "descriptions": "\n".join(current_block).strip(), "type": current_type }) return pd.DataFrame(extracted_rows)