Spaces:

KalHol
/

ai_extraction_review

Sleeping

File size: 2,989 Bytes

9e1c256

import pandas as pd
import re
import openpyxl


def resolve_sheet_name(excel_path: str, expected_name: str) -> str:
    """Return the workbook's actual sheet name that matches *expected_name*.

    The comparison ignores letter case and leading/trailing whitespace, so a
    sheet stored as ``" obligations "`` is found when asking for
    ``"Obligations"``. The name is returned exactly as stored in the workbook.

    Args:
        excel_path: Path of the Excel workbook to inspect.
        expected_name: Sheet name to look for.

    Returns:
        The first sheet name in the workbook that matches.

    Raises:
        ValueError: If no sheet matches; the message lists the available
            sheet names.
    """
    wb = openpyxl.load_workbook(excel_path, read_only=True)
    try:
        sheet_names = wb.sheetnames
    finally:
        # Read-only workbooks keep the underlying file open until closed
        # explicitly, so release it on every path.
        wb.close()

    expected_norm = expected_name.strip().lower()

    for sheet in sheet_names:
        if sheet.strip().lower() == expected_norm:
            return sheet

    raise ValueError(
        f"Sheet '{expected_name}' not found. Available sheets: {sheet_names}"
    )


def extract_filtered_pasals_as_expected_format_v2(
    excel_path: str, regulation_code: str
) -> pd.DataFrame:
    """Extract the text blocks tagged with *regulation_code* from a workbook.

    Columns P and Q of the "Obligations" sheet (first row skipped, no header
    row) are read as "Kewajiban" and "Sanksi". Every non-empty cell is
    treated as one text entry. An entry containing ``>>>@<regulation_code>``
    (variants with a different number of ``>`` or extra spaces around ``@``,
    in any letter case, are normalised first) starts a new block; entries
    that follow are appended to it until another ``>>>...@`` marker appears.
    A marker that is not for *regulation_code* closes the current block and
    suspends capture.

    Args:
        excel_path: Path of the Excel workbook.
        regulation_code: Regulation identifier expected after ``>>>@``.

    Returns:
        DataFrame with the columns ``articles`` (lower-cased "pasal <n>"
        taken from the marker entry, or ``"pasal ?"`` when none is found),
        ``descriptions`` (block entries joined with newlines) and ``type``
        (``"kewajiban"`` or ``"sanksi"``, according to the column of the
        marker entry). The columns are present even when nothing matched.

    Raises:
        ValueError: If the workbook has no "Obligations" sheet (raised by
            ``resolve_sheet_name``).
    """
    sheet_name = resolve_sheet_name(excel_path, "Obligations")

    df_excel = pd.read_excel(
        excel_path,
        sheet_name=sheet_name,
        usecols="P:Q",
        skiprows=1,
        header=None,
        engine="openpyxl",
    )
    df_excel.columns = ["Kewajiban", "Sanksi"]

    # All "Kewajiban" cells come first, followed by all "Sanksi" cells.
    # NOTE(review): a block still open at the end of the first column keeps
    # absorbing unmarked cells from the second column -- confirm that this
    # carry-over is intended.
    combined_data = []
    for kind, column in (("kewajiban", "Kewajiban"), ("sanksi", "Sanksi")):
        combined_data.extend(
            (kind, str(cell)) for cell in df_excel[column].dropna()
        )

    # Loop-invariant values, built once instead of once per entry.
    marker = f">>>@{regulation_code}"
    marker_variants = re.compile(
        r">+\s*@\s*(" + re.escape(regulation_code) + r")",
        re.IGNORECASE,
    )
    article_pattern = re.compile(r'(Pasal\s+\d+[A-Za-z]*)', re.IGNORECASE)

    extracted_rows = []

    def store_block(article, block, kind):
        """Append the block to the result when it is non-empty."""
        if article and block:
            extracted_rows.append({
                "articles": article,
                "descriptions": "\n".join(block).strip(),
                "type": kind
            })

    current_article, current_block, current_type = None, [], None

    for tipe, line in combined_data:
        # A callable replacement inserts the marker literally; a template
        # string would interpret backslashes in regulation_code as escapes.
        line_stripped = marker_variants.sub(
            lambda _match: marker, line.strip()
        )

        if marker in line_stripped:
            # Marker for the requested regulation: close the previous block
            # and start a new one with this entry as its first line.
            store_block(current_article, current_block, current_type)

            pasal_match = article_pattern.search(line_stripped)
            current_article = (
                pasal_match.group(1).strip().lower()
                if pasal_match else "pasal ?"
            )
            current_block = [line_stripped]
            current_type = tipe

        elif ">>>" in line_stripped and "@" in line_stripped:
            # Marker that is not for the requested regulation: close the
            # current block and stop capturing.
            store_block(current_article, current_block, current_type)
            current_article, current_block, current_type = None, [], None

        elif current_article is not None:
            # Continuation of the block that is currently being captured.
            current_block.append(line_stripped)

    store_block(current_article, current_block, current_type)

    # Explicit columns keep the schema stable when no block was found.
    return pd.DataFrame(
        extracted_rows, columns=["articles", "descriptions", "type"]
    )