ai_extraction_review / core /excel_parser.py
KalHol's picture
Initialize Release
9e1c256
import pandas as pd
import re
import openpyxl
def resolve_sheet_name(excel_path: str, expected_name: str) -> str:
wb = openpyxl.load_workbook(excel_path, read_only=True)
expected_norm = expected_name.strip().lower()
for sheet in wb.sheetnames:
if sheet.strip().lower() == expected_norm:
return sheet
raise ValueError(
f"Sheet '{expected_name}' not found. Available sheets: {wb.sheetnames}"
)
def extract_filtered_pasals_as_expected_format_v2(excel_path, regulation_code):
sheet_name = resolve_sheet_name(excel_path, "Obligations")
df_excel = pd.read_excel(
excel_path,
sheet_name=sheet_name,
usecols="P:Q",
skiprows=1,
header=None,
engine="openpyxl"
)
df_excel.columns = ["Kewajiban", "Sanksi"]
combined_data = []
for text in df_excel["Kewajiban"].dropna():
combined_data.append(("kewajiban", str(text)))
for text in df_excel["Sanksi"].dropna():
combined_data.append(("sanksi", str(text)))
extracted_rows = []
current_article, current_block, current_type = None, [], None
capture = False
for tipe, line in combined_data:
line_stripped = line.strip()
pattern_variasi = re.compile(
r">+\s*@\s*(" + re.escape(regulation_code) + r")",
re.IGNORECASE
)
line_stripped = pattern_variasi.sub(
f">>>@{regulation_code}",
line_stripped
)
if f">>>@{regulation_code}" in line_stripped:
if current_article and current_block:
extracted_rows.append({
"articles": current_article,
"descriptions": "\n".join(current_block).strip(),
"type": current_type
})
current_type = tipe
current_block = [line_stripped]
pasal_match = re.search(
r'(Pasal\s+\d+[A-Za-z]*)',
line_stripped,
re.IGNORECASE
)
current_article = (
pasal_match.group(1).strip().lower()
if pasal_match else "pasal ?"
)
capture = True
elif ">>>" in line_stripped and "@" in line_stripped:
if current_article and current_block:
extracted_rows.append({
"articles": current_article,
"descriptions": "\n".join(current_block).strip(),
"type": current_type
})
current_article, current_block, current_type = None, [], None
capture = False
elif capture:
current_block.append(line_stripped)
if current_article and current_block:
extracted_rows.append({
"articles": current_article,
"descriptions": "\n".join(current_block).strip(),
"type": current_type
})
return pd.DataFrame(extracted_rows)