Spaces:

KalHol
/

ai_extraction_review

Sleeping

App Files Files Community

ai_extraction_review / core /excel_parser.py

KalHol

Initialize Release

9e1c256 3 months ago

raw

history blame contribute delete

2.99 kB

	import pandas as pd
	import re
	import openpyxl


	def resolve_sheet_name(excel_path: str, expected_name: str) -> str:
	    """Return the workbook's real sheet name that matches *expected_name*.

	    The comparison ignores letter case and leading/trailing whitespace, so
	    a sheet called " obligations " is found when "Obligations" is requested.
	    The name is returned exactly as stored in the workbook.

	    Args:
	        excel_path: Path of the workbook to inspect.
	        expected_name: Sheet name to look for.

	    Returns:
	        The first sheet name (in workbook order) that matches.

	    Raises:
	        ValueError: If no sheet matches; the message lists the sheets that
	            are available.
	    """
	    wb = openpyxl.load_workbook(excel_path, read_only=True)
	    try:
	        # Copy the names so they stay usable once the workbook is closed.
	        sheet_names = list(wb.sheetnames)
	    finally:
	        # A read-only workbook keeps the underlying file open until it is
	        # closed explicitly; without this the handle leaks on every call.
	        wb.close()

	    expected_norm = expected_name.strip().lower()

	    for sheet in sheet_names:
	        if sheet.strip().lower() == expected_norm:
	            return sheet

	    raise ValueError(
	        f"Sheet '{expected_name}' not found. Available sheets: {sheet_names}"
	    )


	def extract_filtered_pasals_as_expected_format_v2(excel_path, regulation_code):
	    """Extract the article blocks of one regulation from the "Obligations" sheet.

	    Columns P and Q of the sheet are read as "Kewajiban" and "Sanksi" (the
	    first row is skipped). Within each column, every non-empty cell is
	    treated as one line. A line containing the marker ``>>>@<regulation_code>``
	    opens a new block; following lines are appended to it until another
	    ``>>>...@`` marker appears. Each block becomes one output row.

	    Args:
	        excel_path: Path of the workbook to read.
	        regulation_code: Regulation identifier expected after ``>>>@``.
	            The marker is located by substring search, so a code that is a
	            prefix of another code also matches that other code's marker.

	    Returns:
	        A DataFrame with the columns ``articles`` (lower-cased
	        "pasal <number>", or "pasal ?" when the marker line has none),
	        ``descriptions`` (the block's lines joined by newlines) and ``type``
	        ("kewajiban" or "sanksi"). The columns are present even when no
	        block was found.

	    Raises:
	        ValueError: Propagated from ``resolve_sheet_name`` when the workbook
	            has no "Obligations" sheet.
	    """
	    sheet_name = resolve_sheet_name(excel_path, "Obligations")

	    df_excel = pd.read_excel(
	        excel_path,
	        sheet_name=sheet_name,
	        usecols="P:Q",
	        skiprows=1,
	        header=None,
	        engine="openpyxl",
	    )
	    df_excel.columns = ["Kewajiban", "Sanksi"]

	    marker = f">>>@{regulation_code}"
	    # Compiled once instead of once per line. Any amount of whitespace
	    # (including none) is accepted around "@", so every spacing/casing
	    # variant of the marker is normalised to the canonical form.
	    pattern_variasi = re.compile(
	        r">+\s*@\s*" + re.escape(regulation_code),
	        re.IGNORECASE,
	    )

	    extracted_rows = []
	    for tipe, column in (("kewajiban", "Kewajiban"), ("sanksi", "Sanksi")):
	        lines = (str(text) for text in df_excel[column].dropna())
	        # Each column is parsed on its own so an open "kewajiban" block can
	        # never swallow the first unmarked cells of the "sanksi" column.
	        extracted_rows.extend(
	            _collect_article_blocks(lines, tipe, marker, pattern_variasi)
	        )

	    # Explicit columns keep the schema stable when nothing was extracted.
	    return pd.DataFrame(
	        extracted_rows,
	        columns=["articles", "descriptions", "type"],
	    )


	def _collect_article_blocks(lines, tipe, marker, pattern_variasi):
	    """Group the lines of one column into the blocks opened by *marker*."""
	    pasal_pattern = re.compile(r'(Pasal\s+\d+[A-Za-z]*)', re.IGNORECASE)

	    rows = []
	    # current_article is None while no block of the wanted regulation is open.
	    current_article, current_block = None, []

	    def flush():
	        # Store the block collected so far, if there is one.
	        if current_article and current_block:
	            rows.append({
	                "articles": current_article,
	                "descriptions": "\n".join(current_block).strip(),
	                "type": tipe,
	            })

	    for line in lines:
	        # A callable replacement inserts the marker literally; a plain string
	        # would have backslashes in the regulation code read as escapes.
	        line_stripped = pattern_variasi.sub(lambda _match: marker, line.strip())

	        if marker in line_stripped:
	            # Marker of the wanted regulation: close the previous block and
	            # start a new one with this line as its first entry.
	            flush()
	            current_block = [line_stripped]
	            pasal_match = pasal_pattern.search(line_stripped)
	            current_article = (
	                pasal_match.group(1).strip().lower()
	                if pasal_match else "pasal ?"
	            )
	        elif ">>>" in line_stripped and "@" in line_stripped:
	            # Marker of some other regulation: close the block, stop capturing.
	            flush()
	            current_article, current_block = None, []
	        elif current_article is not None:
	            current_block.append(line_stripped)

	    flush()
	    return rows