ERP_Preprocessor / processors /rental_processor.py
hariqueen's picture
Update processors/rental_processor.py
6514f8b verified
import pandas as pd
from typing import Dict, List, Any, Tuple
from mappers import mapping_utils
def load_and_preprocess_data(input_file: str, config: Dict[str, Any], mapping_dict: Dict[str, Dict[str, str]]) -> Tuple[pd.DataFrame, pd.DataFrame]:
"""
๋ฐ์ดํ„ฐ ๋กœ๋“œ ๋ฐ ์ „์ฒ˜๋ฆฌ
Args:
input_file: ์ž…๋ ฅ ํŒŒ์ผ ๊ฒฝ๋กœ
config: ๋ Œํƒˆ์‚ฌ ์„ค์ • ์ •๋ณด
mapping_dict: ๋งคํ•‘ ๋”•์…”๋„ˆ๋ฆฌ
Returns:
์ „์ฒ˜๋ฆฌ๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„, ํ•„ํ„ฐ๋ง๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
"""
# CSV ํŒŒ์ผ ๋กœ๋“œ - ๋‹ค์–‘ํ•œ ์ธ์ฝ”๋”ฉ ์‹œ๋„
print(f"'{input_file}' ํŒŒ์ผ ๋กœ๋”ฉ ์ค‘...")
try:
rental_df = pd.read_csv(input_file, encoding='utf-8')
except UnicodeDecodeError:
try:
# UTF-8 ์‹คํŒจ ์‹œ CP949 ์‹œ๋„
rental_df = pd.read_csv(input_file, encoding='cp949')
print("CP949 ์ธ์ฝ”๋”ฉ์œผ๋กœ ํŒŒ์ผ ๋กœ๋“œ ์„ฑ๊ณต")
except UnicodeDecodeError:
try:
# EUC-KR ์‹œ๋„
rental_df = pd.read_csv(input_file, encoding='euc-kr')
print("EUC-KR ์ธ์ฝ”๋”ฉ์œผ๋กœ ํŒŒ์ผ ๋กœ๋“œ ์„ฑ๊ณต")
except Exception as e:
print(f"ํŒŒ์ผ ๋กœ๋“œ ์‹คํŒจ: {e}")
raise
print(f"๋กœ๋”ฉ ์™„๋ฃŒ: {len(rental_df)}๊ฐœ ํ–‰ ๋ฐœ๊ฒฌ")
# ์ปฌ๋Ÿผ๋ช… ์–‘์ชฝ ๊ณต๋ฐฑ ์ œ๊ฑฐ (๋” ์—„๊ฒฉํ•œ ์ฒ˜๋ฆฌ)
original_columns = rental_df.columns.tolist()
print("์›๋ณธ ์ปฌ๋Ÿผ๋ช…:")
for col in original_columns:
print(f"- '{col}'")
# ์ปฌ๋Ÿผ๋ช…์—์„œ ๊ณต๋ฐฑ ์ œ๊ฑฐ ๋ฐ ์ฒ˜๋ฆฌ
rental_df.columns = [col.strip() for col in rental_df.columns]
# ์ฒ˜๋ฆฌ๋œ ์ปฌ๋Ÿผ๋ช… ์ถœ๋ ฅ
processed_columns = rental_df.columns.tolist()
print("์ฒ˜๋ฆฌ ํ›„ ์ปฌ๋Ÿผ๋ช…:")
for i, col in enumerate(processed_columns):
orig = original_columns[i] if i < len(original_columns) else "?"
print(f"- '{orig}' -> '{col}'")
# ์ปฌ๋Ÿผ๋ช… ์ค‘๋ณต ์ฒดํฌ ๋ฐ ์ฒ˜๋ฆฌ
if len(set(rental_df.columns)) != len(rental_df.columns):
print("๊ฒฝ๊ณ : ๊ณต๋ฐฑ ์ œ๊ฑฐ ํ›„ ์ค‘๋ณต๋œ ์ปฌ๋Ÿผ๋ช…์ด ์žˆ์Šต๋‹ˆ๋‹ค.")
duplicate_count = {}
new_columns = []
for col in rental_df.columns:
if col in duplicate_count:
duplicate_count[col] += 1
new_col = f"{col}_{duplicate_count[col]}"
new_columns.append(new_col)
print(f" ์ค‘๋ณต ์ปฌ๋Ÿผ ์ฒ˜๋ฆฌ: '{col}' -> '{new_col}'")
else:
duplicate_count[col] = 0
new_columns.append(col)
rental_df.columns = new_columns
# ํ•„์š”ํ•œ ํ•„๋“œ ํ™•์ธ ๋ฐ ์กฐ์ •
# ํ•„์š”ํ•œ ์ปฌ๋Ÿผ์ด ์žˆ๋Š”์ง€ ํ™•์ธ
column_exists = {}
required_columns = ["๋ชจ๋ธ๋ช…", "์˜์—…๋ถ„๋ฅ˜", "๊ด€๋ฆฌ๋ถ€์„œ", "๊ฑฐ๋ž˜์ฒ˜๋ช…", "๊ด€๋ฆฌ์ง€์ "]
for col in required_columns:
if col in rental_df.columns:
column_exists[col] = True
else:
column_exists[col] = False
print(f"๊ฒฝ๊ณ : '{col}' ์ปฌ๋Ÿผ์ด ํŒŒ์ผ์— ์—†์Šต๋‹ˆ๋‹ค.")
# ๊ธˆ์•ก ํ•„๋“œ ์ฐพ๊ธฐ - ์›”๋ณ„ ์ž๋™ ์ธ์‹ ํŒจํ„ด
amount_field = None
# 1. ๋จผ์ € config์— ์„ค์ •๋œ ํ•„๋“œ ์‹œ๋„ (์•ž๋’ค ๊ณต๋ฐฑ ์ œ๊ฑฐ ํ›„ ๋น„๊ต)
clean_amount_field = config['amount_field'].strip()
for col in rental_df.columns:
if col.strip() == clean_amount_field:
amount_field = col
print(f"๊ธˆ์•ก ํ•„๋“œ๋กœ '{amount_field}'๋ฅผ ์„ค์ •๊ฐ’์—์„œ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.")
break
if not amount_field:
# 2. 'N์›”๋ Œํƒˆ๋ฃŒ' ํŒจํ„ด ์ฐพ๊ธฐ - ๊ณต๋ฐฑ ๊ณ ๋ ค
import re
month_pattern = re.compile(r'^\s*(?:[0-9]{1,2})์›”๋ Œํƒˆ๋ฃŒ\s*$')
for col in rental_df.columns:
if month_pattern.match(col):
amount_field = col
print(f"๊ธˆ์•ก ํ•„๋“œ๋กœ '{amount_field}'๋ฅผ ์ž๋™ ์ธ์‹ํ–ˆ์Šต๋‹ˆ๋‹ค.")
break
# 3. ๋ Œํƒˆ๋ฃŒ ํฌํ•จ ํ•„๋“œ ์ฐพ๊ธฐ
if not amount_field:
for col in rental_df.columns:
if '๋ Œํƒˆ๋ฃŒ' in col:
amount_field = col
print(f"๊ธˆ์•ก ํ•„๋“œ๋กœ '{amount_field}'๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")
break
if not amount_field:
# 4. ์ปฌ๋Ÿผ๋ช…์— '์›'์ด๋‚˜ '๏ฟฆ' ๋˜๋Š” 'โ‚ฉ'๊ฐ€ ํฌํ•จ๋œ ๊ฒƒ์„ amount_field๋กœ ์‚ฌ์šฉ
for col in rental_df.columns:
if '์›' in col or '๏ฟฆ' in col or 'โ‚ฉ' in col:
amount_field = col
print(f"๊ธˆ์•ก ํ•„๋“œ๋กœ '{amount_field}'๋ฅผ ์‚ฌ์šฉํ•ฉ๋‹ˆ๋‹ค.")
break
# ๊ธˆ์•ก ํ•„๋“œ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์œผ๋ฉด ์˜ค๋ฅ˜ ๋ฐœ์ƒ
if not amount_field:
raise ValueError("๊ธˆ์•ก ํ•„๋“œ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ํŒŒ์ผ ํ˜•์‹์„ ํ™•์ธํ•ด์ฃผ์„ธ์š”.")
# ๊ธˆ์•ก ํ•„๋“œ ํ™•์ธ ์ถœ๋ ฅ
print(f"์‚ฌ์šฉํ•  ๊ธˆ์•ก ํ•„๋“œ: '{amount_field}'")
print(f"๊ธˆ์•ก ํ•„๋“œ ์ƒ˜ํ”Œ ๊ฐ’: {rental_df[amount_field].head().tolist()}")
# ํŒ€ ํ•„๋“œ ์ฐพ๊ธฐ - ์›”๋ณ„ ์ž๋™ ์ธ์‹ ํŒจํ„ด
team_fields = []
# 1. ๋จผ์ € config์— ์„ค์ •๋œ ํ•„๋“œ ์‹œ๋„
configured_team_fields = config.get('team_fields', [])
if isinstance(configured_team_fields, str):
configured_team_fields = [configured_team_fields]
for field in configured_team_fields:
clean_field = field.strip()
for col in rental_df.columns:
if col.strip() == clean_field:
team_fields.append(col)
print(f"ํŒ€ ํ•„๋“œ๋กœ '{col}'๋ฅผ ์„ค์ •๊ฐ’์—์„œ ์ฐพ์•˜์Šต๋‹ˆ๋‹ค.")
break
if not team_fields:
# 2. '[0-9]์›” ๋ณ€๊ฒฝPJT' ํŒจํ„ด๋งŒ ์ฐพ๊ธฐ - ๊ณต๋ฐฑ ํ—ˆ์šฉ
import re
# ๊ณต๋ฐฑ ํ—ˆ์šฉํ•˜๊ณ  '๋ณ€๊ฒฝPJT'๋งŒ ์ฐพ๋Š” ํŒจํ„ด
month_pjt_pattern = re.compile(r'^\s*(?:[0-9]{1,2})์›”\s*๋ณ€๊ฒฝPJT\s*$')
for col in rental_df.columns:
if month_pjt_pattern.match(col):
team_fields.append(col)
print(f"ํŒ€ ํ•„๋“œ๋กœ '{col}'๋ฅผ ์ž๋™ ์ธ์‹ํ–ˆ์Šต๋‹ˆ๋‹ค (๋ณ€๊ฒฝPJT ํŒจํ„ด).")
# ํŒ€ ํ•„๋“œ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Œ - ์˜ค๋ฅ˜ ๋ฐœ์ƒ
if not team_fields:
raise ValueError("ํŒ€ ์ •๋ณด ํ•„๋“œ๋ฅผ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค. ํŒŒ์ผ ํ˜•์‹์„ ํ™•์ธํ•ด์ฃผ์„ธ์š”.")
# ์‚ฌ์šฉ ๊ฐ€๋Šฅํ•œ ์ปฌ๋Ÿผ๋งŒ ์„ ํƒ
available_columns = []
for col in required_columns:
if column_exists.get(col, False):
available_columns.append(col)
if amount_field:
available_columns.append(amount_field)
available_columns.extend(team_fields)
# ์ค‘๋ณต ์ œ๊ฑฐ
available_columns = list(dict.fromkeys(available_columns))
print(f"์‚ฌ์šฉํ•  ์ปฌ๋Ÿผ: {available_columns}")
# ํ•„์š”ํ•œ ํ•„๋“œ๋งŒ ์„ ํƒ (์กด์žฌํ•˜๋Š” ์ปฌ๋Ÿผ๋งŒ)
df = rental_df[available_columns].copy()
# ๊ธˆ์•ก ํ•„๋“œ ์ฒ˜๋ฆฌ - ๊ฐ„๋‹จํ•œ ๋ฐฉ๋ฒ•์œผ๋กœ ์ˆซ์ž๋งŒ ์ถ”์ถœ
print(f"๊ธˆ์•ก ํ•„๋“œ '{amount_field}' ๋ฐ์ดํ„ฐ ์ฒ˜๋ฆฌ ์ค‘...")
# ์ˆซ์ž๋กœ ๋ณ€ํ™˜ ๊ฐ€๋Šฅํ•œ ๊ฐ’๋งŒ ์œ ํšจํ•œ ๊ฒƒ์œผ๋กœ ๊ฐ„์ฃผ (ํ•œ ์ค„๋กœ ์ฒ˜๋ฆฌ)
valid_amount_mask = pd.to_numeric(df[amount_field], errors='coerce').notna()
# ์œ ํšจํ•˜์ง€ ์•Š์€ ํ–‰ ์ˆ˜ ์ถœ๋ ฅ
invalid_rows = (~valid_amount_mask).sum()
if invalid_rows > 0:
print(f"๊ธˆ์•ก์ด ์—†๊ฑฐ๋‚˜ ์ˆซ์ž๊ฐ€ ์•„๋‹Œ ํ–‰(๋ฐ˜๋‚ฉ ํ•ญ๋ชฉ) {invalid_rows}๊ฐœ๋ฅผ ์ œ์™ธํ•ฉ๋‹ˆ๋‹ค.")
# ์œ ํšจํ•œ ํ–‰๋งŒ ์„ ํƒ
df = df[valid_amount_mask].copy()
# ๊ธˆ์•ก ๋ณ€ํ™˜ - ๋‹จ์ˆœํ™”๋œ ๋ฐฉ๋ฒ•
df["๊ธˆ์•ก"] = pd.to_numeric(df[amount_field], errors='coerce')
df["๊ธˆ์•ก"] = df["๊ธˆ์•ก"].astype(int)
print(f"๊ธˆ์•ก ๋ณ€ํ™˜ ์„ฑ๊ณต: ์ƒ˜ํ”Œ ๊ฐ’ = {df['๊ธˆ์•ก'].head().tolist()}")
# ํŒ€๋ช… ์ฒ˜๋ฆฌ (์šฐ์„ ์ˆœ์œ„์— ๋”ฐ๋ผ)
if team_fields:
df["์›๋ณธํŒ€๋ช…"] = df[team_fields[0]].copy()
for field in team_fields[1:]:
df["์›๋ณธํŒ€๋ช…"] = df["์›๋ณธํŒ€๋ช…"].combine_first(df[field])
# ๋งคํ•‘ ์ ์šฉ
df["๋งคํ•‘์ •๋ณด"] = df["์›๋ณธํŒ€๋ช…"].apply(lambda x: mapping_utils.apply_mapping(x, mapping_dict))
# ๋งคํ•‘ ์ •๋ณด์—์„œ ํ•„๋“œ ์ถ”์ถœ
df["ํŒ€๋ช…"] = df["๋งคํ•‘์ •๋ณด"].apply(lambda x: x["present"])
df["CD_ACCT"] = df["๋งคํ•‘์ •๋ณด"].apply(lambda x: x["CD_ACCT"])
# CD_PJT๋ฅผ ์ •์ˆ˜ํ˜•์œผ๋กœ ๋ณ€ํ™˜ํ•˜๋Š” ๋ถ€๋ถ„
df["CD_PJT"] = df["๋งคํ•‘์ •๋ณด"].apply(lambda x: x["CD_PJT"])
# ๋ฌธ์ž์—ด์ด๋‚˜ NaN ๊ฐ’ ์ฒ˜๋ฆฌ ํ›„ ์ •์ˆ˜ํ˜•์œผ๋กœ ๋ณ€ํ™˜
df["CD_PJT"] = pd.to_numeric(df["CD_PJT"], errors='coerce').fillna(1000).astype(int)
# ์ ์š” ์ƒ์„ฑ
df["์ ์š”"] = f"{config['note_prefix']}(" + df["ํŒ€๋ช…"] + ")"
# MNG ์ฝ”๋“œ ์„ค์ •
df["CD_MNG1"] = config['cost_center'] # ์ฝ”์ŠคํŠธ์„ผํ„ฐ
df["CD_MNG3"] = config['partner_code'] # ๊ฑฐ๋ž˜์ฒ˜ ์ฝ”๋“œ
# ๋งคํ•‘๋œ ํ•ญ๋ชฉ๋งŒ ์„ ํƒ (CD_ACCT์™€ CD_PJT๊ฐ€ ์žˆ๋Š” ํ•ญ๋ชฉ๋งŒ)
df_filtered = df[(df["CD_ACCT"] != "") & (df["CD_PJT"] != "")].copy()
# ๋งคํ•‘๋˜์ง€ ์•Š์€ ํŒ€๋ช… ์ •๋ณด ์ถœ๋ ฅ
if len(df_filtered) < len(df):
unmapped_teams = df[~df.index.isin(df_filtered.index)]["์›๋ณธํŒ€๋ช…"].unique()
print(f"๋งคํ•‘๋˜์ง€ ์•Š์€ ํŒ€๋ช… {len(unmapped_teams)}๊ฐœ:")
for team in unmapped_teams:
print(f"- '{team}'")
# ๋งคํ•‘๋˜์ง€ ์•Š์€ ํ•ญ๋ชฉ์ด ์žˆ์œผ๋ฉด ๊ฒฝ๊ณ  (์ „์ฒด ๋‹ค ๋งคํ•‘ ์•ˆ ๋˜๋Š” ๊ฒฝ์šฐ๋งŒ ์˜ค๋ฅ˜)
if len(df_filtered) == 0:
raise ValueError("๋ชจ๋“  ํŒ€๋ช…์ด ๋งคํ•‘๋˜์ง€ ์•Š์•˜์Šต๋‹ˆ๋‹ค. ๋งคํ•‘ ํŒŒ์ผ์„ ํ™•์ธํ•ด์ฃผ์„ธ์š”.")
print(f"๋งคํ•‘๋œ ํ•ญ๋ชฉ: {len(df_filtered)}๊ฐœ / ์ „์ฒด {len(df)}๊ฐœ")
return df, df_filtered
def summarize_data(df_filtered: pd.DataFrame, mapping_dict: Dict[str, Dict[str, str]]) -> Dict[str, Any]:
"""
๋ฐ์ดํ„ฐ ์š”์•ฝ ์ •๋ณด ์ƒ์„ฑ
Args:
df_filtered: ํ•„ํ„ฐ๋ง๋œ ๋ฐ์ดํ„ฐํ”„๋ ˆ์ž„
mapping_dict: ๋งคํ•‘ ๋”•์…”๋„ˆ๋ฆฌ
Returns:
๋ฐ์ดํ„ฐ ์š”์•ฝ ์ •๋ณด
"""
total_amount = df_filtered["๊ธˆ์•ก"].sum()
# ๋งคํ•‘ ๊ฒฐ๊ณผ ์š”์•ฝ
mapping_summary = mapping_utils.get_mapping_summary(df_filtered, mapping_dict)
# ๊ณ„์ • ์‚ฌ์šฉ ํ˜„ํ™ฉ
account_counts = df_filtered['CD_ACCT'].value_counts().to_dict()
return {
'total_count': len(df_filtered),
'total_amount': total_amount,
'account_counts': account_counts,
'mapping_summary': mapping_summary
}