Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from typing import Dict, List, Any, Tuple | |
| from mappers import mapping_utils | |
| def load_and_preprocess_data(input_file: str, config: Dict[str, Any], mapping_dict: Dict[str, Dict[str, str]]) -> Tuple[pd.DataFrame, pd.DataFrame]: | |
| """ | |
| ๋ฐ์ดํฐ ๋ก๋ ๋ฐ ์ ์ฒ๋ฆฌ | |
| Args: | |
| input_file: ์ ๋ ฅ ํ์ผ ๊ฒฝ๋ก | |
| config: ๋ ํ์ฌ ์ค์ ์ ๋ณด | |
| mapping_dict: ๋งคํ ๋์ ๋๋ฆฌ | |
| Returns: | |
| ์ ์ฒ๋ฆฌ๋ ๋ฐ์ดํฐํ๋ ์, ํํฐ๋ง๋ ๋ฐ์ดํฐํ๋ ์ | |
| """ | |
| # CSV ํ์ผ ๋ก๋ - ๋ค์ํ ์ธ์ฝ๋ฉ ์๋ | |
| print(f"'{input_file}' ํ์ผ ๋ก๋ฉ ์ค...") | |
| try: | |
| rental_df = pd.read_csv(input_file, encoding='utf-8') | |
| except UnicodeDecodeError: | |
| try: | |
| # UTF-8 ์คํจ ์ CP949 ์๋ | |
| rental_df = pd.read_csv(input_file, encoding='cp949') | |
| print("CP949 ์ธ์ฝ๋ฉ์ผ๋ก ํ์ผ ๋ก๋ ์ฑ๊ณต") | |
| except UnicodeDecodeError: | |
| try: | |
| # EUC-KR ์๋ | |
| rental_df = pd.read_csv(input_file, encoding='euc-kr') | |
| print("EUC-KR ์ธ์ฝ๋ฉ์ผ๋ก ํ์ผ ๋ก๋ ์ฑ๊ณต") | |
| except Exception as e: | |
| print(f"ํ์ผ ๋ก๋ ์คํจ: {e}") | |
| raise | |
| print(f"๋ก๋ฉ ์๋ฃ: {len(rental_df)}๊ฐ ํ ๋ฐ๊ฒฌ") | |
| # ์ปฌ๋ผ๋ช ์์ชฝ ๊ณต๋ฐฑ ์ ๊ฑฐ (๋ ์๊ฒฉํ ์ฒ๋ฆฌ) | |
| original_columns = rental_df.columns.tolist() | |
| print("์๋ณธ ์ปฌ๋ผ๋ช :") | |
| for col in original_columns: | |
| print(f"- '{col}'") | |
| # ์ปฌ๋ผ๋ช ์์ ๊ณต๋ฐฑ ์ ๊ฑฐ ๋ฐ ์ฒ๋ฆฌ | |
| rental_df.columns = [col.strip() for col in rental_df.columns] | |
| # ์ฒ๋ฆฌ๋ ์ปฌ๋ผ๋ช ์ถ๋ ฅ | |
| processed_columns = rental_df.columns.tolist() | |
| print("์ฒ๋ฆฌ ํ ์ปฌ๋ผ๋ช :") | |
| for i, col in enumerate(processed_columns): | |
| orig = original_columns[i] if i < len(original_columns) else "?" | |
| print(f"- '{orig}' -> '{col}'") | |
| # ์ปฌ๋ผ๋ช ์ค๋ณต ์ฒดํฌ ๋ฐ ์ฒ๋ฆฌ | |
| if len(set(rental_df.columns)) != len(rental_df.columns): | |
| print("๊ฒฝ๊ณ : ๊ณต๋ฐฑ ์ ๊ฑฐ ํ ์ค๋ณต๋ ์ปฌ๋ผ๋ช ์ด ์์ต๋๋ค.") | |
| duplicate_count = {} | |
| new_columns = [] | |
| for col in rental_df.columns: | |
| if col in duplicate_count: | |
| duplicate_count[col] += 1 | |
| new_col = f"{col}_{duplicate_count[col]}" | |
| new_columns.append(new_col) | |
| print(f" ์ค๋ณต ์ปฌ๋ผ ์ฒ๋ฆฌ: '{col}' -> '{new_col}'") | |
| else: | |
| duplicate_count[col] = 0 | |
| new_columns.append(col) | |
| rental_df.columns = new_columns | |
| # ํ์ํ ํ๋ ํ์ธ ๋ฐ ์กฐ์ | |
| # ํ์ํ ์ปฌ๋ผ์ด ์๋์ง ํ์ธ | |
| column_exists = {} | |
| required_columns = ["๋ชจ๋ธ๋ช ", "์์ ๋ถ๋ฅ", "๊ด๋ฆฌ๋ถ์", "๊ฑฐ๋์ฒ๋ช ", "๊ด๋ฆฌ์ง์ "] | |
| for col in required_columns: | |
| if col in rental_df.columns: | |
| column_exists[col] = True | |
| else: | |
| column_exists[col] = False | |
| print(f"๊ฒฝ๊ณ : '{col}' ์ปฌ๋ผ์ด ํ์ผ์ ์์ต๋๋ค.") | |
| # ๊ธ์ก ํ๋ ์ฐพ๊ธฐ - ์๋ณ ์๋ ์ธ์ ํจํด | |
| amount_field = None | |
| # 1. ๋จผ์ config์ ์ค์ ๋ ํ๋ ์๋ (์๋ค ๊ณต๋ฐฑ ์ ๊ฑฐ ํ ๋น๊ต) | |
| clean_amount_field = config['amount_field'].strip() | |
| for col in rental_df.columns: | |
| if col.strip() == clean_amount_field: | |
| amount_field = col | |
| print(f"๊ธ์ก ํ๋๋ก '{amount_field}'๋ฅผ ์ค์ ๊ฐ์์ ์ฐพ์์ต๋๋ค.") | |
| break | |
| if not amount_field: | |
| # 2. 'N์๋ ํ๋ฃ' ํจํด ์ฐพ๊ธฐ - ๊ณต๋ฐฑ ๊ณ ๋ ค | |
| import re | |
| month_pattern = re.compile(r'^\s*(?:[0-9]{1,2})์๋ ํ๋ฃ\s*$') | |
| for col in rental_df.columns: | |
| if month_pattern.match(col): | |
| amount_field = col | |
| print(f"๊ธ์ก ํ๋๋ก '{amount_field}'๋ฅผ ์๋ ์ธ์ํ์ต๋๋ค.") | |
| break | |
| # 3. ๋ ํ๋ฃ ํฌํจ ํ๋ ์ฐพ๊ธฐ | |
| if not amount_field: | |
| for col in rental_df.columns: | |
| if '๋ ํ๋ฃ' in col: | |
| amount_field = col | |
| print(f"๊ธ์ก ํ๋๋ก '{amount_field}'๋ฅผ ์ฌ์ฉํฉ๋๋ค.") | |
| break | |
| if not amount_field: | |
| # 4. ์ปฌ๋ผ๋ช ์ '์'์ด๋ '๏ฟฆ' ๋๋ 'โฉ'๊ฐ ํฌํจ๋ ๊ฒ์ amount_field๋ก ์ฌ์ฉ | |
| for col in rental_df.columns: | |
| if '์' in col or '๏ฟฆ' in col or 'โฉ' in col: | |
| amount_field = col | |
| print(f"๊ธ์ก ํ๋๋ก '{amount_field}'๋ฅผ ์ฌ์ฉํฉ๋๋ค.") | |
| break | |
| # ๊ธ์ก ํ๋๋ฅผ ์ฐพ์ ์ ์์ผ๋ฉด ์ค๋ฅ ๋ฐ์ | |
| if not amount_field: | |
| raise ValueError("๊ธ์ก ํ๋๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค. ํ์ผ ํ์์ ํ์ธํด์ฃผ์ธ์.") | |
| # ๊ธ์ก ํ๋ ํ์ธ ์ถ๋ ฅ | |
| print(f"์ฌ์ฉํ ๊ธ์ก ํ๋: '{amount_field}'") | |
| print(f"๊ธ์ก ํ๋ ์ํ ๊ฐ: {rental_df[amount_field].head().tolist()}") | |
| # ํ ํ๋ ์ฐพ๊ธฐ - ์๋ณ ์๋ ์ธ์ ํจํด | |
| team_fields = [] | |
| # 1. ๋จผ์ config์ ์ค์ ๋ ํ๋ ์๋ | |
| configured_team_fields = config.get('team_fields', []) | |
| if isinstance(configured_team_fields, str): | |
| configured_team_fields = [configured_team_fields] | |
| for field in configured_team_fields: | |
| clean_field = field.strip() | |
| for col in rental_df.columns: | |
| if col.strip() == clean_field: | |
| team_fields.append(col) | |
| print(f"ํ ํ๋๋ก '{col}'๋ฅผ ์ค์ ๊ฐ์์ ์ฐพ์์ต๋๋ค.") | |
| break | |
| if not team_fields: | |
| # 2. '[0-9]์ ๋ณ๊ฒฝPJT' ํจํด๋ง ์ฐพ๊ธฐ - ๊ณต๋ฐฑ ํ์ฉ | |
| import re | |
| # ๊ณต๋ฐฑ ํ์ฉํ๊ณ '๋ณ๊ฒฝPJT'๋ง ์ฐพ๋ ํจํด | |
| month_pjt_pattern = re.compile(r'^\s*(?:[0-9]{1,2})์\s*๋ณ๊ฒฝPJT\s*$') | |
| for col in rental_df.columns: | |
| if month_pjt_pattern.match(col): | |
| team_fields.append(col) | |
| print(f"ํ ํ๋๋ก '{col}'๋ฅผ ์๋ ์ธ์ํ์ต๋๋ค (๋ณ๊ฒฝPJT ํจํด).") | |
| # ํ ํ๋๋ฅผ ์ฐพ์ ์ ์์ - ์ค๋ฅ ๋ฐ์ | |
| if not team_fields: | |
| raise ValueError("ํ ์ ๋ณด ํ๋๋ฅผ ์ฐพ์ ์ ์์ต๋๋ค. ํ์ผ ํ์์ ํ์ธํด์ฃผ์ธ์.") | |
| # ์ฌ์ฉ ๊ฐ๋ฅํ ์ปฌ๋ผ๋ง ์ ํ | |
| available_columns = [] | |
| for col in required_columns: | |
| if column_exists.get(col, False): | |
| available_columns.append(col) | |
| if amount_field: | |
| available_columns.append(amount_field) | |
| available_columns.extend(team_fields) | |
| # ์ค๋ณต ์ ๊ฑฐ | |
| available_columns = list(dict.fromkeys(available_columns)) | |
| print(f"์ฌ์ฉํ ์ปฌ๋ผ: {available_columns}") | |
| # ํ์ํ ํ๋๋ง ์ ํ (์กด์ฌํ๋ ์ปฌ๋ผ๋ง) | |
| df = rental_df[available_columns].copy() | |
| # ๊ธ์ก ํ๋ ์ฒ๋ฆฌ - ๊ฐ๋จํ ๋ฐฉ๋ฒ์ผ๋ก ์ซ์๋ง ์ถ์ถ | |
| print(f"๊ธ์ก ํ๋ '{amount_field}' ๋ฐ์ดํฐ ์ฒ๋ฆฌ ์ค...") | |
| # ์ซ์๋ก ๋ณํ ๊ฐ๋ฅํ ๊ฐ๋ง ์ ํจํ ๊ฒ์ผ๋ก ๊ฐ์ฃผ (ํ ์ค๋ก ์ฒ๋ฆฌ) | |
| valid_amount_mask = pd.to_numeric(df[amount_field], errors='coerce').notna() | |
| # ์ ํจํ์ง ์์ ํ ์ ์ถ๋ ฅ | |
| invalid_rows = (~valid_amount_mask).sum() | |
| if invalid_rows > 0: | |
| print(f"๊ธ์ก์ด ์๊ฑฐ๋ ์ซ์๊ฐ ์๋ ํ(๋ฐ๋ฉ ํญ๋ชฉ) {invalid_rows}๊ฐ๋ฅผ ์ ์ธํฉ๋๋ค.") | |
| # ์ ํจํ ํ๋ง ์ ํ | |
| df = df[valid_amount_mask].copy() | |
| # ๊ธ์ก ๋ณํ - ๋จ์ํ๋ ๋ฐฉ๋ฒ | |
| df["๊ธ์ก"] = pd.to_numeric(df[amount_field], errors='coerce') | |
| df["๊ธ์ก"] = df["๊ธ์ก"].astype(int) | |
| print(f"๊ธ์ก ๋ณํ ์ฑ๊ณต: ์ํ ๊ฐ = {df['๊ธ์ก'].head().tolist()}") | |
| # ํ๋ช ์ฒ๋ฆฌ (์ฐ์ ์์์ ๋ฐ๋ผ) | |
| if team_fields: | |
| df["์๋ณธํ๋ช "] = df[team_fields[0]].copy() | |
| for field in team_fields[1:]: | |
| df["์๋ณธํ๋ช "] = df["์๋ณธํ๋ช "].combine_first(df[field]) | |
| # ๋งคํ ์ ์ฉ | |
| df["๋งคํ์ ๋ณด"] = df["์๋ณธํ๋ช "].apply(lambda x: mapping_utils.apply_mapping(x, mapping_dict)) | |
| # ๋งคํ ์ ๋ณด์์ ํ๋ ์ถ์ถ | |
| df["ํ๋ช "] = df["๋งคํ์ ๋ณด"].apply(lambda x: x["present"]) | |
| df["CD_ACCT"] = df["๋งคํ์ ๋ณด"].apply(lambda x: x["CD_ACCT"]) | |
| # CD_PJT๋ฅผ ์ ์ํ์ผ๋ก ๋ณํํ๋ ๋ถ๋ถ | |
| df["CD_PJT"] = df["๋งคํ์ ๋ณด"].apply(lambda x: x["CD_PJT"]) | |
| # ๋ฌธ์์ด์ด๋ NaN ๊ฐ ์ฒ๋ฆฌ ํ ์ ์ํ์ผ๋ก ๋ณํ | |
| df["CD_PJT"] = pd.to_numeric(df["CD_PJT"], errors='coerce').fillna(1000).astype(int) | |
| # ์ ์ ์์ฑ | |
| df["์ ์"] = f"{config['note_prefix']}(" + df["ํ๋ช "] + ")" | |
| # MNG ์ฝ๋ ์ค์ | |
| df["CD_MNG1"] = config['cost_center'] # ์ฝ์คํธ์ผํฐ | |
| df["CD_MNG3"] = config['partner_code'] # ๊ฑฐ๋์ฒ ์ฝ๋ | |
| # ๋งคํ๋ ํญ๋ชฉ๋ง ์ ํ (CD_ACCT์ CD_PJT๊ฐ ์๋ ํญ๋ชฉ๋ง) | |
| df_filtered = df[(df["CD_ACCT"] != "") & (df["CD_PJT"] != "")].copy() | |
| # ๋งคํ๋์ง ์์ ํ๋ช ์ ๋ณด ์ถ๋ ฅ | |
| if len(df_filtered) < len(df): | |
| unmapped_teams = df[~df.index.isin(df_filtered.index)]["์๋ณธํ๋ช "].unique() | |
| print(f"๋งคํ๋์ง ์์ ํ๋ช {len(unmapped_teams)}๊ฐ:") | |
| for team in unmapped_teams: | |
| print(f"- '{team}'") | |
| # ๋งคํ๋์ง ์์ ํญ๋ชฉ์ด ์์ผ๋ฉด ๊ฒฝ๊ณ (์ ์ฒด ๋ค ๋งคํ ์ ๋๋ ๊ฒฝ์ฐ๋ง ์ค๋ฅ) | |
| if len(df_filtered) == 0: | |
| raise ValueError("๋ชจ๋ ํ๋ช ์ด ๋งคํ๋์ง ์์์ต๋๋ค. ๋งคํ ํ์ผ์ ํ์ธํด์ฃผ์ธ์.") | |
| print(f"๋งคํ๋ ํญ๋ชฉ: {len(df_filtered)}๊ฐ / ์ ์ฒด {len(df)}๊ฐ") | |
| return df, df_filtered | |
| def summarize_data(df_filtered: pd.DataFrame, mapping_dict: Dict[str, Dict[str, str]]) -> Dict[str, Any]: | |
| """ | |
| ๋ฐ์ดํฐ ์์ฝ ์ ๋ณด ์์ฑ | |
| Args: | |
| df_filtered: ํํฐ๋ง๋ ๋ฐ์ดํฐํ๋ ์ | |
| mapping_dict: ๋งคํ ๋์ ๋๋ฆฌ | |
| Returns: | |
| ๋ฐ์ดํฐ ์์ฝ ์ ๋ณด | |
| """ | |
| total_amount = df_filtered["๊ธ์ก"].sum() | |
| # ๋งคํ ๊ฒฐ๊ณผ ์์ฝ | |
| mapping_summary = mapping_utils.get_mapping_summary(df_filtered, mapping_dict) | |
| # ๊ณ์ ์ฌ์ฉ ํํฉ | |
| account_counts = df_filtered['CD_ACCT'].value_counts().to_dict() | |
| return { | |
| 'total_count': len(df_filtered), | |
| 'total_amount': total_amount, | |
| 'account_counts': account_counts, | |
| 'mapping_summary': mapping_summary | |
| } |