Spaces:

cyj-26
/

sem

Sleeping

File size: 7,611 Bytes

26c3195

# =============================================================================
# fsQCA (Fuzzy-Set Qualitative Comparative Analysis) 모듈
# 직접 구현: 보정(calibration) → 진리표(truth table) → 부울 최소화
# 참고: Ragin (2008), Redesigning Social Inquiry
# =============================================================================
import pandas as pd
import numpy as np
from itertools import combinations, chain


# ── 1. 보정 (Calibration) ─────────────────────────────────────────────────────
def calibrate_direct(series: pd.Series, full_in: float,
                     crossover: float, full_out: float) -> pd.Series:
    """Calibrate raw values into fuzzy-set membership (Ragin's direct method).

    Each value's deviation from ``crossover`` is rescaled to log-odds so that
    ``full_in`` maps to +3, ``crossover`` to 0 and ``full_out`` to -3, and the
    log-odds are then passed through the logistic function.  The anchors
    therefore land on memberships of about 0.953, 0.5 and 0.047.

    Args:
        series: Raw interval-scale values; converted to float.
        full_in: Threshold for full membership.
        crossover: Point of maximum ambiguity (membership 0.5).
        full_out: Threshold for full non-membership.  If ``full_in`` is
            smaller than ``full_out`` the set is treated as reverse-coded
            (low raw values mean high membership).

    Returns:
        A float Series on the same index as ``series`` with memberships
        clipped to [0.01, 0.99].  NaN inputs stay NaN.

    Note:
        ``crossover`` is expected to lie between the two outer anchors.  When
        one side has no width (e.g. ``crossover == full_in``), that side
        degenerates to a step so the function never divides by zero.
    """
    log_odds_at_anchor = 3.0  # Ragin's convention: +/-3 log-odds at the outer anchors

    raw = series.astype(float).to_numpy()
    upper, middle, lower = float(full_in), float(crossover), float(full_out)
    if upper < lower:
        # Reverse-coded set: mirror everything so the logic below can assume
        # that membership rises with the raw value.
        raw, upper, middle, lower = -raw, -upper, -middle, -lower

    deviation = raw - middle
    above = deviation > 0
    below = deviation < 0
    span_above = upper - middle
    span_below = middle - lower

    # Start from NaN so missing inputs propagate; exact crossover hits are 0.
    log_odds = np.full(deviation.shape, np.nan)
    log_odds[deviation == 0] = 0.0
    log_odds[above] = (deviation[above] * log_odds_at_anchor / span_above
                       if span_above > 0 else np.inf)
    log_odds[below] = (deviation[below] * log_odds_at_anchor / span_below
                       if span_below > 0 else -np.inf)

    # Extreme log-odds may overflow exp(); the result still saturates to 0/1.
    with np.errstate(over="ignore"):
        membership = 1.0 / (1.0 + np.exp(-log_odds))

    calibrated = pd.Series(membership, index=series.index, dtype=float)
    return calibrated.clip(0.01, 0.99)


# ── 2. 필요조건 분석 ──────────────────────────────────────────────────────────
def necessary_conditions(df_fs: pd.DataFrame, outcome: str,
                          conditions: list, threshold: float = 0.9):
    """Assess each condition as a necessary condition for the outcome.

    A condition X is necessary for outcome Y when Y is a fuzzy subset of X.
    With the fuzzy intersection taken as the element-wise minimum:

    * consistency of necessity = sum(min(x, y)) / sum(y)
    * coverage of necessity    = sum(min(x, y)) / sum(x)

    Args:
        df_fs: Frame holding calibrated (0..1) membership scores.
        outcome: Column name of the outcome set.
        conditions: Column names of the candidate conditions.
        threshold: Minimum consistency for a condition to be flagged.

    Returns:
        DataFrame with one row per condition: its name, consistency and
        coverage (rounded to 3 decimals) and a check/cross flag.
    """
    eps = 1e-9  # keeps the ratios finite when a set has zero total membership
    y = df_fs[outcome]
    y_total = y.sum()

    rows = []
    for cond in conditions:
        x = df_fs[cond]
        overlap = np.minimum(x, y).sum()
        # Necessity asks how much of the OUTCOME lies inside the condition,
        # so consistency is normalised by the outcome, coverage by the condition.
        cons = float(overlap / (y_total + eps))
        cov = float(overlap / (x.sum() + eps))
        rows.append({
            "조건": cond,
            "일관성(Consistency)": round(cons, 3),
            "포함도(Coverage)":    round(cov, 3),
            "필요조건":  "✓" if cons >= threshold else "✗"
        })
    return pd.DataFrame(rows)


# ── 3. 진리표 구성 ────────────────────────────────────────────────────────────
def build_truth_table(df_fs: pd.DataFrame, outcome: str,
                       conditions: list, freq_threshold: int = 1,
                       cons_threshold: float = 0.75):
    """Build the fuzzy-set truth table for every configuration of conditions.

    For each of the 2**k present/absent combinations of the k conditions:

    * a case's membership in the configuration is the fuzzy AND (minimum) of
      its membership in each condition, using ``1 - x`` for absent ones;
    * the row frequency is the number of cases whose membership is strictly
      above 0.5 (a case at exactly 0.5 is maximally ambiguous and belongs to
      no row);
    * consistency = sum(min(m, y)) / sum(m) and
      coverage = sum(min(m, y)) / sum(y), both summed over ALL cases.

    Args:
        df_fs: Frame holding calibrated (0..1) membership scores.
        outcome: Column name of the outcome set.
        conditions: Column names of the causal conditions.
        freq_threshold: Rows with fewer cases than this are dropped.
        cons_threshold: Minimum consistency for a row to be coded 1.

    Returns:
        DataFrame with one row per retained configuration (condition columns
        as 0/1, then frequency, consistency, coverage and the 0/1 outcome
        code), or an empty DataFrame when no row survives.
    """
    eps = 1e-9  # keeps the ratios finite when a set has zero total membership
    n_conds = len(conditions)
    n_cases = len(df_fs)

    # Plain arrays avoid any index alignment between intermediate results.
    y = df_fs[outcome].to_numpy(dtype=float)
    y_total = np.nansum(y)
    scores = {cond: df_fs[cond].to_numpy(dtype=float) for cond in conditions}

    rows = []
    for combo in range(2 ** n_conds):
        # Bits of `combo`, most significant first, are the 0/1 pattern.
        config = [(combo >> i) & 1 for i in range(n_conds - 1, -1, -1)]

        # Memberships never exceed 1, so 1.0 is the identity for min().
        membership = np.ones(n_cases)
        for cond, val in zip(conditions, config):
            term = scores[cond] if val == 1 else 1 - scores[cond]
            membership = np.minimum(membership, term)

        freq = int((membership > 0.5).sum())
        if freq < freq_threshold:
            continue

        overlap = np.nansum(np.minimum(membership, y))
        cons = float(overlap / (np.nansum(membership) + eps))
        cov = float(overlap / (y_total + eps))

        row = dict(zip(conditions, config))
        row["빈도(N)"]            = freq
        row["일관성(Consistency)"] = round(cons, 3)
        row["포함도(Coverage)"]    = round(cov, 3)
        row["결과(1=포함)"]        = 1 if cons >= cons_threshold else 0
        rows.append(row)

    return pd.DataFrame(rows) if rows else pd.DataFrame()


# ── 4. 충분조건 분석 (단순 버전) ──────────────────────────────────────────────
def sufficient_conditions(truth_table: pd.DataFrame, outcome: str,
                           conditions: list, cons_threshold: float = 0.75):
    """List the truth-table rows coded 1 as sufficient-condition expressions.

    Each retained row is rendered as a conjunction such as ``A * ~B``, where
    ``~`` marks a condition whose value in that row is 0.  ``outcome`` and
    ``cons_threshold`` are accepted for interface symmetry but not read; the
    selection relies on the outcome code already stored in the table.

    Returns:
        DataFrame with the expression, consistency, coverage and frequency of
        every qualifying row, or an empty DataFrame when the table is empty
        or no row is coded 1.
    """
    if truth_table.empty:
        return pd.DataFrame()

    passing = truth_table.loc[truth_table["결과(1=포함)"] == 1].copy()
    if passing.empty:
        return pd.DataFrame()

    def _expression(record):
        # Negate every condition that is absent (0) in this configuration.
        terms = (f"~{name}" if record[name] == 0 else f"{name}"
                 for name in conditions)
        return " * ".join(terms)

    summary = [
        {
            "충분조건 조합": _expression(record),
            "일관성": record["일관성(Consistency)"],
            "포함도": record["포함도(Coverage)"],
            "빈도":   record["빈도(N)"],
        }
        for _, record in passing.iterrows()
    ]
    return pd.DataFrame(summary)


# ── 5. 전체 fsQCA 실행 ────────────────────────────────────────────────────────
def run_fsqca(df: pd.DataFrame, outcome_col: str, condition_cols: list,
              calibration_params: dict,   # {col: (full_in, crossover, full_out)}
              freq_threshold: int = 1,
              cons_threshold: float = 0.75,
              nec_threshold:  float = 0.9):
    """Run the full fsQCA pipeline and collect every intermediate table.

    Steps: calibrate the outcome and each condition, describe the calibrated
    scores, test necessary conditions, build the truth table, extract the
    sufficient configurations and summarise them.

    Columns listed in ``calibration_params`` use the supplied
    ``(full_in, crossover, full_out)`` anchors; any other column falls back
    to its own 95th / 50th / 5th percentiles.

    Returns:
        dict mapping each analysis-stage name to its DataFrame.
    """
    # --- Calibration ------------------------------------------------------
    fuzzy = pd.DataFrame(index=df.index)
    anchor_rows = []
    for name in [outcome_col] + condition_cols:
        if name in calibration_params:
            top, mid, bottom = calibration_params[name]
            raw = df[name]
            reported = (top, mid, bottom)
        else:
            # No anchors supplied: derive them from the column's percentiles.
            raw = df[name]
            pct = raw.quantile([0.05, 0.5, 0.95])
            top, mid, bottom = pct[0.95], pct[0.5], pct[0.05]
            reported = (round(top, 2), round(mid, 2), round(bottom, 2))
        fuzzy[name] = calibrate_direct(raw, top, mid, bottom)
        anchor_rows.append({
            "변수": name,
            "완전포함(1)": reported[0],
            "교차점(.5)": reported[1],
            "완전배제(0)": reported[2],
        })
    anchors = pd.DataFrame(anchor_rows)

    # --- Descriptive statistics of the calibrated scores -------------------
    stats = fuzzy.describe().T
    stats = stats[["mean","std","min","max"]].round(3)
    stats.columns = ["평균","표준편차","최솟값","최댓값"]
    stats = stats.reset_index().rename(columns={"index":"변수"})

    # --- Necessity, truth table, sufficiency -------------------------------
    necessity = necessary_conditions(fuzzy, outcome_col, condition_cols,
                                     nec_threshold)
    table = build_truth_table(fuzzy, outcome_col, condition_cols,
                              freq_threshold, cons_threshold)
    sufficiency = sufficient_conditions(table, outcome_col, condition_cols,
                                        cons_threshold)

    # --- Solution summary --------------------------------------------------
    if sufficiency.empty:
        summary = pd.DataFrame([{"안내": "일관성 기준을 충족하는 충분조건 조합이 없습니다."}])
    else:
        mean_consistency = sufficiency["일관성"].mean()
        mean_coverage = sufficiency["포함도"].mean()
        summary = pd.DataFrame([{
            "해 수(충분조건 조합)": len(sufficiency),
            "평균 일관성": round(mean_consistency, 3),
            "평균 포함도": round(mean_coverage, 3),
            "분석 기준(일관성 임계값)": cons_threshold
        }])

    return {
        "보정기준":    anchors,
        "보정후기술통계": stats,
        "필요조건분석": necessity,
        "진리표":      table,
        "충분조건분석": sufficiency,
        "해요약":      summary
    }