File size: 7,611 Bytes
26c3195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# =============================================================================
# fsQCA (Fuzzy-Set Qualitative Comparative Analysis) module
# Direct implementation: calibration → truth table → Boolean minimization
# Reference: Ragin (2008), Redesigning Social Inquiry
# =============================================================================
import pandas as pd
import numpy as np
from itertools import combinations, chain


# ── 1. Calibration ────────────────────────────────────────────────────────────
def calibrate_direct(series: pd.Series, full_in: float,
                     crossover: float, full_out: float) -> pd.Series:
    """Calibrate raw scores into fuzzy-set membership (direct method, 3 anchors).

    Each value's deviation from ``crossover`` is rescaled so that the
    ``full_in`` anchor sits at log-odds +3 and the ``full_out`` anchor at
    log-odds -3, then mapped through the logistic function.  Consequently:

    * a value equal to ``crossover`` gets membership 0.5,
    * a value equal to ``full_in`` gets about 0.953,
    * a value equal to ``full_out`` gets about 0.047,
    * values further out approach the clipping bounds 0.01 / 0.99.

    Args:
        series: Raw scores; converted to float.
        full_in: Anchor for full membership.
        crossover: Anchor for maximum ambiguity (membership 0.5).
        full_out: Anchor for full non-membership.

    Returns:
        A float Series on the same index as ``series`` with values clipped to
        ``[0.01, 0.99]``.  NaN inputs stay NaN.

    Notes:
        ``full_in < full_out`` is accepted and yields a decreasing
        calibration.  If an anchor coincides with ``crossover`` the
        corresponding side degenerates into a step (0.01 / 0.99).
    """
    anchor_log_odds = 3.0  # log-odds assigned to the full_in / full_out anchors

    values = series.astype(float).to_numpy()

    # Orient deviations so that "positive" always means "towards full_in".
    direction = -1.0 if full_in < full_out else 1.0
    deviation = (values - crossover) * direction
    in_span = abs(full_in - crossover)
    out_span = abs(crossover - full_out)

    log_odds = np.zeros_like(values, dtype=float)
    toward_in = deviation > 0
    toward_out = deviation < 0
    # Zero-width spans produce +/-inf log-odds and exp() may overflow; both
    # resolve to the clipping bounds, so the warnings are suppressed.
    with np.errstate(divide="ignore", over="ignore", invalid="ignore"):
        log_odds[toward_in] = anchor_log_odds * deviation[toward_in] / in_span
        log_odds[toward_out] = anchor_log_odds * deviation[toward_out] / out_span
        log_odds[np.isnan(deviation)] = np.nan
        membership = 1.0 / (1.0 + np.exp(-log_odds))

    result = pd.Series(membership, index=series.index, dtype=float)
    return result.clip(0.01, 0.99)


# ── 2. Necessary-condition analysis ───────────────────────────────────────────
def necessary_conditions(df_fs: pd.DataFrame, outcome: str,
                          conditions: list, threshold: float = 0.9):
    """Assess each condition as a necessary condition for the outcome.

    For a condition X and outcome Y (both fuzzy membership columns):

    * consistency of necessity = sum(min(X, Y)) / sum(Y)
    * coverage of necessity    = sum(min(X, Y)) / sum(X)

    The fuzzy intersection is the element-wise minimum.

    Args:
        df_fs: DataFrame of fuzzy membership scores.
        outcome: Name of the outcome column in ``df_fs``.
        conditions: Names of the condition columns to test.
        threshold: Minimum consistency for a condition to be flagged as
            necessary.

    Returns:
        A DataFrame with one row per condition and four columns, in order:
        condition name, consistency (rounded to 3 decimals), coverage
        (rounded to 3 decimals), and a check/cross flag for whether the
        unrounded consistency reaches ``threshold``.  Empty when
        ``conditions`` is empty.
    """
    y = df_fs[outcome]
    y_total = float(y.sum())

    rows = []
    for cond in conditions:
        x = df_fs[cond]
        overlap = float(np.minimum(x, y).sum())
        # The small epsilon keeps all-zero columns from dividing by zero.
        cons = overlap / (y_total + 1e-9)
        cov = overlap / (float(x.sum()) + 1e-9)
        rows.append({
            "쑰건": cond,
            "일관성(Consistency)": round(cons, 3),
            "포함도(Coverage)":    round(cov, 3),
            "ν•„μš”μ‘°κ±΄":  "βœ“" if cons >= threshold else "βœ—"
        })
    return pd.DataFrame(rows)


# ── 3. Truth-table construction ───────────────────────────────────────────────
def build_truth_table(df_fs: pd.DataFrame, outcome: str,
                       conditions: list, freq_threshold: int = 1,
                       cons_threshold: float = 0.75):
    """Build the fuzzy-set truth table for all 2**k condition configurations.

    For every presence/absence configuration of ``conditions``:

    * each case's membership in the configuration is the fuzzy AND
      (element-wise minimum) of ``x`` for present conditions and ``1 - x``
      for absent ones;
    * frequency is the number of cases whose membership exceeds 0.5
      (a case at exactly 0.5 is ambiguous and is not counted);
    * consistency = sum(min(m, Y)) / sum(m) and
      coverage = sum(min(m, Y)) / sum(Y), both taken over all cases.

    Configurations with frequency below ``freq_threshold`` are omitted.

    Args:
        df_fs: DataFrame of fuzzy membership scores.
        outcome: Name of the outcome column.
        conditions: Names of the condition columns.
        freq_threshold: Minimum number of cases for a row to be kept.
        cons_threshold: Minimum consistency for a row to be coded 1.

    Returns:
        A DataFrame with one column per condition (0/1), followed by
        frequency, consistency (3 decimals), coverage (3 decimals) and the
        0/1 outcome coding.  An empty DataFrame if no row qualifies.
    """
    n_conds = len(conditions)
    y = df_fs[outcome]
    y_total = float(y.sum())
    rows = []

    for combo in range(2 ** n_conds):
        # Bits of `combo`, most significant first, give presence (1) or
        # absence (0) of each condition in order.
        config = [(combo >> i) & 1 for i in range(n_conds - 1, -1, -1)]

        membership = pd.Series(1.0, index=df_fs.index)
        for cond, present in zip(conditions, config):
            term = df_fs[cond] if present == 1 else 1 - df_fs[cond]
            membership = np.minimum(membership, term)

        freq = int((membership > 0.5).sum())
        if freq < freq_threshold:
            continue

        overlap = float(np.minimum(membership, y).sum())
        # The small epsilon keeps all-zero columns from dividing by zero.
        cons = overlap / (float(membership.sum()) + 1e-9)
        cov = overlap / (y_total + 1e-9)

        row = dict(zip(conditions, config))
        row["λΉˆλ„(N)"]            = freq
        row["일관성(Consistency)"] = round(cons, 3)
        row["포함도(Coverage)"]    = round(cov, 3)
        row["κ²°κ³Ό(1=포함)"]        = 1 if cons >= cons_threshold else 0
        rows.append(row)

    return pd.DataFrame(rows) if rows else pd.DataFrame()


# ── 4. Sufficient-condition analysis (simple version) ─────────────────────────
def sufficient_conditions(truth_table: pd.DataFrame, outcome: str,
                           conditions: list, cons_threshold: float = 0.75):
    """List the truth-table rows coded 1 as sufficient-condition expressions.

    Each qualifying row becomes a string of the condition names joined by
    " * ", with "~" prefixed to conditions whose value in that row is 0,
    alongside that row's consistency, coverage and frequency.

    ``outcome`` and ``cons_threshold`` are accepted but not read here; the
    selection relies on the outcome coding already stored in the table.

    Returns an empty DataFrame when the table is empty or no row is coded 1.
    """
    if truth_table.empty:
        return pd.DataFrame()

    coded_one = truth_table["κ²°κ³Ό(1=포함)"] == 1
    passing = truth_table.loc[coded_one].copy()
    if passing.empty:
        return pd.DataFrame()

    def _expression(tt_row):
        # "~" marks a condition that is absent (0) in this configuration.
        literals = [
            f"~{cond}" if tt_row[cond] == 0 else f"{cond}"
            for cond in conditions
        ]
        return " * ".join(literals)

    records = []
    for _, tt_row in passing.iterrows():
        record = {
            "좩뢄쑰건 μ‘°ν•©": _expression(tt_row),
            "일관성": tt_row["일관성(Consistency)"],
            "포함도": tt_row["포함도(Coverage)"],
            "λΉˆλ„":   tt_row["λΉˆλ„(N)"]
        }
        records.append(record)
    return pd.DataFrame(records)


# ── 5. Full fsQCA run ─────────────────────────────────────────────────────────
def run_fsqca(df: pd.DataFrame, outcome_col: str, condition_cols: list,
              calibration_params: dict,   # {col: (full_in, crossover, full_out)}
              freq_threshold: int = 1,
              cons_threshold: float = 0.75,
              nec_threshold:  float = 0.9):
    """Run the whole fsQCA pipeline on raw data.

    Steps, in order: calibrate the outcome and every condition, describe the
    calibrated data, run the necessary-condition analysis, build the truth
    table, extract the sufficient configurations, and summarise them.

    Columns present in ``calibration_params`` use the supplied
    ``(full_in, crossover, full_out)`` anchors; any other column is
    calibrated from its own 95th / 50th / 5th percentiles.

    Returns: dict with keys = analysis step names, values = DataFrame
    """
    # Step 1: calibration, recording the anchors used for each variable.
    fuzzy = pd.DataFrame(index=df.index)
    anchor_records = []
    for name in [outcome_col] + condition_cols:
        if name in calibration_params:
            upper, middle, lower = calibration_params[name]
            fuzzy[name] = calibrate_direct(df[name], upper, middle, lower)
            reported = (upper, middle, lower)
        else:
            # Automatic anchors: 5%, 50% and 95% quantiles of the raw column.
            pct = df[name].quantile([0.05, 0.5, 0.95])
            fuzzy[name] = calibrate_direct(df[name], pct[0.95], pct[0.5], pct[0.05])
            reported = (round(pct[0.95],2), round(pct[0.5],2), round(pct[0.05],2))
        anchor_records.append({
            "λ³€μˆ˜": name,
            "완전포함(1)": reported[0],
            "ꡐ차점(.5)": reported[1],
            "μ™„μ „λ°°μ œ(0)": reported[2],
        })
    calib_df = pd.DataFrame(anchor_records)

    # Step 2: descriptive statistics of the calibrated scores.
    stats = fuzzy.describe().T
    desc_fs = stats[["mean","std","min","max"]].round(3)
    desc_fs.columns = ["평균","ν‘œμ€€νŽΈμ°¨","μ΅œμ†Ÿκ°’","μ΅œλŒ“κ°’"]
    desc_fs = desc_fs.reset_index()
    desc_fs = desc_fs.rename(columns={"index":"λ³€μˆ˜"})

    # Steps 3-5: necessity, truth table, sufficiency.
    nec_df = necessary_conditions(fuzzy, outcome_col, condition_cols, nec_threshold)
    tt = build_truth_table(fuzzy, outcome_col, condition_cols,
                           freq_threshold, cons_threshold)
    suf_df = sufficient_conditions(tt, outcome_col, condition_cols, cons_threshold)

    # Step 6: overall solution statistics (simple means over the rows).
    if suf_df.empty:
        sol_summary = pd.DataFrame([{"μ•ˆλ‚΄": "일관성 기쀀을 μΆ©μ‘±ν•˜λŠ” 좩뢄쑰건 쑰합이 μ—†μŠ΅λ‹ˆλ‹€."}])
    else:
        mean_consistency = suf_df["일관성"].mean()
        mean_coverage = suf_df["포함도"].mean()
        sol_summary = pd.DataFrame([{
            "ν•΄ 수(좩뢄쑰건 μ‘°ν•©)": len(suf_df),
            "평균 일관성": round(mean_consistency, 3),
            "평균 포함도": round(mean_coverage, 3),
            "뢄석 κΈ°μ€€(일관성 μž„κ³„κ°’)": cons_threshold
        }])

    results = {}
    results["보정기쀀"] = calib_df
    results["λ³΄μ •ν›„κΈ°μˆ ν†΅κ³„"] = desc_fs
    results["ν•„μš”μ‘°κ±΄λΆ„μ„"] = nec_df
    results["μ§„λ¦¬ν‘œ"] = tt
    results["좩뢄쑰건뢄석"] = suf_df
    results["ν•΄μš”μ•½"] = sol_summary
    return results