Update app.py
app.py CHANGED

@@ -5,6 +5,7 @@ from dateutil.parser import parse
 import tempfile
 import io
 
+# -------- CACHE SETUP --------
 CACHE_DIR = os.path.join(tempfile.gettempdir(), "cache")
 os.makedirs(CACHE_DIR, exist_ok=True)
 
@@ -32,6 +33,7 @@ def _first_existing(df, cols):
         return df_cols[c.lower()]
     return None
 
+# -------- NORMALIZATION --------
 def normalize_claims(df, source_name="uploaded_file"):
     bene_col = _first_existing(df, ["DESYNPUF_ID", "BENE_ID", "BENEFICIARY_ID"])
     clm_col = _first_existing(df, ["CLM_ID", "CLAIM_ID"])
@@ -91,17 +93,14 @@ def rule_B_too_frequent_billing(claims, days=7):
     return df[(df["prev_service_date"].notna()) & (df["days_since_prev"] >= 0) & (df["days_since_prev"] <= days)]
 
 def rule_C_overlapping_fast(claims):
-    """Faster interval overlap using line sweep per beneficiary/procedure"""
     df = claims.dropna(subset=["beneficiary_id","procedure_code","start_date","end_date"]).copy()
     results = []
 
     for (bene, proc), group in df.groupby(["beneficiary_id", "procedure_code"]):
         group = group.sort_values("start_date")
         active = []
-        for
-            # remove intervals that ended before current starts
+        for _, row in group.iterrows():
             active = [a for a in active if a["end_date"] >= row["start_date"]]
-            # current overlaps with all active intervals
             for a in active:
                 results.append({
                     "claim_id_a": a["claim_id"], "claim_id_b": row["claim_id"],
@@ -111,7 +110,6 @@ def rule_C_overlapping_fast(claims):
                     "provider_id_a": a["provider_id"], "provider_id_b": row["provider_id"]
                 })
             active.append(row.to_dict())
-
     return pd.DataFrame(results)
 
 # -------- API ROUTES --------
@@ -137,14 +135,14 @@ def process_claims():
         claims["procedure_code"] = claims["procedure_code"].replace(["", "NAN"], pd.NA)
         claims.to_parquet(NORMALIZED_CACHE, index=False)
 
-        # Compute rules
+        # Compute rules once & cache
         rule_A_exact_duplicates(claims).to_parquet(RULE_A_CACHE, index=False)
         rule_B_too_frequent_billing(claims, days=SHORT_WINDOW_DAYS).to_parquet(RULE_B_CACHE, index=False)
         rule_C_overlapping_fast(claims).to_parquet(RULE_C_CACHE, index=False)
 
     else: # Load from cache
         if not os.path.exists(NORMALIZED_CACHE):
-            return jsonify({"error": "No cached data available.
+            return jsonify({"error": "No cached data available. Upload CSVs first."}), 400
         claims = pd.read_parquet(NORMALIZED_CACHE)
 
     # Summaries from cache
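The substantive change here is the completed sweep loop in rule_C_overlapping_fast: claims are grouped per beneficiary and procedure, sorted by start date, and an active list keeps only the intervals that are still open when the next claim begins, so each claim is compared against live candidates rather than every other claim. A minimal standalone sketch of that logic, on made-up sample claims (column names mirror the diff; the data and expected output are illustrative only):

import pandas as pd

# Toy claims: C1 and C2 overlap for beneficiary B1 / procedure 97110; C3 does not.
claims = pd.DataFrame({
    "beneficiary_id": ["B1", "B1", "B1"],
    "procedure_code": ["97110", "97110", "97110"],
    "claim_id": ["C1", "C2", "C3"],
    "provider_id": ["P1", "P2", "P1"],
    "start_date": pd.to_datetime(["2024-01-01", "2024-01-03", "2024-02-01"]),
    "end_date": pd.to_datetime(["2024-01-05", "2024-01-10", "2024-02-03"]),
})

results = []
for (bene, proc), group in claims.groupby(["beneficiary_id", "procedure_code"]):
    group = group.sort_values("start_date")
    active = []  # intervals still open at the current sweep position
    for _, row in group.iterrows():
        # Evict intervals that closed before this claim starts.
        active = [a for a in active if a["end_date"] >= row["start_date"]]
        # Whatever survives eviction overlaps the current claim.
        for a in active:
            results.append({"claim_id_a": a["claim_id"], "claim_id_b": row["claim_id"]})
        active.append(row.to_dict())

print(pd.DataFrame(results))  # one row: C1 overlaps C2

Sorting once and evicting closed intervals avoids the all-pairs comparison the removed docstring alluded to, though a group where everything overlaps everything still degrades to quadratic work.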
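The process_claims change tightens the cache flow: rules are computed once on upload, written to parquet, and later requests either read the cached frame or get a 400 when nothing has been uploaded yet. Below is a hedged sketch of that compute-once/read-thereafter pattern outside Flask, assuming a parquet engine (pyarrow or fastparquet) is installed; the normalized_claims.parquet filename is a stand-in, since the diff only shows the NORMALIZED_CACHE constant being used:

import os
import tempfile
from typing import Optional

import pandas as pd

CACHE_DIR = os.path.join(tempfile.gettempdir(), "cache")
os.makedirs(CACHE_DIR, exist_ok=True)
NORMALIZED_CACHE = os.path.join(CACHE_DIR, "normalized_claims.parquet")  # stand-in name

def get_claims(fresh_upload: Optional[pd.DataFrame] = None) -> pd.DataFrame:
    if fresh_upload is not None:
        # Cold path: persist the normalized frame once for later requests.
        fresh_upload.to_parquet(NORMALIZED_CACHE, index=False)
        return fresh_upload
    if not os.path.exists(NORMALIZED_CACHE):
        # Mirrors the route's 400: nothing cached means nothing to serve.
        raise FileNotFoundError("No cached data available. Upload CSVs first.")
    return pd.read_parquet(NORMALIZED_CACHE)  # warm path: cheap columnar read

Writing under tempfile.gettempdir() also fits the Spaces runtime, where the temp directory is reliably writable; that is presumably why the cache lives there rather than next to the app code.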