roshcheeku committed on
Commit
678df22
·
verified ·
1 Parent(s): eb13113

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -7
app.py CHANGED
@@ -5,6 +5,7 @@ from dateutil.parser import parse
5
  import tempfile
6
  import io
7
 
 
8
  CACHE_DIR = os.path.join(tempfile.gettempdir(), "cache")
9
  os.makedirs(CACHE_DIR, exist_ok=True)
10
 
@@ -32,6 +33,7 @@ def _first_existing(df, cols):
32
  return df_cols[c.lower()]
33
  return None
34
 
 
35
  def normalize_claims(df, source_name="uploaded_file"):
36
  bene_col = _first_existing(df, ["DESYNPUF_ID", "BENE_ID", "BENEFICIARY_ID"])
37
  clm_col = _first_existing(df, ["CLM_ID", "CLAIM_ID"])
@@ -91,17 +93,14 @@ def rule_B_too_frequent_billing(claims, days=7):
91
  return df[(df["prev_service_date"].notna()) & (df["days_since_prev"] >= 0) & (df["days_since_prev"] <= days)]
92
 
93
  def rule_C_overlapping_fast(claims):
94
- """Faster interval overlap using line sweep per beneficiary/procedure"""
95
  df = claims.dropna(subset=["beneficiary_id","procedure_code","start_date","end_date"]).copy()
96
  results = []
97
 
98
  for (bene, proc), group in df.groupby(["beneficiary_id", "procedure_code"]):
99
  group = group.sort_values("start_date")
100
  active = []
101
- for idx, row in group.iterrows():
102
- # remove intervals that ended before current starts
103
  active = [a for a in active if a["end_date"] >= row["start_date"]]
104
- # current overlaps with all active intervals
105
  for a in active:
106
  results.append({
107
  "claim_id_a": a["claim_id"], "claim_id_b": row["claim_id"],
@@ -111,7 +110,6 @@ def rule_C_overlapping_fast(claims):
111
  "provider_id_a": a["provider_id"], "provider_id_b": row["provider_id"]
112
  })
113
  active.append(row.to_dict())
114
-
115
  return pd.DataFrame(results)
116
 
117
  # -------- API ROUTES --------
@@ -137,14 +135,14 @@ def process_claims():
137
  claims["procedure_code"] = claims["procedure_code"].replace(["", "NAN"], pd.NA)
138
  claims.to_parquet(NORMALIZED_CACHE, index=False)
139
 
140
- # Compute rules only once & cache
141
  rule_A_exact_duplicates(claims).to_parquet(RULE_A_CACHE, index=False)
142
  rule_B_too_frequent_billing(claims, days=SHORT_WINDOW_DAYS).to_parquet(RULE_B_CACHE, index=False)
143
  rule_C_overlapping_fast(claims).to_parquet(RULE_C_CACHE, index=False)
144
 
145
  else: # Load from cache
146
  if not os.path.exists(NORMALIZED_CACHE):
147
- return jsonify({"error": "No cached data available. Please upload CSVs first."}), 400
148
  claims = pd.read_parquet(NORMALIZED_CACHE)
149
 
150
  # Summaries from cache
 
5
  import tempfile
6
  import io
7
 
8
+ # -------- CACHE SETUP --------
9
  CACHE_DIR = os.path.join(tempfile.gettempdir(), "cache")
10
  os.makedirs(CACHE_DIR, exist_ok=True)
11
 
 
33
  return df_cols[c.lower()]
34
  return None
35
 
36
+ # -------- NORMALIZATION --------
37
  def normalize_claims(df, source_name="uploaded_file"):
38
  bene_col = _first_existing(df, ["DESYNPUF_ID", "BENE_ID", "BENEFICIARY_ID"])
39
  clm_col = _first_existing(df, ["CLM_ID", "CLAIM_ID"])
 
93
  return df[(df["prev_service_date"].notna()) & (df["days_since_prev"] >= 0) & (df["days_since_prev"] <= days)]
94
 
95
  def rule_C_overlapping_fast(claims):
 
96
  df = claims.dropna(subset=["beneficiary_id","procedure_code","start_date","end_date"]).copy()
97
  results = []
98
 
99
  for (bene, proc), group in df.groupby(["beneficiary_id", "procedure_code"]):
100
  group = group.sort_values("start_date")
101
  active = []
102
+ for _, row in group.iterrows():
 
103
  active = [a for a in active if a["end_date"] >= row["start_date"]]
 
104
  for a in active:
105
  results.append({
106
  "claim_id_a": a["claim_id"], "claim_id_b": row["claim_id"],
 
110
  "provider_id_a": a["provider_id"], "provider_id_b": row["provider_id"]
111
  })
112
  active.append(row.to_dict())
 
113
  return pd.DataFrame(results)
114
 
115
  # -------- API ROUTES --------
 
135
  claims["procedure_code"] = claims["procedure_code"].replace(["", "NAN"], pd.NA)
136
  claims.to_parquet(NORMALIZED_CACHE, index=False)
137
 
138
+ # Compute rules once & cache
139
  rule_A_exact_duplicates(claims).to_parquet(RULE_A_CACHE, index=False)
140
  rule_B_too_frequent_billing(claims, days=SHORT_WINDOW_DAYS).to_parquet(RULE_B_CACHE, index=False)
141
  rule_C_overlapping_fast(claims).to_parquet(RULE_C_CACHE, index=False)
142
 
143
  else: # Load from cache
144
  if not os.path.exists(NORMALIZED_CACHE):
145
+ return jsonify({"error": "No cached data available. Upload CSVs first."}), 400
146
  claims = pd.read_parquet(NORMALIZED_CACHE)
147
 
148
  # Summaries from cache