roshcheeku committed on
Commit 17c2a53 · verified · 1 Parent(s): 549ef7c

Create app.py

Files changed (1)
  1. app.py +192 -0
app.py ADDED
@@ -0,0 +1,192 @@
+ from flask import Flask, request, jsonify, send_file
+ import os
+ import pandas as pd
+ from dateutil.parser import parse
+ import tempfile
+ import io
+
+ # -------- CACHE SETUP --------
+ CACHE_DIR = os.path.join(tempfile.gettempdir(), "cache")
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ NORMALIZED_CACHE = os.path.join(CACHE_DIR, "normalized_claims.parquet")
+ RULE_A_CACHE = os.path.join(CACHE_DIR, "rule_A.parquet")
+ RULE_B_CACHE = os.path.join(CACHE_DIR, "rule_B.parquet")
+ RULE_C_CACHE = os.path.join(CACHE_DIR, "rule_C.parquet")
+
+ SHORT_WINDOW_DAYS = 7
+ app = Flask(__name__)
+
+ # -------- HELPERS --------
+ def _safe_parse_date(x):
+     # Tolerant date parsing; returns NaT on anything unparseable.
+     # yearfirst=True handles CMS-style YYYYMMDD date strings.
+     if pd.isna(x):
+         return pd.NaT
+     try:
+         return parse(str(x), dayfirst=False, yearfirst=True, fuzzy=True)
+     except Exception:
+         return pd.NaT
+
+ def _first_existing(df, cols):
+     # Case-insensitive lookup: return the first column name in `cols`
+     # that exists in the DataFrame, else None.
+     df_cols = {c.lower(): c for c in df.columns}
+     for c in cols:
+         if c.lower() in df_cols:
+             return df_cols[c.lower()]
+     return None
+
+ # -------- NORMALIZATION --------
+ def normalize_claims(df, source_name="uploaded_file"):
+     # Map heterogeneous CMS-style claim columns onto a common schema.
+     bene_col = _first_existing(df, ["DESYNPUF_ID", "BENE_ID", "BENEFICIARY_ID"])
+     clm_col = _first_existing(df, ["CLM_ID", "CLAIM_ID"])
+     from_col = _first_existing(df, ["CLM_FROM_DT", "SRVC_BGN_DT", "SRVC_BGN_DATE", "LINE_SRVC_DT"])
+     thru_col = _first_existing(df, ["CLM_THRU_DT", "SRVC_END_DT", "SRVC_END_DATE"])
+     line_dt = _first_existing(df, ["LINE_SRVC_DT"])
+     prov_col = _first_existing(df, ["PRF_PHYSN_NPI", "AT_PHYSN_NPI", "OP_PHYSN_NPI", "ORG_NPI_NUM", "PRVDR_NUM", "NPI", "PROVIDER_ID"])
+     proc_col = _first_existing(df, ["HCPCS_CD", "CPT_CODE", "PRCDR_CD", "PRCDR1_CD", "REV_CNTR_HCPCS_CD", "PROCEDURE_CODE"])
+     type_col = _first_existing(df, ["NCH_CLM_TYPE_CD", "CLM_TYPE", "FILE_TYPE"])
+
+     out = pd.DataFrame({
+         "beneficiary_id": df[bene_col] if bene_col else pd.NA,
+         "claim_id": df[clm_col] if clm_col else pd.NA,
+         # Fall back to the line-item service date when claim-level dates are absent.
+         "start_date_raw": df[from_col] if from_col else (df[line_dt] if line_dt else pd.NA),
+         "end_date_raw": df[thru_col] if thru_col else (df[line_dt] if line_dt else pd.NA),
+         "service_date_raw": df[line_dt] if line_dt else (df[from_col] if from_col else pd.NA),
+         "provider_id": df[prov_col] if prov_col else pd.NA,
+         "procedure_code": df[proc_col] if proc_col else pd.NA,
+         "claim_type": df[type_col] if type_col else pd.NA,
+         "source_file": source_name,
+     })
+
+     out["start_date"] = out["start_date_raw"].apply(_safe_parse_date)
+     out["end_date"] = out["end_date_raw"].apply(_safe_parse_date)
+     out["service_date"] = out["service_date_raw"].apply(_safe_parse_date)
+     # Single-day claims: a missing end date defaults to the start date.
+     out.loc[out["end_date"].isna() & out["start_date"].notna(), "end_date"] = out["start_date"]
+
+     for c in ["beneficiary_id", "claim_id", "provider_id", "procedure_code", "claim_type"]:
+         out[c] = out[c].astype(str).str.strip().str.upper()
+
+     # astype(str) turns missing values into the strings "NAN"/"<NA>",
+     # so filter on those rather than notna().
+     out = out[~out["beneficiary_id"].isin(["NAN", "<NA>", ""])]
+     out = out[out["service_date"].notna() | out["start_date"].notna() | out["end_date"].notna()]
+     out["service_date"] = out["service_date"].fillna(out["start_date"])
+
+     return out[[
+         "beneficiary_id", "claim_id", "service_date", "start_date", "end_date",
+         "provider_id", "procedure_code", "claim_type", "source_file"
+     ]].reset_index(drop=True)
+
+ # -------- RULES --------
+ def rule_A_exact_duplicates(claims):
+     # Flag exact duplicates: the same beneficiary billed for the same
+     # procedure on the same service date more than once.
+     key = ["beneficiary_id", "procedure_code", "service_date"]
+     dup = (claims.dropna(subset=key)
+                  .groupby(key, as_index=False).size()
+                  .rename(columns={"size": "count"}))
+     dup = dup[dup["count"] > 1]
+     return dup.merge(claims, on=key, how="left")
+
+ def rule_B_too_frequent_billing(claims, days=7):
+     # Flag repeat billing of the same procedure by the same provider for
+     # the same beneficiary within a `days`-long window.
+     df = claims.dropna(subset=["beneficiary_id", "provider_id", "procedure_code", "service_date"]).copy()
+     df = df.sort_values(["beneficiary_id", "provider_id", "procedure_code", "service_date"])
+     df["prev_service_date"] = df.groupby(["beneficiary_id", "provider_id", "procedure_code"])["service_date"].shift(1)
+     df["days_since_prev"] = (df["service_date"] - df["prev_service_date"]).dt.days
+     return df[(df["prev_service_date"].notna()) & (df["days_since_prev"] >= 0) & (df["days_since_prev"] <= days)]
+
+ def rule_C_overlapping_fast(claims):
+     # Sweep-line over claims sorted by start date: any claim still "active"
+     # (end_date >= current start_date) overlaps the current claim.
+     df = claims.dropna(subset=["beneficiary_id", "procedure_code", "start_date", "end_date"]).copy()
+     results = []
+
+     for (bene, proc), group in df.groupby(["beneficiary_id", "procedure_code"]):
+         group = group.sort_values("start_date")
+         active = []
+         for _, row in group.iterrows():
+             # Retire claims that ended before this one starts.
+             active = [a for a in active if a["end_date"] >= row["start_date"]]
+             for a in active:
+                 results.append({
+                     "claim_id_a": a["claim_id"], "claim_id_b": row["claim_id"],
+                     "beneficiary_id": bene, "procedure_code": proc,
+                     "start_date_a": a["start_date"], "end_date_a": a["end_date"],
+                     "start_date_b": row["start_date"], "end_date_b": row["end_date"],
+                     "provider_id_a": a["provider_id"], "provider_id_b": row["provider_id"]
+                 })
+             active.append(row.to_dict())
+     return pd.DataFrame(results)
+
+ # -------- API ROUTES --------
+ @app.route("/process", methods=["POST"])
+ def process_claims():
+     files = request.files.getlist("files")  # empty list when nothing was uploaded
+
+     if files:  # New upload: normalize, run all rules, and cache the results
+         frames = []
+         for f in files:
+             if not f.filename.lower().endswith(".csv"):
+                 continue
+             try:
+                 df = pd.read_csv(f, dtype=str, low_memory=False, encoding_errors="ignore")
+                 frames.append(normalize_claims(df, f.filename))
+             except Exception as e:
+                 return jsonify({"error": f"Failed to read {f.filename}: {str(e)}"}), 400
+
+         if not frames:
+             return jsonify({"error": "No valid CSV files found"}), 400
+
+         claims = pd.concat(frames, ignore_index=True).drop_duplicates()
+         claims["procedure_code"] = claims["procedure_code"].replace(["", "NAN", "<NA>"], pd.NA)
+         claims.to_parquet(NORMALIZED_CACHE, index=False)
+
+         # Compute rules once & cache
+         rule_A_exact_duplicates(claims).to_parquet(RULE_A_CACHE, index=False)
+         rule_B_too_frequent_billing(claims, days=SHORT_WINDOW_DAYS).to_parquet(RULE_B_CACHE, index=False)
+         rule_C_overlapping_fast(claims).to_parquet(RULE_C_CACHE, index=False)
+
+     else:  # Load from cache
+         if not os.path.exists(NORMALIZED_CACHE):
+             return jsonify({"error": "No cached data available. Upload CSVs first."}), 400
+         claims = pd.read_parquet(NORMALIZED_CACHE)
+
+     # Summaries from cache
+     dup_A = pd.read_parquet(RULE_A_CACHE) if os.path.exists(RULE_A_CACHE) else pd.DataFrame()
+     dup_B = pd.read_parquet(RULE_B_CACHE) if os.path.exists(RULE_B_CACHE) else pd.DataFrame()
+     dup_C = pd.read_parquet(RULE_C_CACHE) if os.path.exists(RULE_C_CACHE) else pd.DataFrame()
+
+     summary = {
+         "Rule A": int(dup_A["claim_id"].nunique()) if not dup_A.empty else 0,
+         "Rule B": int(dup_B["claim_id"].nunique()) if not dup_B.empty else 0,
+         "Rule C": len(dup_C),
+         "Total Claims": len(claims)
+     }
+     return jsonify(summary)
+
+ # -------- DOWNLOAD ENDPOINT --------
+ @app.route("/download/<rule>", methods=["GET"])
+ def download(rule):
+     mapping = {
+         "normalized": NORMALIZED_CACHE,
+         "rule_A": RULE_A_CACHE,
+         "rule_B": RULE_B_CACHE,
+         "rule_C": RULE_C_CACHE,
+     }
+     if rule not in mapping:
+         return jsonify({"error": f"Unknown rule: {rule}"}), 400
+     if not os.path.exists(mapping[rule]):
+         return jsonify({"error": f"No cached file for {rule}"}), 404
+
+     fmt = request.args.get("format", "csv").lower()
+     df = pd.read_parquet(mapping[rule])
+
+     if fmt == "csv":
+         # Encode to bytes explicitly so send_file always gets a binary buffer.
+         buf = io.BytesIO(df.to_csv(index=False).encode("utf-8"))
+         return send_file(buf, mimetype="text/csv", as_attachment=True, download_name=f"{rule}.csv")
+     elif fmt == "parquet":
+         buf = io.BytesIO()
+         df.to_parquet(buf, index=False)
+         buf.seek(0)
+         return send_file(buf, mimetype="application/octet-stream", as_attachment=True, download_name=f"{rule}.parquet")
+     else:
+         return jsonify({"error": "Format must be csv or parquet"}), 400
+
+ if __name__ == "__main__":
+     app.run(host="0.0.0.0", port=7860, debug=False)
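
For reference, a minimal client sketch for exercising the two routes above. It assumes the app is running locally on port 7860 (as in app.run) and that the `requests` package is installed; the CSV file names are placeholders, not files from this repo.

import requests

BASE = "http://localhost:7860"  # assumption: app running locally on its default port

# Upload one or more claim CSVs under the "files" field and get the
# per-rule summary counts back as JSON. "inpatient_claims.csv" is a placeholder.
with open("inpatient_claims.csv", "rb") as f:
    summary = requests.post(f"{BASE}/process", files=[("files", f)]).json()
print(summary)  # e.g. {"Rule A": ..., "Rule B": ..., "Rule C": ..., "Total Claims": ...}

# POSTing with no files re-reads the cached results instead of recomputing.
print(requests.post(f"{BASE}/process").json())

# Download the Rule A hits as CSV (pass format=parquet for parquet output).
resp = requests.get(f"{BASE}/download/rule_A", params={"format": "csv"})
with open("rule_A.csv", "wb") as out:
    out.write(resp.content)

To send several CSVs in one request, repeat the "files" field, e.g. files=[("files", f1), ("files", f2)].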