hf_tender / app.py
SuriRaja's picture
Upload 3 files
a413527 verified
import os, io, random, numpy as np, pandas as pd, streamlit as st
from datetime import datetime
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, average_precision_score, brier_score_loss, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.isotonic import IsotonicRegression
from sklearn.metrics.pairwise import cosine_similarity
from PyPDF2 import PdfReader
import joblib
st.set_page_config(page_title="NTPC/SCCL Tender Win Intelligence – POC v3", layout="wide")
st.title("NTPC/SCCL Tender Win Intelligence – POC v3")
st.caption("Win probability (calibrated) • Compliance-gap • Batch scoring • Decision bands • Price-band guidance • RAG exemplars • Real CSV upload")
SCHEMA = {...} # populated later at runtime for readability
TEXT_HELP = """**Jargons & Checks**
- **PQ** = Pre-Qualification (Turnover / Similar Work / OEM / Solvency / Key Personnel)
- **OEM Back-to-Back** = Authorisation/MoU from OEM for supply/spares/warranty
- **QCBS vs L1** = Quality+Cost Based Selection vs Lowest Price
- **Corrigenda Watch** = Multiple corrigenda → scope/terms changes; track closely
- **MSME / Make-in-India / Local Content** = Preferential clauses; ensure declarations & CA certificates
- **Integrity Pact / EMD** = Use correct formats and exemptions if eligible
"""
# ---------------- Synthetic data w/ price ratio target ----------------
def synth_text(cat, strict_pq, oem, site_visit):
bits = []
if cat == "Solar EPC":
bits.append("EPC for solar PV with SCADA and remote monitoring")
elif cat == "OB Removal":
bits.append("Overburden removal with HEMM deployment and production targets")
elif cat == "CHP Maintenance":
bits.append("Conveyor belt replacement, chute liners, idlers, pulleys, gearboxes")
elif cat == "E&M Spares":
bits.append("Supply of OEM-equivalent spares with QA/QC and test certificates")
else:
bits.append(f"{cat} scope including manpower, materials, QA/QC")
if strict_pq:
bits.append("Strict PQ: turnover > 1.5x estimate; similar work in last 7 years")
else:
bits.append("Moderate PQ with MSME preference applicable")
if oem:
bits.append("OEM authorization letter mandatory")
if site_visit:
bits.append("Pre-bid site visit is mandatory")
return ". ".join(bits)
def make_synth_df(N=900, seed=42):
random.seed(seed); np.random.seed(seed)
orgs = ["NTPC","SCCL"]
cats = ["CHP Maintenance","OB Removal","Solar EPC","E&M Spares","BoP O&M","Overhaul","Ash Handling"]
regions = ["ER","SR","WR","NR"]
qcbs_l1 = ["L1","QCBS"]
rows = []
start_date = np.datetime64("2022-01-01")
for i in range(N):
issue = start_date + np.timedelta64(int(i*2), "D")
org = np.random.choice(orgs, p=[0.6,0.4])
cat = np.random.choice(cats, p=[0.22,0.18,0.15,0.15,0.1,0.1,0.1])
region = np.random.choice(regions)
est = np.round(np.random.lognormal(mean=1.8, sigma=0.6), 2)
emd = np.round(np.clip(np.random.normal(2.0, 0.7), 0.5, 5.0), 2)
ql = np.random.choice(qcbs_l1, p=[0.75,0.25])
window = int(np.clip(np.random.normal(21, 7), 7, 45))
corrig = int(np.clip(np.random.poisson(0.6), 0, 5))
pq_turn = np.round(np.maximum(est*np.random.uniform(0.6,2.2),0.5),2)
pq_sim = np.round(np.maximum(est*np.random.uniform(0.5,2.0),0.3),2)
site_visit = np.random.binomial(1, 0.3 if cat!="Solar EPC" else 0.5)
mii = np.random.binomial(1, 0.6 if org=="NTPC" else 0.4)
lcr = np.round(np.clip(np.random.normal(30, 15), 0, 70), 1)
msme = np.random.binomial(1, 0.35)
retender = 1 if (corrig>=3 and random.random()<0.2) else 0
exp_bidders = int(np.clip(np.random.poisson(3 + (corrig>0) + (ql=="L1")), 1, 12))
hist = np.round(np.clip(np.random.beta(2,5)+(cat=="CHP Maintenance")*0.05,0,1),3)
oem = np.random.binomial(1, 0.55 if cat in ["E&M Spares","CHP Maintenance","Overhaul"] else 0.35)
bidder_sim = np.round(np.maximum(np.random.normal(pq_sim*0.9, pq_sim*0.3),0.2),2)
bidder_turn = np.round(np.maximum(np.random.normal(pq_turn*0.95, pq_turn*0.35),0.3),2)
safety = np.round(np.clip(np.random.normal(0.75,0.15),0.2,0.99),3)
text = synth_text(cat, strict_pq=(pq_turn>est*1.4 or pq_sim>est*1.4), oem=oem==1, site_visit=site_visit==1)
# win label
logit = (0.3*(oem) + 0.6*(hist-0.5) +
0.25*np.tanh((bidder_turn - pq_turn)/max(0.5, pq_turn*0.6)) +
0.25*np.tanh((bidder_sim - pq_sim)/max(0.5, pq_sim*0.6)) -
0.15*(corrig) - 0.2*(retender) - 0.1*(exp_bidders-3)/5 +
0.3*(safety-0.7) + (0.15 if ql=="QCBS" else 0.0) + (0.1 if msme and est<2.0 else 0.0) - 0.05*(site_visit))
p = 1/(1+np.exp(-logit))
win = np.random.binomial(1, np.clip(p, 0.02, 0.9))
# price-to-estimate ratio target (for L1-like behavior; QCBS slightly higher ratios)
base_ratio = np.clip(np.random.normal(0.98 if ql=="L1" else 1.03, 0.05), 0.85, 1.15)
# increase ratio with high corrigenda/retender; decrease with OEM tie-up (efficiency)
ratio = base_ratio + 0.01*corrig + 0.02*retender - 0.01*oem + 0.01*np.random.randn()
ratio = float(np.clip(ratio, 0.80, 1.20))
rows.append({
"tender_id": f"T{i:05d}", "issue_date": str(issue), "org": org,
"package_name": f"{cat}{region}", "category": cat, "region": region,
"est_value_crore": est, "emd_percent": emd, "qcbs_or_l1": ql,
"bid_window_days": window, "corrigenda_count": corrig,
"pq_turnover_cr": pq_turn, "pq_similar_work_cr": pq_sim,
"site_visit_required": site_visit, "make_in_india_clause": mii,
"local_content_req_pct": lcr, "msme_pref": msme,
"retender_flag": retender, "expected_bidders": exp_bidders,
"tender_text": text, "bidder_hist_winrate_cat": hist, "bidder_oem_tieup": oem,
"bidder_similar_work_cr": bidder_sim, "bidder_turnover_cr": bidder_turn,
"bidder_safety_score": safety, "label_win": win, "price_ratio": ratio
})
return pd.DataFrame(rows)
def temporal_split(df, frac=0.8):
df = df.copy()
df["issue_date"] = pd.to_datetime(df["issue_date"])
df = df.sort_values("issue_date").reset_index(drop=True)
cut = int(len(df)*frac)
return df.iloc[:cut].reset_index(drop=True), df.iloc[cut:].reset_index(drop=True)
# ---------------- Preprocess & models ----------------
def build_preprocessor():
num_cols = ["est_value_crore","emd_percent","bid_window_days","corrigenda_count",
"pq_turnover_cr","pq_similar_work_cr","local_content_req_pct",
"expected_bidders","bidder_hist_winrate_cat","bidder_similar_work_cr",
"bidder_turnover_cr","bidder_safety_score"]
cat_cols = ["org","category","region","qcbs_or_l1","site_visit_required","make_in_india_clause",
"msme_pref","retender_flag","bidder_oem_tieup"]
pre = ColumnTransformer([
("num", StandardScaler(with_mean=False), num_cols),
("cat", OneHotEncoder(handle_unknown="ignore"), cat_cols),
("txt", TfidfVectorizer(max_features=12000, ngram_range=(1,2)), "tender_text")
])
return pre
def compliance_gap(payload):
missing = []
text = (payload.get("tender_text") or "").lower()
if "oem" in text and int(payload.get("bidder_oem_tieup",0)) == 0:
missing.append("OEM Authorisation / Back-to-Back")
if float(payload.get("bidder_turnover_cr",0)) < float(payload.get("pq_turnover_cr",0)):
missing.append("PQ Turnover (add JV partner or audited statements)")
if float(payload.get("bidder_similar_work_cr",0)) < float(payload.get("pq_similar_work_cr",0)):
missing.append("PQ Similar Work (JV/Consortium or additional work orders)")
if "site visit" in text and int(payload.get("site_visit_required",0)) == 1:
missing.append("Pre-bid Site Visit Proof/Declaration")
if int(payload.get("msme_pref",0)) == 1:
missing.append("MSME Certificate / Declaration if eligible")
if int(payload.get("make_in_india_clause",0)) == 1:
missing.append("Local Content Declaration (CA certified)")
if float(payload.get("emd_percent",0)) > 0:
missing.append("EMD/BG instrument per format (or exemption, if applicable)")
return missing
# training (classification + calibration + regression)
@st.cache_resource(show_spinner=True)
def train_all(df):
df_lab = df.dropna(subset=["label_win"])
if len(df_lab) < 50:
df_lab = make_synth_df(N=900, seed=42)
train_df, test_df = temporal_split(df_lab, frac=0.8)
y_train = train_df["label_win"].values
y_test = test_df["label_win"].values
pre = build_preprocessor()
clf = RandomForestClassifier(
n_estimators=500, max_depth=None, min_samples_split=4, min_samples_leaf=2,
n_jobs=-1, random_state=42, class_weight="balanced_subsample"
)
cls_pipe = Pipeline([("pre", pre), ("clf", clf)])
cls_pipe.fit(train_df.drop(columns=["label_win","price_ratio"]), y_train)
# calibration
val_cut = int(len(train_df)*0.8)
X_val = train_df.iloc[val_cut:].drop(columns=["label_win","price_ratio"])
y_val = y_train[val_cut:]
p_val = cls_pipe.predict_proba(X_val)[:,1]
cal = IsotonicRegression(out_of_bounds="clip").fit(p_val, y_val)
# metrics
p_test = cls_pipe.predict_proba(test_df.drop(columns=["label_win","price_ratio"]))[:,1]
p_test_cal = cal.transform(p_test)
metrics = {
"train_size": int(len(train_df)),
"test_size": int(len(test_df)),
"roc_auc": float(roc_auc_score(y_test, p_test)),
"ap": float(average_precision_score(y_test, p_test)),
"brier": float(brier_score_loss(y_test, p_test_cal)),
"report": classification_report(y_test, (p_test_cal>=0.5).astype(int), digits=3)
}
# regression for price ratio
y_ratio_train = train_df["price_ratio"].values
reg = RandomForestRegressor(n_estimators=400, random_state=42, n_jobs=-1)
reg_pipe = Pipeline([("pre", pre), ("reg", reg)])
reg_pipe.fit(train_df.drop(columns=["label_win","price_ratio"]), y_ratio_train)
return {"cls": cls_pipe, "cal": cal, "reg": reg_pipe}, metrics
# ---------------- Data input ----------------
st.sidebar.header("Training Data")
mode = st.sidebar.radio("Choose source", ["Synthetic (default)","Upload CSV (real)"], index=0)
up = None
if mode == "Upload CSV (real)":
up = st.sidebar.file_uploader("Upload CSV (schema-compatible)", type=["csv"])
if up is not None:
df = pd.read_csv(up)
else:
df = make_synth_df(N=900, seed=42)
with st.spinner("Training classification, calibration, and price model..."):
arts, metrics = train_all(df)
# decision bands
st.sidebar.header("Decision Bands")
go_thr = st.sidebar.slider("GO threshold (≥)", 0.50, 0.95, 0.65, 0.01)
cond_lo = st.sidebar.slider("Conditional band (low)", 0.30, go_thr-0.01, 0.45, 0.01)
cond_hi = go_thr
def band_label(p):
if p >= go_thr: return "GO"
if cond_lo <= p < cond_hi: return "CONDITIONAL (Fix Gaps/Partner)"
return "NO-BID / PARTNER"
# exemplar upload (RAG)
st.sidebar.header("RAG Exemplars (winning proposal snippets)")
ex_files = st.sidebar.file_uploader("Upload .txt/.md/.pdf (multi-file)", type=["txt","md","pdf"], accept_multiple_files=True)
ex_docs = []
if ex_files:
for f in ex_files:
if f.name.lower().endswith(".pdf"):
try:
reader = PdfReader(f)
text = ""
for page in reader.pages[:10]: # light parse
text += page.extract_text() or ""
except Exception:
text = ""
else:
text = f.read().decode("utf-8", errors="ignore")
if text.strip():
ex_docs.append({"name": f.name, "text": text})
# build exemplar index
rag_vec = None; rag_corpus = None
if ex_docs:
rag_corpus = [d["text"] for d in ex_docs]
rag_vec = TfidfVectorizer(max_features=20000, ngram_range=(1,2)).fit(rag_corpus)
rag_matrix = rag_vec.transform(rag_corpus)
# metrics panel
left, right = st.columns([1,1])
with left:
st.subheader("Evaluation (Temporal Test Split)")
st.markdown(f"**Train**: {metrics['train_size']} &nbsp;&nbsp; **Test**: {metrics['test_size']}")
st.markdown(f"**ROC-AUC**: {metrics['roc_auc']:.3f} \n**Average Precision**: {metrics['ap']:.3f} \n**Brier (calibrated)**: {metrics['brier']:.3f}")
st.code(metrics["report"], language="text")
with right:
st.subheader("Jargons / Checks")
st.markdown(TEXT_HELP)
st.markdown("---")
# ---------------- Single scoring ----------------
st.header("Score a Single Tender")
with st.form("score_one"):
c1,c2,c3 = st.columns(3)
org = c1.selectbox("Organisation", ["NTPC","SCCL"])
category = c2.selectbox("Classification / Package", ["CHP Maintenance","OB Removal","Solar EPC","E&M Spares","BoP O&M","Overhaul","Ash Handling"])
region = c3.selectbox("Region", ["ER","SR","WR","NR"])
c4,c5,c6 = st.columns(3)
est_value_crore = c4.number_input("Estimated Value (Cr)", min_value=0.1, value=3.2, step=0.1)
emd_percent = c5.number_input("EMD (%)", min_value=0.0, value=2.0, step=0.1)
qcbs_or_l1 = c6.selectbox("Evaluation Method", ["L1","QCBS"])
c7,c8,c9 = st.columns(3)
bid_window_days = c7.number_input("Bid Window (days)", 1, 60, 21)
corrigenda_count = c8.number_input("Corrigenda Count", 0, 10, 1)
expected_bidders = c9.number_input("Expected Bidders", 1, 30, 4)
c10,c11,c12 = st.columns(3)
pq_turnover_cr = c10.number_input("PQ Turnover (Cr)", min_value=0.0, value=5.0, step=0.1)
pq_similar_work_cr = c11.number_input("PQ Similar Work (Cr)", min_value=0.0, value=4.0, step=0.1)
local_content_req_pct = c12.number_input("Local Content Req. (%)", 0.0, 100.0, 30.0, 1.0)
c13,c14,c15 = st.columns(3)
site_visit_required = c13.selectbox("Pre-bid Site Visit?", [0,1])
make_in_india_clause = c14.selectbox("Make in India?", [0,1])
msme_pref = c15.selectbox("MSME Preference?", [0,1])
c16,c17,c18 = st.columns(3)
retender_flag = c16.selectbox("Re-Tender?", [0,1])
bidder_hist_winrate_cat = c17.slider("Your Past Win-rate (0-1)", 0.0, 1.0, 0.58, 0.01)
bidder_oem_tieup = c18.selectbox("OEM Tie-up?", [0,1])
c19,c20,c21 = st.columns(3)
bidder_similar_work_cr = c19.number_input("Your Similar Work (Cr)", 0.0, 1000.0, 4.5, 0.1)
bidder_turnover_cr = c20.number_input("Your Avg Turnover (Cr)", 0.0, 1000.0, 6.0, 0.1)
bidder_safety_score = c21.slider("HSE/ESG Safety Score (0-1)", 0.0, 1.0, 0.82, 0.01)
tender_text = st.text_area("Scope/PQ Highlights", "Conveyor belt replacement and idler maintenance with OEM authorization preferred. Pre-bid site visit is mandatory. Turnover > 1.5x estimate.", height=110)
tender_id = st.text_input("E-Tender Ref No.", "ADHOC001")
issue_date = st.date_input("NIT Date", datetime.today().date())
submitted = st.form_submit_button("Predict")
if submitted:
row = {"tender_id": tender_id, "issue_date": str(issue_date), "org": org,
"package_name": f"{category}{region}", "category": category, "region": region,
"est_value_crore": est_value_crore, "emd_percent": emd_percent, "qcbs_or_l1": qcbs_or_l1,
"bid_window_days": bid_window_days, "corrigenda_count": int(corrigenda_count),
"pq_turnover_cr": pq_turnover_cr, "pq_similar_work_cr": pq_similar_work_cr,
"site_visit_required": site_visit_required, "make_in_india_clause": make_in_india_clause,
"local_content_req_pct": local_content_req_pct, "msme_pref": msme_pref,
"retender_flag": retender_flag, "expected_bidders": int(expected_bidders),
"tender_text": tender_text, "bidder_hist_winrate_cat": bidder_hist_winrate_cat,
"bidder_oem_tieup": bidder_oem_tieup, "bidder_similar_work_cr": bidder_similar_work_cr,
"bidder_turnover_cr": bidder_turnover_cr, "bidder_safety_score": bidder_safety_score}
# win prob
base = arts["cls"].predict_proba(pd.DataFrame([row]))[:,1][0]
proba = float(arts["cal"].transform([base])[0])
st.success(f"Calibrated win probability: **{proba:.3f}** | Decision: **{('GO' if proba>=0.65 else ('CONDITIONAL' if proba>=0.45 else 'NO-BID / PARTNER'))}**")
# compliance pointers
tips = []
gaps = compliance_gap(row)
if gaps: tips.append("Compliance gaps: " + "; ".join(gaps))
if bidder_oem_tieup == 0 and "oem" in tender_text.lower():
tips.append("Secure OEM Back-to-Back / Authorisation letter.")
if corrigenda_count >= 2:
tips.append("High Corrigenda: monitor changes; tailor methodology and pricing accordingly.")
if expected_bidders >= 6 and qcbs_or_l1 == "L1":
tips.append("Crowded L1: tighten pricing, reduce optional scope, enhance delivery assurances.")
if bidder_hist_winrate_cat < 0.45:
tips.append("Strengthen past performance (completion certs, LD-free delivery, client references).")
if tips:
st.info("**Improvement pointers**:\n- " + "\n- ".join(tips))
# price-band guidance
ratio_pred = float(arts["reg"].predict(pd.DataFrame([row]))[0])
# create a band around the prediction using heuristic ±IQR from training errors if available -> fallback ±3%
# (Since cache doesn't keep residuals, use ±0.03 default)
low = max(0.80, ratio_pred - 0.03)
high = min(1.20, ratio_pred + 0.03)
st.subheader("Suggested Price Band")
st.markdown(f"- **Predicted price-to-estimate ratio**: `{ratio_pred:.3f}`")
st.markdown(f"- **Suggested band**: **{low:.3f}{high:.3f} × Estimated Value**")
st.caption("Example: If estimate is ₹100.0 Cr, quote within ₹{:.1f} – ₹{:.1f} Cr.".format(100*low, 100*high))
# RAG exemplar panel
if 'rag_vec' in globals() and rag_vec is not None and ex_docs:
st.subheader("RAG Exemplar Snippets (from your uploads)")
query = (tender_text or "") + " " + category
q_vec = rag_vec.transform([query])
sims = cosine_similarity(q_vec, rag_matrix).ravel()
topk = sims.argsort()[::-1][:3]
for idx in topk:
doc = ex_docs[idx]
st.markdown(f"**{doc['name']}** — similarity `{sims[idx]:.3f}`")
snippet = doc["text"][:800].strip().replace("\n","\n\n")
st.code(snippet, language="markdown")
else:
st.caption("Upload winning proposal snippets in the sidebar (.txt/.md/.pdf) to enable exemplar retrieval.")
st.markdown("---")
# batch scoring
st.header("Batch Scoring (CSV → Excel with price band)")
batch = st.file_uploader("Upload CSV to score (same columns as schema, label_win optional)", type=["csv"])
if batch is not None:
df_in = pd.read_csv(batch)
probs = arts["cal"].transform(arts["cls"].predict_proba(df_in)[:,1])
ratio_preds = arts["reg"].predict(df_in)
bands_lo = np.clip(ratio_preds - 0.03, 0.80, 1.20)
bands_hi = np.clip(ratio_preds + 0.03, 0.80, 1.20)
def band_label(p):
if p >= 0.65: return "GO"
if p >= 0.45: return "CONDITIONAL"
return "NO-BID / PARTNER"
out = df_in.copy()
out["win_probability"] = probs
out["decision_band"] = [band_label(p) for p in probs]
out["pred_price_ratio"] = ratio_preds
out["price_band_low"] = bands_lo
out["price_band_high"] = bands_hi
recs = []
for _, r in df_in.iterrows():
payload = r.to_dict()
gaps = compliance_gap(payload)
rec = []
if gaps: rec.append("Gaps: " + "; ".join(gaps))
if int(payload.get("bidder_oem_tieup",0))==0 and "oem" in str(payload.get("tender_text","")).lower():
rec.append("Add OEM letter")
if int(payload.get("corrigenda_count",0))>=2:
rec.append("Monitor corrigenda closely")
recs.append(" | ".join(rec))
out["recommendations"] = recs
buf = io.BytesIO()
with pd.ExcelWriter(buf, engine="openpyxl") as xlw:
out.to_excel(xlw, index=False, sheet_name="Scored")
st.download_button("Download Scored Excel", data=buf.getvalue(), file_name="tender_scored_with_price.xlsx", mime="application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")
# schema object shown for users
SCHEMA = {
"tender_id": "E-Tender Ref No.", "issue_date": "NIT Date (YYYY-MM-DD)",
"org": "Organisation (NTPC / SCCL)", "package_name": "Package / Work Name (optional)",
"category": "Classification / Package Type", "region": "Region / Location (ER/SR/WR/NR)",
"est_value_crore": "Estimated Value (INR Cr)", "emd_percent": "EMD (%)",
"qcbs_or_l1": "Evaluation Method (L1/QCBS)", "bid_window_days": "Bid Window (days)",
"corrigenda_count": "Corrigenda Count", "pq_turnover_cr": "PQ: Avg Turnover required (Cr)",
"pq_similar_work_cr": "PQ: Similar Work value required (Cr)", "site_visit_required": "Site Visit Required? (0/1)",
"make_in_india_clause": "Make in India Clause (0/1)", "local_content_req_pct": "Local Content Requirement (%)",
"msme_pref": "MSME Preference (0/1)", "retender_flag": "Re-Tender (0/1)",
"expected_bidders": "Expected Bidders (best guess)", "tender_text": "Scope/PQ highlights (free text)",
"bidder_hist_winrate_cat": "Your past win-rate (0-1)", "bidder_oem_tieup": "OEM Tie-up (0/1)",
"bidder_similar_work_cr": "Your Similar Work (Cr)", "bidder_turnover_cr": "Your Avg Turnover (Cr)",
"bidder_safety_score": "HSE/ESG / Safety Score (0-1)", "label_win": "Target: Win (1) / Lose (0)",
"price_ratio": "Train-only (actual price / estimate)"
}
with st.expander("Schema (column → meaning)"):
st.json(SCHEMA)
st.header("Export trained models")
if st.button("Download model bundle"):
buf = io.BytesIO()
joblib.dump(arts, buf) # contains cls, cal, reg
st.download_button("Download model_bundle.pkl", data=buf.getvalue(), file_name="model_bundle.pkl")
st.caption("Tip: Upload your own winning proposal snippets (txt/md/pdf) to enable exemplar retrieval for clause phrasing.")