Automated-CV-Parser / pages /2_Analytics.py
Zeqhx's picture
Deploy CV parser dashboard with dataset 2 model
c59578d verified
"""Analytics page — batch-upload CVs and aggregate extracted entities."""
from collections import Counter
import pandas as pd
import streamlit as st
import config
from lib.model import predict, group_entities
from lib.extract import extract_text
from lib.ui import model_selector
from lib import viz
# ---- Page setup & upload ------------------------------------------------------
# Page configuration is issued before any other Streamlit element on the page.
st.set_page_config(
    page_title="Analytics",
    page_icon="📊",
    layout="wide",
)

# Model handle used by the analysis step; it exposes `is_fallback`.
lm = model_selector()

st.title("📊 CV Corpus Analytics")
st.caption(
    "Upload a batch of CVs — select every file in a folder — to see aggregate "
    "skills and roles across the set."
)

# Tell the user up front when the predictions cannot be trusted.
if lm.is_fallback:
    st.warning(
        "Demo mode: predictions come from an untrained head and are not meaningful.",
        icon="⚠️",
    )

uploads = st.file_uploader(
    "Upload CVs (PDF / DOCX / TXT). Tip: open a folder and Ctrl/Cmd-A to select all.",
    type=["pdf", "docx", "txt"],
    accept_multiple_files=True,
)

# Nothing to analyse yet: show a hint and end this script run here.
if not uploads:
    st.info("Upload one or more CVs to build the analytics.")
    st.stop()
# ---- Analyse (runs only on the rerun triggered by the button) -----------------
analyze_clicked = st.button(f"Analyze {len(uploads)} file(s)", type="primary")
if analyze_clicked:
    total_files = len(uploads)

    # One Counter per entity type, keyed by entity value.
    tallies = {etype: Counter() for etype in config.ENTITY_TYPES}
    per_file = []    # one summary row per CV that was read successfully
    unreadable = []  # (file name, error) pairs for CVs that could not be read
    progress_bar = st.progress(0.0, text="Processing…")

    for done, upload in enumerate(uploads, start=1):
        # extract_text yields a (text, error) pair; a truthy error means failure.
        text, err = extract_text(upload)
        if not err:
            _, entities = predict(text, lm)
            # Per the original note, group_entities de-duplicates within a CV,
            # so each tally goes up by at most one per entity per file.
            grouped = group_entities(entities)
            for etype in config.ENTITY_TYPES:
                tally = tallies[etype]
                for val in grouped[etype]:
                    tally[val] += 1
            per_file.append(
                {
                    "file": upload.name,
                    "job_titles": len(grouped["JOB_TITLE"]),
                    "skills": len(grouped["SKILL"]),
                    "education": len(grouped["EDUCATION"]),
                }
            )
        else:
            unreadable.append((upload.name, err))

        # Advance the bar for every file, readable or not.
        progress_bar.progress(done / total_files, text=f"Processed {done}/{total_files}")

    progress_bar.empty()

    # Stash the results so the render section can redraw them on later reruns.
    st.session_state["analytics"] = {
        "counters": tallies,
        "rows": per_file,
        "failures": unreadable,
        "n": total_files,
    }
# ---- Render (persists across reruns) ----------------------------------------
# Results live in session state, so they are redrawn on every rerun once an
# analysis has been stored.
data = st.session_state.get("analytics")
if data:
    tallies = data["counters"]
    per_file = data["rows"]
    unreadable = data["failures"]

    # Headline numbers, one metric per column.
    metric_slots = st.columns(4)
    headline = (
        ("CVs processed", len(per_file)),
        ("Unique job titles", len(tallies["JOB_TITLE"])),
        ("Unique skills", len(tallies["SKILL"])),
        ("Unique education", len(tallies["EDUCATION"])),
    )
    for slot, (label, value) in zip(metric_slots, headline):
        slot.metric(label, value)

    # Files that failed text extraction, listed inside a collapsed expander.
    if unreadable:
        with st.expander(f"⚠️ {len(unreadable)} file(s) could not be read"):
            for name, err in unreadable:
                st.write(f"- **{name}** — {err}")

    st.subheader("☁️ Skills word cloud")
    skill_tally = tallies["SKILL"]
    cloud = viz.wordcloud_figure(dict(skill_tally))
    if cloud is None:
        # No figure came back: either there are skills but no cloud could be
        # drawn, or there were no skills at all.
        if skill_tally:
            st.info("Install `wordcloud` + `matplotlib` to see the cloud. Showing top skills below instead.")
        else:
            st.caption("No skills extracted.")
    else:
        st.pyplot(cloud)

    st.subheader("🏆 Most common entities")
    for slot, etype in zip(st.columns(3), config.ENTITY_TYPES):
        with slot:
            chart = viz.top_bar_figure(
                tallies[etype],
                config.ENTITY_LABELS[etype],
                config.ENTITY_COLORS[etype],
            )
            if chart is None:
                # Fall back to a plain table of the 15 most frequent entities.
                ranked = tallies[etype].most_common(15)
                st.write(f"**{config.ENTITY_LABELS[etype]}**")
                if ranked:
                    fallback_table = pd.DataFrame(ranked, columns=["entity", "count"])
                else:
                    fallback_table = pd.DataFrame(columns=["entity", "count"])
                st.table(fallback_table)
            else:
                st.plotly_chart(chart, use_container_width=True)

    st.subheader("📋 Per-file breakdown")
    breakdown = pd.DataFrame(per_file)
    st.dataframe(breakdown, use_container_width=True)

    # Long-form export: one row per (entity type, entity) with its count
    # across the whole batch, not per file.
    frames = []
    for etype in config.ENTITY_TYPES:
        tally = tallies[etype]
        frames.append(
            pd.DataFrame(
                {
                    "type": config.ENTITY_LABELS[etype],
                    "entity": list(tally.keys()),
                    "count": list(tally.values()),
                }
            )
        )
    export_frame = pd.concat(frames, ignore_index=True)

    st.download_button(
        "⬇️ Download entity counts (CSV)",
        data=export_frame.to_csv(index=False),
        file_name="cv_entity_counts.csv",
        mime="text/csv",
    )