"""Analytics page โ€” batch-upload CVs and aggregate extracted entities.""" from collections import Counter import pandas as pd import streamlit as st import config from lib.model import predict, group_entities from lib.extract import extract_text from lib.ui import model_selector from lib import viz st.set_page_config(page_title="Analytics", page_icon="๐Ÿ“Š", layout="wide") lm = model_selector() st.title("๐Ÿ“Š CV Corpus Analytics") st.caption("Upload a batch of CVs โ€” select every file in a folder โ€” to see aggregate " "skills and roles across the set.") if lm.is_fallback: st.warning("Demo mode: predictions come from an untrained head and are not meaningful.", icon="โš ๏ธ") uploads = st.file_uploader( "Upload CVs (PDF / DOCX / TXT). Tip: open a folder and Ctrl/Cmd-A to select all.", type=["pdf", "docx", "txt"], accept_multiple_files=True, ) if not uploads: st.info("Upload one or more CVs to build the analytics.") st.stop() if st.button(f"Analyze {len(uploads)} file(s)", type="primary"): counters = {t: Counter() for t in config.ENTITY_TYPES} rows = [] failures = [] progress = st.progress(0.0, text="Processingโ€ฆ") for i, up in enumerate(uploads, start=1): text, err = extract_text(up) if err: failures.append((up.name, err)) else: _, entities = predict(text, lm) grouped = group_entities(entities) # de-duped per CV for etype in config.ENTITY_TYPES: for val in grouped[etype]: counters[etype][val] += 1 rows.append({ "file": up.name, "job_titles": len(grouped["JOB_TITLE"]), "skills": len(grouped["SKILL"]), "education": len(grouped["EDUCATION"]), }) progress.progress(i / len(uploads), text=f"Processed {i}/{len(uploads)}") progress.empty() st.session_state["analytics"] = {"counters": counters, "rows": rows, "failures": failures, "n": len(uploads)} # ---- Render (persists across reruns) ---------------------------------------- data = st.session_state.get("analytics") if data: counters, rows, failures = data["counters"], data["rows"], data["failures"] a, b, c, d = st.columns(4) a.metric("CVs processed", len(rows)) b.metric("Unique job titles", len(counters["JOB_TITLE"])) c.metric("Unique skills", len(counters["SKILL"])) d.metric("Unique education", len(counters["EDUCATION"])) if failures: with st.expander(f"โš ๏ธ {len(failures)} file(s) could not be read"): for name, err in failures: st.write(f"- **{name}** โ€” {err}") st.subheader("โ˜๏ธ Skills word cloud") fig = viz.wordcloud_figure(dict(counters["SKILL"])) if fig is not None: st.pyplot(fig) elif counters["SKILL"]: st.info("Install `wordcloud` + `matplotlib` to see the cloud. Showing top skills below instead.") else: st.caption("No skills extracted.") st.subheader("๐Ÿ† Most common entities") cols = st.columns(3) for col, etype in zip(cols, config.ENTITY_TYPES): with col: bar = viz.top_bar_figure(counters[etype], config.ENTITY_LABELS[etype], config.ENTITY_COLORS[etype]) if bar is not None: st.plotly_chart(bar, use_container_width=True) else: top = counters[etype].most_common(15) st.write(f"**{config.ENTITY_LABELS[etype]}**") st.table(pd.DataFrame(top, columns=["entity", "count"]) if top else pd.DataFrame(columns=["entity", "count"])) st.subheader("๐Ÿ“‹ Per-file breakdown") df = pd.DataFrame(rows) st.dataframe(df, use_container_width=True) # Full long-form export of every (file-agnostic) entity count export = pd.concat([ pd.DataFrame({"type": config.ENTITY_LABELS[t], "entity": list(counters[t].keys()), "count": list(counters[t].values())}) for t in config.ENTITY_TYPES ], ignore_index=True) st.download_button("โฌ‡๏ธ Download entity counts (CSV)", data=export.to_csv(index=False), file_name="cv_entity_counts.csv", mime="text/csv")