Spaces:
Running
Running
| """Analytics page — batch-upload CVs and aggregate extracted entities.""" | |
| from collections import Counter | |
| import pandas as pd | |
| import streamlit as st | |
| import config | |
| from lib.model import predict, group_entities | |
| from lib.extract import extract_text | |
| from lib.ui import model_selector | |
| from lib import viz | |
# ---- Page setup ---------------------------------------------------------------
st.set_page_config(layout="wide", page_icon="📊", page_title="Analytics")

# Handle returned by the shared model picker. This page reads its `is_fallback`
# flag below and hands the object to predict() when a batch is analyzed.
lm = model_selector()

st.title("📊 CV Corpus Analytics")
st.caption(
    "Upload a batch of CVs — select every file in a folder — to see aggregate "
    "skills and roles across the set."
)

# Tell the user up front when the selected model is the fallback one.
if lm.is_fallback:
    st.warning(
        "Demo mode: predictions come from an untrained head and are not meaningful.",
        icon="⚠️",
    )

# ---- Upload gate --------------------------------------------------------------
uploads = st.file_uploader(
    "Upload CVs (PDF / DOCX / TXT). Tip: open a folder and Ctrl/Cmd-A to select all.",
    accept_multiple_files=True,
    type=["pdf", "docx", "txt"],
)

# Nothing selected yet: show a hint and end this script run here, so none of
# the analysis / rendering code further down executes.
if not uploads:
    st.info("Upload one or more CVs to build the analytics.")
    st.stop()
# ---- Analyze (only on the run triggered by the button click) -------------------
if st.button(f"Analyze {len(uploads)} file(s)", type="primary"):
    # One tally per entity type, keyed by entity value.
    counters = {t: Counter() for t in config.ENTITY_TYPES}
    rows = []       # one summary dict per successfully analyzed file
    failures = []   # (file name, error message) for every file that was skipped
    total = len(uploads)  # loop-invariant; used for the progress fraction/label

    progress = st.progress(0.0, text="Processing…")
    for i, up in enumerate(uploads, start=1):
        # extract_text yields (text, err); a truthy err means the file is unusable.
        text, err = extract_text(up)

        if not err:
            # Isolate model failures per file: without this guard, a single CV
            # that makes predict()/group_entities() raise would abort the whole
            # batch, drop the results of files already processed and leave the
            # progress bar stuck on screen. The broad catch is deliberate at
            # this per-file boundary and the error is surfaced to the user via
            # the `failures` list rather than swallowed.
            try:
                _, entities = predict(text, lm)
                grouped = group_entities(entities)  # de-duped per CV
            except Exception as exc:  # noqa: BLE001 - reported, not hidden
                err = f"entity extraction failed: {type(exc).__name__}: {exc}"

        if err:
            failures.append((up.name, err))
        else:
            # NOTE(review): given the per-CV de-duplication noted above, each
            # count is presumably "number of CVs mentioning this entity" —
            # confirm against lib.model.group_entities.
            for etype in config.ENTITY_TYPES:
                for val in grouped[etype]:
                    counters[etype][val] += 1
            rows.append({
                "file": up.name,
                "job_titles": len(grouped["JOB_TITLE"]),
                "skills": len(grouped["SKILL"]),
                "education": len(grouped["EDUCATION"]),
            })

        progress.progress(i / total, text=f"Processed {i}/{total}")

    progress.empty()

    # Persist the aggregate so the render section can redraw it on later reruns
    # (e.g. after a download click) without re-processing the files.
    st.session_state["analytics"] = {
        "counters": counters,
        "rows": rows,
        "failures": failures,
        "n": total,
    }
# ---- Render (persists across reruns) ----------------------------------------
# Everything below draws from the aggregate stored in session state, so it is
# shown on every rerun once a batch has been analyzed.
data = st.session_state.get("analytics")
if data:
    counters = data["counters"]
    rows = data["rows"]
    failures = data["failures"]

    # Headline numbers, one metric per column, left to right.
    metric_slots = st.columns(4)
    headline = (
        ("CVs processed", len(rows)),
        ("Unique job titles", len(counters["JOB_TITLE"])),
        ("Unique skills", len(counters["SKILL"])),
        ("Unique education", len(counters["EDUCATION"])),
    )
    for slot, (label, value) in zip(metric_slots, headline):
        slot.metric(label, value)

    # Files that were skipped during analysis, listed with their error message.
    if failures:
        with st.expander(f"⚠️ {len(failures)} file(s) could not be read"):
            for name, err in failures:
                st.write(f"- **{name}** — {err}")

    st.subheader("☁️ Skills word cloud")
    skill_counts = counters["SKILL"]
    cloud = viz.wordcloud_figure(dict(skill_counts))
    if cloud is None:
        # No figure came back: either the plotting extras are unavailable
        # (skills exist) or there is simply nothing to draw.
        if skill_counts:
            st.info("Install `wordcloud` + `matplotlib` to see the cloud. Showing top skills below instead.")
        else:
            st.caption("No skills extracted.")
    else:
        st.pyplot(cloud)

    st.subheader("🏆 Most common entities")
    for slot, etype in zip(st.columns(3), config.ENTITY_TYPES):
        with slot:
            chart = viz.top_bar_figure(
                counters[etype],
                config.ENTITY_LABELS[etype],
                config.ENTITY_COLORS[etype],
            )
            if chart is not None:
                st.plotly_chart(chart, use_container_width=True)
                continue

            # No chart available: fall back to a plain table of the top 15.
            leaders = counters[etype].most_common(15)
            st.write(f"**{config.ENTITY_LABELS[etype]}**")
            if leaders:
                table = pd.DataFrame(leaders, columns=["entity", "count"])
            else:
                table = pd.DataFrame(columns=["entity", "count"])
            st.table(table)

    st.subheader("📋 Per-file breakdown")
    df = pd.DataFrame(rows)
    st.dataframe(df, use_container_width=True)

    # Full long-form export of every (file-agnostic) entity count:
    # one row per (entity type label, entity value) with its tally.
    frames = []
    for etype in config.ENTITY_TYPES:
        tally = counters[etype]
        frames.append(
            pd.DataFrame({
                "type": config.ENTITY_LABELS[etype],
                "entity": list(tally.keys()),
                "count": list(tally.values()),
            })
        )
    export = pd.concat(frames, ignore_index=True)

    st.download_button(
        "⬇️ Download entity counts (CSV)",
        data=export.to_csv(index=False),
        file_name="cv_entity_counts.csv",
        mime="text/csv",
    )