Spaces:

Zeqhx
/

Automated-CV-Parser

Running

App Files Files Community

Automated-CV-Parser / pages /2_Analytics.py

Zeqhx

Deploy CV parser dashboard with dataset 2 model

c59578d verified 2 days ago

raw

history blame contribute delete

4.4 kB

	"""Analytics page — batch-upload CVs and aggregate extracted entities."""
	from collections import Counter

	import pandas as pd
	import streamlit as st

	import config
	from lib.model import predict, group_entities
	from lib.extract import extract_text
	from lib.ui import model_selector
	from lib import viz

	# Configure the browser tab title/icon and use the wide page layout.
	st.set_page_config(page_title="Analytics", page_icon="📊", layout="wide")

	# Object returned by the project's model selector. It exposes `is_fallback`
	# (checked just below) and is the handle passed to `predict` during analysis.
	lm = model_selector()

	st.title("📊 CV Corpus Analytics")
	intro = (
	    "Upload a batch of CVs — select every file in a folder — to see aggregate "
	    "skills and roles across the set."
	)
	st.caption(intro)

	# In fallback mode the predictions are not meaningful, so warn up front.
	demo_mode = lm.is_fallback
	if demo_mode:
	    st.warning(
	        "Demo mode: predictions come from an untrained head and are not meaningful.",
	        icon="⚠️",
	    )

	uploader_label = (
	    "Upload CVs (PDF / DOCX / TXT). Tip: open a folder and Ctrl/Cmd-A to select all."
	)
	accepted_extensions = ["pdf", "docx", "txt"]
	uploads = st.file_uploader(
	    uploader_label,
	    type=accepted_extensions,
	    accept_multiple_files=True,
	)

	# Nothing uploaded yet: show a hint and end this script run here, so none of
	# the code below executes without files.
	if not uploads:
	    st.info("Upload one or more CVs to build the analytics.")
	    st.stop()

	# ---- Analyse (runs when the button is pressed) -------------------------------
	def _cv_summary(filename, grouped):
	    """Return the per-file breakdown row: the file name plus entity counts."""
	    return {
	        "file": filename,
	        "job_titles": len(grouped["JOB_TITLE"]),
	        "skills": len(grouped["SKILL"]),
	        "education": len(grouped["EDUCATION"]),
	    }


	total = len(uploads)
	if st.button(f"Analyze {total} file(s)", type="primary"):
	    # One Counter per entity type: entity value -> number of CVs containing it.
	    tallies = {kind: Counter() for kind in config.ENTITY_TYPES}
	    summaries = []   # one breakdown row per CV that could be read
	    unreadable = []  # (file name, error) for CVs whose text extraction failed
	    bar = st.progress(0.0, text="Processing…")

	    for done, cv_file in enumerate(uploads, start=1):
	        raw_text, problem = extract_text(cv_file)
	        if not problem:
	            _, entities = predict(raw_text, lm)
	            grouped = group_entities(entities)  # de-duped per CV
	            # Values are de-duplicated per CV, so each CV adds at most one
	            # to the tally of any given entity.
	            for kind in config.ENTITY_TYPES:
	                tally = tallies[kind]
	                for entity in grouped[kind]:
	                    tally[entity] += 1
	            summaries.append(_cv_summary(cv_file.name, grouped))
	        else:
	            unreadable.append((cv_file.name, problem))
	        # Advance the bar for every file, readable or not.
	        bar.progress(done / total, text=f"Processed {done}/{total}")
	    bar.empty()

	    # Stored in session state so the results can be read back on later runs.
	    st.session_state["analytics"] = {
	        "counters": tallies,
	        "rows": summaries,
	        "failures": unreadable,
	        "n": total,
	    }

	# ---- Render (persists across reruns) ----------------------------------------
	# Results are read back from st.session_state rather than from the variables of
	# the analysis step, so they are drawn on every run that has stored results.
	data = st.session_state.get("analytics")
	if data:
	    counters, rows, failures = data["counters"], data["rows"], data["failures"]

	    # Headline metrics: number of per-file rows, plus the number of distinct
	    # entities recorded for each of the three headline entity types.
	    a, b, c, d = st.columns(4)
	    a.metric("CVs processed", len(rows))
	    b.metric("Unique job titles", len(counters["JOB_TITLE"]))
	    c.metric("Unique skills", len(counters["SKILL"]))
	    d.metric("Unique education", len(counters["EDUCATION"]))
	    if failures:
	        with st.expander(f"⚠️ {len(failures)} file(s) could not be read"):
	            for name, err in failures:
	                st.write(f"- {name} — {err}")

	    st.subheader("☁️ Skills word cloud")
	    fig = viz.wordcloud_figure(dict(counters["SKILL"]))
	    if fig is not None:
	        st.pyplot(fig)
	    elif counters["SKILL"]:
	        # Skills exist but no figure came back: point at the plotting packages.
	        st.info("Install `wordcloud` + `matplotlib` to see the cloud. Showing top skills below instead.")
	    else:
	        st.caption("No skills extracted.")

	    st.subheader("🏆 Most common entities")
	    # One column per entity type. zip() stops at the shorter input, so a fixed
	    # column count would silently drop every entity type beyond that count.
	    cols = st.columns(len(config.ENTITY_TYPES))
	    for col, etype in zip(cols, config.ENTITY_TYPES):
	        with col:
	            bar = viz.top_bar_figure(counters[etype], config.ENTITY_LABELS[etype],
	                                     config.ENTITY_COLORS[etype])
	            if bar is not None:
	                st.plotly_chart(bar, use_container_width=True)
	            else:
	                # No chart returned: fall back to a plain top-15 table.
	                top = counters[etype].most_common(15)
	                st.write(f"{config.ENTITY_LABELS[etype]}")
	                st.table(pd.DataFrame(top, columns=["entity", "count"]) if top
	                         else pd.DataFrame(columns=["entity", "count"]))

	    st.subheader("📋 Per-file breakdown")
	    df = pd.DataFrame(rows)
	    st.dataframe(df, use_container_width=True)

	    # Full long-form export of every (file-agnostic) entity count.
	    # Built from one flat list of records instead of concatenating a frame per
	    # entity type: a type with no hits would contribute an empty frame, and
	    # concatenating empty frames is deprecated in pandas (FutureWarning).
	    # Columns and row order match the per-type concatenation.
	    export = pd.DataFrame(
	        [
	            {"type": config.ENTITY_LABELS[t], "entity": entity, "count": count}
	            for t in config.ENTITY_TYPES
	            for entity, count in counters[t].items()
	        ],
	        columns=["type", "entity", "count"],
	    )
	    st.download_button("⬇️ Download entity counts (CSV)",
	                       data=export.to_csv(index=False),
	                       file_name="cv_entity_counts.csv", mime="text/csv")