File size: 4,398 Bytes
c59578d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
"""Analytics page — batch-upload CVs and aggregate extracted entities."""
from collections import Counter

import pandas as pd
import streamlit as st

import config
from lib.model import predict, group_entities
from lib.extract import extract_text
from lib.ui import model_selector
from lib import viz

# Configure the page: tab title, tab icon and the wide layout.
st.set_page_config(layout="wide", page_icon="📊", page_title="Analytics")

# Model handle returned by the shared selector in lib.ui; it is passed to
# predict() when the batch is analyzed further down.
lm = model_selector()

# Page header and a short description of what the page does.
st.title("📊 CV Corpus Analytics")
st.caption(
    "Upload a batch of CVs — select every file in a folder — "
    "to see aggregate skills and roles across the set."
)

# Tell the user when the selected model reports itself as a fallback.
if lm.is_fallback:
    st.warning(
        "Demo mode: predictions come from an untrained head and are not meaningful.",
        icon="⚠️",
    )

# Multi-file upload restricted to the three accepted extensions.
uploads = st.file_uploader(
    "Upload CVs (PDF / DOCX / TXT). Tip: open a folder and Ctrl/Cmd-A to select all.",
    accept_multiple_files=True,
    type=["pdf", "docx", "txt"],
)

# With no files selected, show a hint and halt the script here.
if not uploads:
    st.info("Upload one or more CVs to build the analytics.")
    st.stop()

n_files = len(uploads)
if st.button(f"Analyze {n_files} file(s)", type="primary"):
    # Accumulators for this run:
    #   counters - one Counter per entity type in config.ENTITY_TYPES
    #   rows     - one summary dict per successfully processed file
    #   failures - (file name, error) pairs for files extract_text rejected
    counters = {etype: Counter() for etype in config.ENTITY_TYPES}
    rows = []
    failures = []
    progress = st.progress(0.0, text="Processing…")

    for done, upload in enumerate(uploads, start=1):
        text, err = extract_text(upload)
        if not err:
            # Only the second element of predict()'s result is used here.
            _, entities = predict(text, lm)
            # Per the original author's note, grouping de-dupes values per CV,
            # so each value adds at most 1 per file to its tally.
            grouped = group_entities(entities)
            for etype in config.ENTITY_TYPES:
                tally = counters[etype]
                for value in grouped[etype]:
                    tally[value] += 1
            summary = {
                "file": upload.name,
                "job_titles": len(grouped["JOB_TITLE"]),
                "skills": len(grouped["SKILL"]),
                "education": len(grouped["EDUCATION"]),
            }
            rows.append(summary)
        else:
            failures.append((upload.name, err))
        # Advance the bar for every file, whether it succeeded or failed.
        progress.progress(done / n_files, text=f"Processed {done}/{n_files}")
    progress.empty()

    # Stash the results so the render section can redraw them on later reruns.
    st.session_state["analytics"] = {
        "counters": counters,
        "rows": rows,
        "failures": failures,
        "n": n_files,
    }

# ---- Render (persists across reruns) ----------------------------------------
# Everything below draws from the results saved in session state, so the page
# can be redrawn without pressing the Analyze button again.
data = st.session_state.get("analytics")
if data:
    counters = data["counters"]
    rows = data["rows"]
    failures = data["failures"]

    # Headline metrics, one per column.
    metric_slots = st.columns(4)
    headline = (
        ("CVs processed", len(rows)),
        ("Unique job titles", len(counters["JOB_TITLE"])),
        ("Unique skills", len(counters["SKILL"])),
        ("Unique education", len(counters["EDUCATION"])),
    )
    for slot, (label, value) in zip(metric_slots, headline):
        slot.metric(label, value)

    # Collapsible list of files that failed text extraction.
    if failures:
        with st.expander(f"⚠️ {len(failures)} file(s) could not be read"):
            for fname, reason in failures:
                st.write(f"- **{fname}** — {reason}")

    # Word cloud of skills; falls back to a message when no figure comes back.
    st.subheader("☁️ Skills word cloud")
    skill_counts = counters["SKILL"]
    fig = viz.wordcloud_figure(dict(skill_counts))
    if fig is None:
        if skill_counts:
            st.info("Install `wordcloud` + `matplotlib` to see the cloud. Showing top skills below instead.")
        else:
            st.caption("No skills extracted.")
    else:
        st.pyplot(fig)

    # One column per entity type: a bar chart when viz returns one, otherwise
    # a plain table of the 15 most common values.
    st.subheader("🏆 Most common entities")
    for column, etype in zip(st.columns(3), config.ENTITY_TYPES):
        with column:
            bar = viz.top_bar_figure(
                counters[etype],
                config.ENTITY_LABELS[etype],
                config.ENTITY_COLORS[etype],
            )
            if bar is not None:
                st.plotly_chart(bar, use_container_width=True)
                continue
            top = counters[etype].most_common(15)
            st.write(f"**{config.ENTITY_LABELS[etype]}**")
            if top:
                table = pd.DataFrame(top, columns=["entity", "count"])
            else:
                table = pd.DataFrame(columns=["entity", "count"])
            st.table(table)

    # One row per successfully processed file.
    st.subheader("📋 Per-file breakdown")
    df = pd.DataFrame(rows)
    st.dataframe(df, use_container_width=True)

    # Long-form export: one (type, entity, count) row per distinct entity,
    # aggregated over all files rather than broken down per file.
    frames = []
    for etype in config.ENTITY_TYPES:
        tally = counters[etype]
        frames.append(pd.DataFrame({
            "type": config.ENTITY_LABELS[etype],
            "entity": list(tally.keys()),
            "count": list(tally.values()),
        }))
    export = pd.concat(frames, ignore_index=True)
    st.download_button(
        "⬇️ Download entity counts (CSV)",
        data=export.to_csv(index=False),
        file_name="cv_entity_counts.csv",
        mime="text/csv",
    )