Spaces:
Sleeping
Sleeping
Remove Dockerfile, .gitattributes, LICENSE
Browse files- app.py +57 -14
- core/parsing/extractor.py +1 -0
- core/processing/dataframe.py +48 -8
app.py
CHANGED
|
@@ -4,21 +4,23 @@ import tempfile
|
|
| 4 |
|
| 5 |
from core.ingestion.docling_loader import load_and_convert_cv
|
| 6 |
from core.parsing.extractor import extract_resume
|
| 7 |
-
from core.processing.dataframe import resume_to_df
|
| 8 |
|
| 9 |
st.title("CV Analyzer")
|
| 10 |
|
| 11 |
# ---- session state init ----
|
| 12 |
if "processed" not in st.session_state:
|
| 13 |
st.session_state.processed = False
|
| 14 |
-
if "
|
| 15 |
-
st.session_state.
|
| 16 |
|
| 17 |
uploaded_file = st.file_uploader("Upload CV (PDF)", type=["pdf"])
|
| 18 |
|
| 19 |
-
|
|
|
|
| 20 |
st.session_state.processed = False
|
| 21 |
-
st.session_state.
|
|
|
|
| 22 |
|
| 23 |
# ---- process only once ----
|
| 24 |
if uploaded_file and not st.session_state.processed:
|
|
@@ -28,22 +30,63 @@ if uploaded_file and not st.session_state.processed:
|
|
| 28 |
|
| 29 |
text = load_and_convert_cv(pdf_path)
|
| 30 |
data = extract_resume(text)
|
| 31 |
-
|
| 32 |
|
| 33 |
-
st.session_state.
|
|
|
|
| 34 |
st.session_state.processed = True
|
| 35 |
|
| 36 |
# ---- display from session (no recompute) ----
|
| 37 |
-
if st.session_state.processed and st.session_state.
|
| 38 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
-
st.subheader("Extracted Data")
|
| 41 |
-
st.dataframe(df)
|
| 42 |
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
st.download_button(
|
| 45 |
"Download CSV",
|
| 46 |
data=csv,
|
| 47 |
-
file_name="
|
| 48 |
mime="text/csv"
|
| 49 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
from core.ingestion.docling_loader import load_and_convert_cv
|
| 6 |
from core.parsing.extractor import extract_resume
|
| 7 |
+
from core.processing.dataframe import resume_to_df, resume_to_dfs
|
| 8 |
|
| 9 |
st.title("CV Analyzer")
|
| 10 |
|
| 11 |
# ---- session state init ----
|
| 12 |
if "processed" not in st.session_state:
|
| 13 |
st.session_state.processed = False
|
| 14 |
+
if "dfs" not in st.session_state:
|
| 15 |
+
st.session_state.dfs = None
|
| 16 |
|
| 17 |
uploaded_file = st.file_uploader("Upload CV (PDF)", type=["pdf"])
|
| 18 |
|
| 19 |
+
|
| 20 |
+
if not uploaded_file:
|
| 21 |
st.session_state.processed = False
|
| 22 |
+
st.session_state.dfs = None
|
| 23 |
+
|
| 24 |
|
| 25 |
# ---- process only once ----
|
| 26 |
if uploaded_file and not st.session_state.processed:
|
|
|
|
| 30 |
|
| 31 |
text = load_and_convert_cv(pdf_path)
|
| 32 |
data = extract_resume(text)
|
| 33 |
+
dfs = resume_to_dfs(data)
|
| 34 |
|
| 35 |
+
st.session_state.data = data
|
| 36 |
+
st.session_state.dfs = dfs
|
| 37 |
st.session_state.processed = True
|
| 38 |
|
| 39 |
# ---- display from session (no recompute) ----
|
| 40 |
+
if st.session_state.processed and st.session_state.dfs is not None:
|
| 41 |
+
dfs = st.session_state.dfs
|
| 42 |
+
data = st.session_state.data # Ensure data is pulled from state
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# Extract row from 'base' dataframe (assuming it's a single-row DF)
|
| 46 |
+
base_data = dfs['base'].iloc[0]
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
st.subheader("Candidate Profile")
|
| 50 |
+
col_spacer, col_content = st.columns([0.01, 0.99])
|
| 51 |
+
with col_content:
|
| 52 |
+
st.write(f"**Name:** {base_data.get('full_name', 'N/A')}")
|
| 53 |
+
|
| 54 |
+
# Iterate through contact fields (the ones prefixed with contact_)
|
| 55 |
+
contact_fields = {k.replace("contact_", "").title(): v for k, v in base_data.items() if k.startswith("contact_") and v}
|
| 56 |
+
for label, value in contact_fields.items():
|
| 57 |
+
st.write(f"**{label}:** {value}")
|
| 58 |
+
|
| 59 |
+
st.write(f"**AI/ML Skills:** {base_data.get('ai_ml_skills') or 'N/A'}")
|
| 60 |
+
st.write(f"**Technical Skills:** {base_data.get('technical_skills') or 'N/A'}")
|
| 61 |
+
st.write(f"**Certifications:** {base_data.get('certifications') or 'N/A'}")
|
| 62 |
+
|
| 63 |
+
if base_data.get("summary"):
|
| 64 |
+
st.info(f"**Summary:** {base_data['summary']}")
|
| 65 |
|
|
|
|
|
|
|
| 66 |
|
| 67 |
+
# Display other tables (Experience, Education, etc.)
|
| 68 |
+
for label, df in dfs.items():
|
| 69 |
+
if label == "base":
|
| 70 |
+
continue
|
| 71 |
+
st.subheader(label.replace("_", " ").title())
|
| 72 |
+
st.dataframe(df, use_container_width=True)
|
| 73 |
+
|
| 74 |
+
# Download Button
|
| 75 |
+
df_full = resume_to_df(data)
|
| 76 |
+
csv = df_full.to_csv(index=False).encode("utf-8")
|
| 77 |
st.download_button(
|
| 78 |
"Download CSV",
|
| 79 |
data=csv,
|
| 80 |
+
file_name=f"analyzed_{uploaded_file.name}.csv",
|
| 81 |
mime="text/csv"
|
| 82 |
+
)
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
# "full_name": r.get("full_name"),
|
| 88 |
+
# "summary": r.get("summary"),
|
| 89 |
+
# **{f"contact_{k}": v for k, v in (r.get("contact") or {}).items()},
|
| 90 |
+
# "ai_ml_skills"
|
| 91 |
+
# "technical_skills"
|
| 92 |
+
# "certifications"
|
core/parsing/extractor.py
CHANGED
|
@@ -25,6 +25,7 @@ agent = Agent(
|
|
| 25 |
model=model,
|
| 26 |
system_prompt=(
|
| 27 |
'You are an expert resume extractor.'
|
|
|
|
| 28 |
'Do NOT infer or hallucinate missing sections.'
|
| 29 |
'If a section is not explicitly present, return null or empty list.'
|
| 30 |
),
|
|
|
|
| 25 |
model=model,
|
| 26 |
system_prompt=(
|
| 27 |
'You are an expert resume extractor.'
|
| 28 |
+
'If the context is not a Resume return null and DO NOT infer or hallucinate.'
|
| 29 |
'Do NOT infer or hallucinate missing sections.'
|
| 30 |
'If a section is not explicitly present, return null or empty list.'
|
| 31 |
),
|
core/processing/dataframe.py
CHANGED
|
@@ -1,12 +1,14 @@
|
|
| 1 |
import pandas as pd
|
|
|
|
| 2 |
|
| 3 |
-
def resume_to_df(resume):
|
| 4 |
-
r = resume.dict()
|
|
|
|
| 5 |
|
| 6 |
base = {
|
| 7 |
"full_name": r["full_name"],
|
| 8 |
"summary": r["summary"],
|
| 9 |
-
**{f"contact_{k}": v for k, v in r["contact"].items()},
|
| 10 |
"ai_ml_skills": ", ".join(r.get("ai_ml_skills", []) or []),
|
| 11 |
"technical_skills": ", ".join(r.get("technical_skills", []) or []),
|
| 12 |
"certifications": ", ".join(r.get("certifications", []) or [])
|
|
@@ -19,18 +21,17 @@ def resume_to_df(resume):
|
|
| 19 |
len(r.get("education") or []),
|
| 20 |
len(r.get("experience") or []),
|
| 21 |
len(r.get("projects") or []),
|
| 22 |
-
1
|
| 23 |
)
|
| 24 |
-
print('max_len: ', max_len)
|
| 25 |
|
| 26 |
for i in range(max_len):
|
| 27 |
-
row = base.copy()
|
| 28 |
|
| 29 |
# education
|
| 30 |
educations = r.get("education", []) or []
|
| 31 |
if i < len(educations):
|
| 32 |
e = educations[i]
|
| 33 |
-
row.update({
|
| 34 |
"edu_institution": e["institution"],
|
| 35 |
"edu_degree": e["degree"],
|
| 36 |
"edu_start": e["start_date"],
|
|
@@ -61,4 +62,43 @@ def resume_to_df(resume):
|
|
| 61 |
|
| 62 |
rows.append(row)
|
| 63 |
|
| 64 |
-
return pd.DataFrame(rows)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
+
from core.parsing.schema import Resume
|
| 3 |
|
| 4 |
+
def resume_to_df(resume: Resume):
|
| 5 |
+
# r = resume.dict()
|
| 6 |
+
r = resume.model_dump() # Dictionary -> key, value pairs
|
| 7 |
|
| 8 |
base = {
|
| 9 |
"full_name": r["full_name"],
|
| 10 |
"summary": r["summary"],
|
| 11 |
+
**{f"contact_{k}": v for k, v in r["contact"].items() if v != None},
|
| 12 |
"ai_ml_skills": ", ".join(r.get("ai_ml_skills", []) or []),
|
| 13 |
"technical_skills": ", ".join(r.get("technical_skills", []) or []),
|
| 14 |
"certifications": ", ".join(r.get("certifications", []) or [])
|
|
|
|
| 21 |
len(r.get("education") or []),
|
| 22 |
len(r.get("experience") or []),
|
| 23 |
len(r.get("projects") or []),
|
| 24 |
+
1 # atleast one row.
|
| 25 |
)
|
|
|
|
| 26 |
|
| 27 |
for i in range(max_len):
|
| 28 |
+
row = {} #base.copy()
|
| 29 |
|
| 30 |
# education
|
| 31 |
educations = r.get("education", []) or []
|
| 32 |
if i < len(educations):
|
| 33 |
e = educations[i]
|
| 34 |
+
row.update({ # row |= {}
|
| 35 |
"edu_institution": e["institution"],
|
| 36 |
"edu_degree": e["degree"],
|
| 37 |
"edu_start": e["start_date"],
|
|
|
|
| 62 |
|
| 63 |
rows.append(row)
|
| 64 |
|
| 65 |
+
return pd.DataFrame(rows)
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def resume_to_dfs(resume: Resume):
    """Split a parsed resume into one DataFrame per section.

    The resume is serialised with ``model_dump()`` and the resulting dict is
    broken up as follows:

    * ``"base"`` -- a single-row frame holding ``full_name``, ``summary``,
      one ``contact_<field>`` column per entry of the ``contact`` mapping,
      and the ``ai_ml_skills`` / ``technical_skills`` / ``certifications``
      lists, each joined into a ``", "``-separated string.
    * ``"education"`` and ``"experience"`` -- built directly from the dumped
      entries of the corresponding section.
    * ``"projects"`` -- built from the dumped project entries after any
      list-valued ``technologies`` field has been joined with ``", "``.

    A section that is missing, ``None`` or empty yields an empty DataFrame
    (or an empty string for the joined list columns).

    Returns:
        A dict mapping ``"base"``, ``"education"``, ``"experience"`` and
        ``"projects"`` (in that order) to their DataFrames.
    """
    dumped = resume.model_dump()

    def section(key):
        # Treat an absent or None section the same as an empty one.
        return dumped.get(key) or []

    # 1. Base info: top-level scalars, flattened contact dict, joined lists.
    #    Insertion order here fixes the column order of the "base" frame.
    profile = {
        "full_name": dumped.get("full_name"),
        "summary": dumped.get("summary"),
    }
    for field, value in (dumped.get("contact") or {}).items():
        profile[f"contact_{field}"] = value
    for list_key in ("ai_ml_skills", "technical_skills", "certifications"):
        profile[list_key] = ", ".join(section(list_key))

    frames = {"base": pd.DataFrame([profile])}

    # 2. / 3. Education and experience map straight onto rows.
    frames["education"] = pd.DataFrame(section("education"))
    frames["experience"] = pd.DataFrame(section("experience"))

    # 4. Projects: collapse each 'technologies' list into one string so the
    #    value renders as plain text in the table / CSV view.
    project_rows = section("projects")
    for entry in project_rows:
        techs = entry.get("technologies")
        if isinstance(techs, list):
            entry["technologies"] = ", ".join(techs)
    frames["projects"] = pd.DataFrame(project_rows)

    return frames
|