CV-Extractor

Sleeping

App Files Files Community

Sher1988 committed 17 days ago

Commit

6e51440

1 Parent(s): 5576849

Remove Dockerfile, .gitattributes, LICENSE

Browse files

Files changed (3) hide show

app.py +57 -14
core/parsing/extractor.py +1 -0
core/processing/dataframe.py +48 -8

app.py CHANGED Viewed

@@ -4,21 +4,23 @@ import tempfile
 from core.ingestion.docling_loader import load_and_convert_cv
 from core.parsing.extractor import extract_resume
-from core.processing.dataframe import resume_to_df
 st.title("CV Analyzer")
 # ---- session state init ----
 if "processed" not in st.session_state:
     st.session_state.processed = False
-if "df" not in st.session_state:
-    st.session_state.df = None
 uploaded_file = st.file_uploader("Upload CV (PDF)", type=["pdf"])
-if st.button("Upload New CV"):
     st.session_state.processed = False
-    st.session_state.df = None
 # ---- process only once ----
 if uploaded_file and not st.session_state.processed:
@@ -28,22 +30,63 @@ if uploaded_file and not st.session_state.processed:
     text = load_and_convert_cv(pdf_path)
     data = extract_resume(text)
-    df = resume_to_df(data)
-    st.session_state.df = df
     st.session_state.processed = True
 # ---- display from session (no recompute) ----
-if st.session_state.processed and st.session_state.df is not None:
-    df = st.session_state.df
-    st.subheader("Extracted Data")
-    st.dataframe(df)
-    csv = df.to_csv(index=False).encode("utf-8")
     st.download_button(
         "Download CSV",
         data=csv,
-        file_name="cv_data.csv",
         mime="text/csv"
-    )

 from core.ingestion.docling_loader import load_and_convert_cv
 from core.parsing.extractor import extract_resume
+from core.processing.dataframe import resume_to_df, resume_to_dfs
 st.title("CV Analyzer")
 # ---- session state init ----
 if "processed" not in st.session_state:
     st.session_state.processed = False
+if "dfs" not in st.session_state:
+    st.session_state.dfs = None
 uploaded_file = st.file_uploader("Upload CV (PDF)", type=["pdf"])
+if not uploaded_file:
     st.session_state.processed = False
+    st.session_state.dfs = None
 # ---- process only once ----
 if uploaded_file and not st.session_state.processed:
     text = load_and_convert_cv(pdf_path)
     data = extract_resume(text)
+    dfs = resume_to_dfs(data)
+    st.session_state.data = data
+    st.session_state.dfs = dfs
     st.session_state.processed = True
 # ---- display from session (no recompute) ----
+if st.session_state.processed and st.session_state.dfs is not None:
+    dfs = st.session_state.dfs
+    data = st.session_state.data # Ensure data is pulled from state
+    # Extract row from 'base' dataframe (assuming it's a single-row DF)
+    base_data = dfs['base'].iloc[0]
+    st.subheader("Candidate Profile")
+    col_spacer, col_content = st.columns([0.01, 0.99])
+    with col_content:
+        st.write(f"**Name:** {base_data.get('full_name', 'N/A')}")
+        # Iterate through contact fields (the ones prefixed with contact_)
+        contact_fields = {k.replace("contact_", "").title(): v for k, v in base_data.items() if k.startswith("contact_") and v}
+        for label, value in contact_fields.items():
+            st.write(f"**{label}:** {value}")
+        st.write(f"**AI/ML Skills:** {base_data.get('ai_ml_skills') or 'N/A'}")
+        st.write(f"**Technical Skills:** {base_data.get('technical_skills') or 'N/A'}")
+        st.write(f"**Certifications:** {base_data.get('certifications') or 'N/A'}")
+        if base_data.get("summary"):
+            st.info(f"**Summary:** {base_data['summary']}")
+    # Display other tables (Experience, Education, etc.)
+    for label, df in dfs.items():
+        if label == "base":
+            continue
+        st.subheader(label.replace("_", " ").title())
+        st.dataframe(df, use_container_width=True)
+    # Download Button
+    df_full = resume_to_df(data)
+    csv = df_full.to_csv(index=False).encode("utf-8")
     st.download_button(
         "Download CSV",
         data=csv,
+        file_name=f"analyzed_{uploaded_file.name}.csv",
         mime="text/csv"
+    )
+# "full_name": r.get("full_name"),
+# "summary": r.get("summary"),
+# **{f"contact_{k}": v for k, v in (r.get("contact") or {}).items()},
+# "ai_ml_skills"
+# "technical_skills"
+# "certifications"

core/parsing/extractor.py CHANGED Viewed

@@ -25,6 +25,7 @@ agent = Agent(
     model=model,
     system_prompt=(
             'You are an expert resume extractor.'
             'Do NOT infer or hallucinate missing sections.'
             'If a section is not explicitly present, return null or empty list.'
         ),

     model=model,
     system_prompt=(
             'You are an expert resume extractor.'
+            'If the context is not a Resume return null and DO NOT infer or hallucinate.'
             'Do NOT infer or hallucinate missing sections.'
             'If a section is not explicitly present, return null or empty list.'
         ),

core/processing/dataframe.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import pandas as pd
-def resume_to_df(resume):
-    r = resume.dict()
     base = {
         "full_name": r["full_name"],
         "summary": r["summary"],
-        **{f"contact_{k}": v for k, v in r["contact"].items()},
         "ai_ml_skills": ", ".join(r.get("ai_ml_skills", []) or []),
         "technical_skills": ", ".join(r.get("technical_skills", []) or []),
         "certifications": ", ".join(r.get("certifications", []) or [])
@@ -19,18 +21,17 @@ def resume_to_df(resume):
         len(r.get("education") or []),
         len(r.get("experience") or []),
         len(r.get("projects") or []),
-        1
     )
-    print('max_len: ', max_len)
     for i in range(max_len):
-        row = base.copy()
         # education
         educations = r.get("education", []) or []
         if i < len(educations):
             e = educations[i]
-            row.update({
                 "edu_institution": e["institution"],
                 "edu_degree": e["degree"],
                 "edu_start": e["start_date"],
@@ -61,4 +62,43 @@ def resume_to_df(resume):
         rows.append(row)
-    return pd.DataFrame(rows)

 import pandas as pd
+from core.parsing.schema import Resume
+def resume_to_df(resume: Resume):
+    # r = resume.dict()
+    r = resume.model_dump() # Dictionary -> key, value pairs
     base = {
         "full_name": r["full_name"],
         "summary": r["summary"],
+        **{f"contact_{k}": v for k, v in r["contact"].items() if v != None},
         "ai_ml_skills": ", ".join(r.get("ai_ml_skills", []) or []),
         "technical_skills": ", ".join(r.get("technical_skills", []) or []),
         "certifications": ", ".join(r.get("certifications", []) or [])
         len(r.get("education") or []),
         len(r.get("experience") or []),
         len(r.get("projects") or []),
+        1 # at least one row.
     )
     for i in range(max_len):
+        row = {} #base.copy()
         # education
         educations = r.get("education", []) or []
         if i < len(educations):
             e = educations[i]
+            row.update({  # row |= {}
                 "edu_institution": e["institution"],
                 "edu_degree": e["degree"],
                 "edu_start": e["start_date"],
         rows.append(row)
+    return pd.DataFrame(rows)
+def resume_to_dfs(resume: Resume):  # Split one Resume into four DataFrames keyed "base", "education", "experience", "projects".
+    r = resume.model_dump()  # dictionary of the model's fields (key -> value)
+    # 1. Base info: full name, summary, contact fields and the three skill/certification lists.
+    # The nested 'contact' dict is flattened into top-level "contact_<key>" entries.
+    base_data = {
+        "full_name": r.get("full_name"),
+        "summary": r.get("summary"),
+        **{f"contact_{k}": v for k, v in (r.get("contact") or {}).items()},  # missing/None contact adds no keys; None values inside it are kept
+        "ai_ml_skills": ", ".join(r.get("ai_ml_skills") or []),  # missing/None list becomes ""
+        "technical_skills": ", ".join(r.get("technical_skills") or []),
+        "certifications": ", ".join(r.get("certifications") or [])
+    }
+    df_base = pd.DataFrame([base_data])  # one-element list -> single-row frame
+    # 2. Education DataFrame: built from the education entries; an empty list is used when the field is missing or None.
+    df_edu = pd.DataFrame(r.get("education") or [])
+    # 3. Experience DataFrame: same fallback to an empty list as education.
+    df_exp = pd.DataFrame(r.get("experience") or [])
+    # 4. Projects DataFrame
+    # Each project's 'technologies' list is joined into one comma-separated string for the CSV/table view.
+    projects = r.get("projects") or []
+    for p in projects:  # updates the project dicts in place (presumably copies produced by model_dump - confirm)
+        if isinstance(p.get("technologies"), list):  # non-list values (e.g. None) are left unchanged
+            p["technologies"] = ", ".join(p["technologies"])
+    df_proj = pd.DataFrame(projects)
+    return {  # the caller in app.py reads "base" as a single row and renders the other entries as tables
+        "base": df_base,
+        "education": df_edu,
+        "experience": df_exp,
+        "projects": df_proj
+    }