Sher1988 commited on
Commit
153f2a7
·
1 Parent(s): 3e1d74e

Initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+
4
+ # Python cache
5
+ __pycache__/
6
+ *.pyc
7
+
8
+ # Virtual environment
9
+ .venv/
10
+
11
+ # OS files
12
+ .DS_Store
13
+ Thumbs.db
14
+
15
+ # Logs
16
+ *.log
17
+
18
+ # Data (optional)
19
+ data/
20
+
21
+ # Jupyter
22
+ .ipynb_checkpoints/
23
+
24
+ # Build
25
+ dist/
26
+ build/
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
README.md ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: CV Analyzer
2
+ emoji: 📸
3
+ sdk: streamlit
4
+ sdk_version: 1.37.1
5
+ app_file: app.py
6
+
7
+
8
+ # CV Analyzer (AI-Powered Resume Parser)
9
+
10
+ A Streamlit-based app that extracts structured data from CVs (PDF) using **Docling + Agentic AI + Pydantic schema**, and converts it into a clean, downloadable CSV.
11
+
12
+ ---
13
+
14
+ ## Features
15
+
16
+ - Upload CV (PDF)
17
+ - Parse document using Docling
18
+ - Extract structured data using LLM agent
19
+ - Validate with Pydantic schema
20
+ - Convert to Pandas DataFrame
21
+ - View extracted data in UI
22
+ - Download as CSV
23
+
24
+ ---
25
+
26
+ ## Tech Stack
27
+
28
+ - **Streamlit** – UI
29
+ - **Docling** – PDF parsing
30
+ - **Pydantic / pydantic-ai** – structured extraction
31
+ - **Hugging Face / LLM** – inference
32
+ - **Pandas** – data processing
33
+
34
+ ---
35
+
36
+ ## Setup
37
+
38
+ ### 1. Clone repo
39
+ ```bash
40
+ git clone https://github.com/your-username/cv-analyzer.git
41
+ cd cv-analyzer
42
+ ```
43
+
44
+ ### 2. Create virtual environment
45
+
46
+ ```bash
47
+ python -m venv .venv
48
+ source .venv/bin/activate # Linux/macOS
49
+ .venv\Scripts\activate # Windows
50
+ ```
51
+
52
+ ### 3. Install dependencies
53
+
54
+ ```bash
55
+ pip install -r requirements.txt
56
+ ```
57
+
58
+ ### 4. Environment variables
59
+
60
+ Create a `.env` file:
61
+
62
+ ```
63
+ HF_TOKEN=your_huggingface_token
64
+ ```
65
+
66
+ > `.env` is ignored via `.gitignore`
67
+
68
+ ---
69
+
70
+ ## Run App
71
+
72
+ ```bash
73
+ streamlit run app.py
74
+ ```
75
+
76
+ ---
77
+
78
+ ## How it works
79
+
80
+ 1. User uploads CV (PDF)
81
+ 2. Docling converts PDF → structured text/markdown
82
+ 3. LLM agent extracts data using predefined schema
83
+ 4. Output is validated via Pydantic
84
+ 5. Data is converted into a DataFrame
85
+ 6. User can view and download CSV
86
+
87
+ ---
88
+
89
+ ## Notes
90
+
91
+ * Schema is designed for **AI/ML-focused resumes**
92
+ * Missing fields are returned as `null` (no hallucination policy)
93
+ * Dates are stored as strings to avoid parsing errors
94
+ * Validation is relaxed to improve LLM compatibility
95
+
96
+ ---
97
+
98
+ ## Limitations
99
+
100
+ * LLM may still produce inconsistent outputs for poorly formatted CVs
101
+ * Complex layouts (tables, multi-column PDFs) may affect parsing quality
102
+ * Requires internet access for model inference
103
+
104
+ ---
105
+
106
+ ## Future Improvements
107
+
108
+ * Multi-CV batch processing
109
+ * Candidate scoring & ranking
110
+ * Semantic search over resumes (FAISS)
111
+ * UI improvements (filters, charts)
112
+ * Export to JSON / Excel
113
+
114
+ ---
115
+
116
+ ## License
117
+
118
+ MIT License
119
+
120
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile

import pandas as pd
import streamlit as st

from core.ingestion.docling_loader import load_and_convert_cv
from core.parsing.extractor import extract_resume
from core.processing.dataframe import resume_to_df

st.title("CV Analyzer")

# ---- session state init ----
# "processed" guards against re-running the expensive parse/LLM pipeline on
# every Streamlit rerun; "df" caches the extracted table for display/download.
if "processed" not in st.session_state:
    st.session_state.processed = False
if "df" not in st.session_state:
    st.session_state.df = None

uploaded_file = st.file_uploader("Upload CV (PDF)", type=["pdf"])

if st.button("Upload New CV"):
    # Reset cached results so the next uploaded file is processed afresh.
    st.session_state.processed = False
    st.session_state.df = None

# ---- process only once ----
if uploaded_file and not st.session_state.processed:
    # Docling needs a real file path, so persist the upload to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.read())
        pdf_path = tmp.name

    try:
        with st.spinner("Analyzing CV..."):
            text = load_and_convert_cv(pdf_path)
            data = extract_resume(text)
            df = resume_to_df(data)
    finally:
        # NamedTemporaryFile(delete=False) is never auto-removed; clean up so
        # repeated uploads do not accumulate temp files on the server.
        os.unlink(pdf_path)

    st.session_state.df = df
    st.session_state.processed = True

# ---- display from session (no recompute) ----
if st.session_state.processed and st.session_state.df is not None:
    df = st.session_state.df

    st.subheader("Extracted Data")
    st.dataframe(df)

    csv = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download CSV",
        data=csv,
        file_name="cv_data.csv",
        mime="text/csv"
    )
core/config.py ADDED
File without changes
core/ingestion/docling_loader.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ from docling.document_converter import DocumentConverter, PdfFormatOption
4
+
5
+
6
@st.cache_resource
def get_converter():
    """
    Initialize and cache the Docling DocumentConverter.

    Cached with st.cache_resource so the (slow) model loading happens only
    once per Streamlit server process, not on every script rerun.

    Returns:
        DocumentConverter: converter configured with OCR disabled for PDFs.
    """
    # Docling keys format_options by the InputFormat enum, not the plain
    # string "pdf", and OCR is controlled through PdfPipelineOptions.do_ocr
    # (PdfFormatOption has no `enable_ocr` keyword). Local imports keep the
    # module-level import list untouched.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions

    pipeline_options = PdfPipelineOptions(do_ocr=False)
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
19
+
20
+
21
def load_and_convert_cv(file_path: str) -> str:
    """
    Convert a CV file to plain text using Docling.

    Note: this exports plain text via ``export_to_text()``, not markdown —
    despite what the module name ("docling_loader" for markdown conversion)
    might suggest.

    Args:
        file_path (str): The local path to the uploaded CV file.

    Returns:
        str: The document content as plain text.
    """
    # The converter is cached (st.cache_resource), so this call is cheap
    # after the first invocation.
    converter = get_converter()
    result = converter.convert(file_path)
    text_content = result.document.export_to_text()
    return text_content
core/parsing/extractor.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pydantic_ai import Agent
from pydantic_ai.models.huggingface import HuggingFaceModel
from pydantic_ai.providers.openai import OpenAIProvider
from dotenv import load_dotenv
import os

from core.parsing.schema import Resume


# Load HF_TOKEN from a local .env during development; unnecessary when
# deployed on a Hugging Face Space (secrets are injected as env vars).
load_dotenv()
api_key = os.environ["HF_TOKEN"]  # fail fast at import time if the token is missing


# Qwen 2.5 7B Instruct served through the Hugging Face inference router,
# which exposes an OpenAI-compatible endpoint.
# NOTE(review): a HuggingFaceModel is paired here with an OpenAIProvider;
# this relies on the router's OpenAI-compatible API — confirm the pinned
# pydantic-ai version supports this combination.
model = HuggingFaceModel(
    'Qwen/Qwen2.5-7B-Instruct',
    provider=OpenAIProvider(
        base_url="https://router.huggingface.co/v1",
        api_key=api_key
    )
)


# Agent that extracts a structured Resume object from raw CV text.
# The system prompt enforces a strict no-hallucination policy: sections not
# present in the CV must come back as null/empty rather than be invented.
agent = Agent(
    model=model,
    system_prompt=(
        'You are an expert resume extractor.'
        'Do NOT infer or hallucinate missing sections.'
        'If a section is not explicitly present, return null or empty list.'
    ),
    output_type=Resume
)
33
+
34
+
35
def extract_resume(text: str) -> Resume:
    """Run the extraction agent synchronously on raw CV text.

    Args:
        text: the CV content (plain text / markdown) produced by Docling.

    Returns:
        Resume: the validated structured resume extracted by the agent.
    """
    run_result = agent.run_sync(text)
    return run_result.output
core/parsing/schema.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
+
4
+
5
# Nested models for detailed resume sections
class ContactInformation(BaseModel):
    """Contact channels for the applicant.

    All fields are optional: per the no-hallucination policy, the LLM
    returns null for anything not explicitly present in the CV.
    """
    # Was `email: str = Field(None, ...)` — a non-optional str annotated with
    # a None default; Optional[str] makes the type and the default agree.
    email: Optional[str] = Field(None, description="Email address.")
    phone: Optional[str] = Field(None, description='mobile number eg. +92 03011234567')
    linkedin: Optional[str] = None
    github: Optional[str] = None
    hugging_face: Optional[str] = None
    kaggle: Optional[str] = None
13
+
14
+
15
class Education(BaseModel):
    """A single educational entry (degree at an institution)."""
    institution: str
    degree: str
    # Dates are kept as free-form strings to tolerate LLM output like
    # "2021", "Jan 2021" or "Present" without parsing errors.
    start_date: Optional[str] = None
    end_date: Optional[str] = None
20
+
21
+
22
class Experience(BaseModel):
    """A single work-experience entry."""
    title: str = Field(description="Job role/title.")
    company: str = Field(description="Name of the company or organization.")
    # Free-form date strings, mirroring Education (avoids parse failures).
    start_date: Optional[str] = None
    end_date: Optional[str] = None
27
+
28
+
29
class Project(BaseModel):
    """A portfolio project with an AI-engineering difficulty rating."""
    name: str = Field(description="Name of a project.")
    description: str = Field(description="Project Description")
    # Was `List[str] = None` — a None default on a non-optional list, which
    # made downstream `", ".join(p["technologies"])` crash. default_factory
    # yields a fresh empty list per instance (never a shared mutable default).
    technologies: List[str] = Field(default_factory=list)
    url: Optional[str] = None
    difficulty_score: int = Field(
        ...,
        ge=1,
        le=10,
        description=(
            "Strictly evaluate AI engineering complexity. "
            "1-3: Simple 'wrapper' apps, basic prompting, or out-of-the-box RAG with a single data source. "
            "4-6: Production-grade apps with persistent memory, multi-step tool use (agents), "
            "complex data parsing (PDFs/Tables), or basic fine-tuning for style. "
            "7-8: Advanced architectures featuring multi-agent orchestration, self-healing loops, "
            "complex hybrid search (vector + keyword), or custom evaluation frameworks (LLM-as-a-judge). "
            "9-10: Highly complex, mission-critical systems with real-time streaming, "
            "multi-modal integration, or heavy optimization for cost and latency at scale. "
            "If the project only uses a single API call without complex logic, it must not exceed 3."
        )
    )
50
+
51
# Main AI Developer Resume Schema
class Resume(BaseModel):
    """Top-level structured resume extracted by the LLM agent.

    NOTE(review): education/experience use the required-but-nullable pattern
    (`Optional[...] = Field(...)`): the key must appear in the LLM output but
    may be null. This appears deliberate (forces the model to emit the key
    explicitly) — confirm before relaxing to a default.
    """
    full_name: str = Field(..., description="Full name of the applicant.")
    contact: ContactInformation
    summary: str = Field(..., description="Professional summary focusing on AI/ML.")
    education: Optional[List[Education]] = Field(
        ..., description="List of educational degrees. Return null if not explicitly present."
    )
    experience: Optional[List[Experience]] = Field(
        ..., description="List of experiences. Return null if not explicitly present."
    )
    ai_ml_skills: List[str] = Field(..., description="Specific AI/ML skills (e.g., LLMs, Computer Vision).")
    technical_skills: List[str] = Field(..., description="Programming languages and tools.")
    projects: Optional[List[Project]] = None
    certifications: Optional[List[str]] = None
core/processing/dataframe.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def resume_to_df(resume):
    """
    Flatten a Resume model into a pandas DataFrame.

    Scalar fields (name, summary, contact, joined skill lists) are repeated
    on every row; row i additionally carries the i-th education, experience
    and project entries. The row count is the longest of those three lists
    (minimum 1, so an all-empty resume still yields one row).

    Args:
        resume: a pydantic model (or any object exposing ``model_dump()`` or
            ``dict()``) matching the Resume schema.

    Returns:
        pd.DataFrame: one row per aligned education/experience/project index.
    """
    # Prefer pydantic v2's model_dump(); fall back to v1's dict() so older
    # model objects keep working.
    r = resume.model_dump() if hasattr(resume, "model_dump") else resume.dict()

    base = {
        "full_name": r["full_name"],
        "summary": r["summary"],
        **{f"contact_{k}": v for k, v in r["contact"].items()},
        # `or []` guards against null sections (no-hallucination policy).
        "ai_ml_skills": ", ".join(r.get("ai_ml_skills", []) or []),
        "technical_skills": ", ".join(r.get("technical_skills", []) or []),
        "certifications": ", ".join(r.get("certifications", []) or [])
    }

    educations = r.get("education") or []
    experiences = r.get("experience") or []
    projects = r.get("projects") or []

    # At least one row even when all list sections are empty/null.
    max_len = max(len(educations), len(experiences), len(projects), 1)

    rows = []
    for i in range(max_len):
        row = base.copy()

        # education
        if i < len(educations):
            e = educations[i]
            row.update({
                "edu_institution": e["institution"],
                "edu_degree": e["degree"],
                "edu_start": e["start_date"],
                "edu_end": e["end_date"],
            })

        # experience
        if i < len(experiences):
            ex = experiences[i]
            row.update({
                "exp_title": ex["title"],
                "exp_company": ex["company"],
                "exp_start": ex["start_date"],
                "exp_end": ex["end_date"],
            })

        # projects
        if i < len(projects):
            p = projects[i]
            row.update({
                "proj_name": p["name"],
                "proj_desc": p["description"],
                # technologies may be null in LLM output; guard before joining.
                "proj_tech": ", ".join(p.get("technologies") or []),
                "proj_score": p["difficulty_score"],
            })

        rows.append(row)

    return pd.DataFrame(rows)
core/utils/helpers.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.37.1
2
+ pandas==2.2.2
3
+ python-dotenv==1.0.1
4
+
5
+ # Pydantic AI stack
6
+ pydantic==2.7.4
7
+ # NOTE(review): core/parsing/extractor.py imports pydantic_ai.models.huggingface —
+ # verify this pin actually provides that module (it may require a newer release).
+ pydantic-ai==0.0.14
8
+
9
+ # HF ecosystem (important: keep older compatible versions)
10
+ transformers==4.41.2
11
+ huggingface-hub==0.34.0
12
+ tokenizers==0.19.1
13
+
14
+ # Docling
15
+ docling==2.28.0
16
+
17
+ # Optional but often required by docling
18
+ torch==2.3.1
19
+ numpy==1.26.4