Sher1988 commited on
Commit
153f2a7
·
1 Parent(s): 3e1d74e

Initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+
4
+ # Python cache
5
+ __pycache__/
6
+ *.pyc
7
+
8
+ # Virtual environment
9
+ .venv/
10
+
11
+ # OS files
12
+ .DS_Store
13
+ Thumbs.db
14
+
15
+ # Logs
16
+ *.log
17
+
18
+ # Data (optional)
19
+ data/
20
+
21
+ # Jupyter
22
+ .ipynb_checkpoints/
23
+
24
+ # Build
25
+ dist/
26
+ build/
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
README.md ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ title: CV Analyzer
2
+ emoji: 📸
3
+ sdk: streamlit
4
+ sdk_version: 1.37.1
5
+ app_file: app.py
6
+
7
+
8
+ # CV Analyzer (AI-Powered Resume Parser)
9
+
10
+ A Streamlit-based app that extracts structured data from CVs (PDF) using **Docling + Agentic AI + Pydantic schema**, and converts it into a clean, downloadable CSV.
11
+
12
+ ---
13
+
14
+ ## Features
15
+
16
+ - Upload CV (PDF)
17
+ - Parse document using Docling
18
+ - Extract structured data using LLM agent
19
+ - Validate with Pydantic schema
20
+ - Convert to Pandas DataFrame
21
+ - View extracted data in UI
22
+ - Download as CSV
23
+
24
+ ---
25
+
26
+ ## Tech Stack
27
+
28
+ - **Streamlit** – UI
29
+ - **Docling** – PDF parsing
30
+ - **Pydantic / pydantic-ai** – structured extraction
31
+ - **Hugging Face / LLM** – inference
32
+ - **Pandas** – data processing
33
+
34
+ ---
35
+
36
+ ## Setup
37
+
38
+ ### 1. Clone repo
39
+ ```bash
40
+ git clone https://github.com/your-username/cv-analyzer.git
41
+ cd cv-analyzer
42
+ ```
43
+
44
+ ### 2. Create virtual environment
45
+
46
+ ```bash
47
+ python -m venv .venv
48
+ source .venv/bin/activate # Linux/macOS
49
+ .venv\Scripts\activate # Windows
50
+ ```
51
+
52
+ ### 3. Install dependencies
53
+
54
+ ```bash
55
+ pip install -r requirements.txt
56
+ ```
57
+
58
+ ### 4. Environment variables
59
+
60
+ Create a `.env` file:
61
+
62
+ ```
63
+ HF_TOKEN=your_huggingface_token
64
+ ```
65
+
66
+ > `.env` is ignored via `.gitignore`
67
+
68
+ ---
69
+
70
+ ## Run App
71
+
72
+ ```bash
73
+ streamlit run app.py
74
+ ```
75
+
76
+ ---
77
+
78
+ ## How it works
79
+
80
+ 1. User uploads CV (PDF)
81
+ 2. Docling converts PDF → structured text/markdown
82
+ 3. LLM agent extracts data using predefined schema
83
+ 4. Output is validated via Pydantic
84
+ 5. Data is converted into a DataFrame
85
+ 6. User can view and download CSV
86
+
87
+ ---
88
+
89
+ ## Notes
90
+
91
+ * Schema is designed for **AI/ML-focused resumes**
92
+ * Missing fields are returned as `null` (no hallucination policy)
93
+ * Dates are stored as strings to avoid parsing errors
94
+ * Validation is relaxed to improve LLM compatibility
95
+
96
+ ---
97
+
98
+ ## Limitations
99
+
100
+ * LLM may still produce inconsistent outputs for poorly formatted CVs
101
+ * Complex layouts (tables, multi-column PDFs) may affect parsing quality
102
+ * Requires internet access for model inference
103
+
104
+ ---
105
+
106
+ ## Future Improvements
107
+
108
+ * Multi-CV batch processing
109
+ * Candidate scoring & ranking
110
+ * Semantic search over resumes (FAISS)
111
+ * UI improvements (filters, charts)
112
+ * Export to JSON / Excel
113
+
114
+ ---
115
+
116
+ ## License
117
+
118
+ MIT License
119
+
120
app.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import tempfile

import pandas as pd
import streamlit as st

from core.ingestion.docling_loader import load_and_convert_cv
from core.parsing.extractor import extract_resume
from core.processing.dataframe import resume_to_df

st.title("CV Analyzer")

# ---- session state init ----
# "processed" guards against re-running the expensive parse/LLM pipeline on
# every Streamlit rerun; "df" caches the extracted table for display/download.
if "processed" not in st.session_state:
    st.session_state.processed = False
if "df" not in st.session_state:
    st.session_state.df = None

uploaded_file = st.file_uploader("Upload CV (PDF)", type=["pdf"])

if st.button("Upload New CV"):
    # Reset cached results so the next uploaded file is processed afresh.
    st.session_state.processed = False
    st.session_state.df = None

# ---- process only once ----
if uploaded_file and not st.session_state.processed:
    # Docling needs a real file path, so persist the upload to a temp file.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
        tmp.write(uploaded_file.read())
        pdf_path = tmp.name

    try:
        with st.spinner("Analyzing CV..."):
            text = load_and_convert_cv(pdf_path)
            data = extract_resume(text)
            df = resume_to_df(data)
    finally:
        # NamedTemporaryFile(delete=False) is never auto-removed; clean up so
        # repeated uploads do not accumulate temp files on the server.
        os.unlink(pdf_path)

    st.session_state.df = df
    st.session_state.processed = True

# ---- display from session (no recompute) ----
if st.session_state.processed and st.session_state.df is not None:
    df = st.session_state.df

    st.subheader("Extracted Data")
    st.dataframe(df)

    csv = df.to_csv(index=False).encode("utf-8")
    st.download_button(
        "Download CSV",
        data=csv,
        file_name="cv_data.csv",
        mime="text/csv"
    )
core/config.py ADDED
File without changes
core/ingestion/docling_loader.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ from docling.document_converter import DocumentConverter, PdfFormatOption
4
+
5
+
6
@st.cache_resource
def get_converter():
    """
    Initialize and cache the Docling DocumentConverter.

    Cached with st.cache_resource so the (slow) model loading happens only
    once per Streamlit server process, not on every script rerun.

    Returns:
        DocumentConverter: converter configured with OCR disabled for PDFs.
    """
    # Docling keys format_options by the InputFormat enum, not the plain
    # string "pdf", and OCR is controlled through PdfPipelineOptions.do_ocr
    # (PdfFormatOption has no `enable_ocr` keyword). Local imports keep the
    # module-level import list untouched.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions

    pipeline_options = PdfPipelineOptions(do_ocr=False)
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
        }
    )
19
+
20
+
21
def load_and_convert_cv(file_path: str) -> str:
    """
    Convert a CV file to plain text using Docling.

    Note: this exports plain text via ``export_to_text()``, not markdown —
    despite what the module name ("docling_loader" for markdown conversion)
    might suggest.

    Args:
        file_path (str): The local path to the uploaded CV file.

    Returns:
        str: The document content as plain text.
    """
    # The converter is cached (st.cache_resource), so this call is cheap
    # after the first invocation.
    converter = get_converter()
    result = converter.convert(file_path)
    text_content = result.document.export_to_text()
    return text_content
core/parsing/extractor.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pydantic_ai import Agent
from pydantic_ai.models.huggingface import HuggingFaceModel
from pydantic_ai.providers.openai import OpenAIProvider
from dotenv import load_dotenv
import os

from core.parsing.schema import Resume


# Load HF_TOKEN from a local .env during development; unnecessary when
# deployed on a Hugging Face Space (secrets are injected as env vars).
load_dotenv()
api_key = os.environ["HF_TOKEN"]  # fail fast at import time if the token is missing


# Qwen 2.5 7B Instruct served through the Hugging Face inference router,
# which exposes an OpenAI-compatible endpoint.
# NOTE(review): a HuggingFaceModel is paired here with an OpenAIProvider;
# this relies on the router's OpenAI-compatible API — confirm the pinned
# pydantic-ai version supports this combination.
model = HuggingFaceModel(
    'Qwen/Qwen2.5-7B-Instruct',
    provider=OpenAIProvider(
        base_url="https://router.huggingface.co/v1",
        api_key=api_key
    )
)


# Agent that extracts a structured Resume object from raw CV text.
# The system prompt enforces a strict no-hallucination policy: sections not
# present in the CV must come back as null/empty rather than be invented.
agent = Agent(
    model=model,
    system_prompt=(
        'You are an expert resume extractor.'
        'Do NOT infer or hallucinate missing sections.'
        'If a section is not explicitly present, return null or empty list.'
    ),
    output_type=Resume
)
33
+
34
+
35
def extract_resume(text: str) -> Resume:
    """Run the extraction agent synchronously on raw CV text.

    Args:
        text: the CV content (plain text / markdown) produced by Docling.

    Returns:
        Resume: the validated structured resume extracted by the agent.
    """
    run_result = agent.run_sync(text)
    return run_result.output
core/parsing/schema.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
+
4
+
5
# Nested models for detailed resume sections
class ContactInformation(BaseModel):
    """Contact channels for the applicant.

    All fields are optional: per the no-hallucination policy, the LLM
    returns null for anything not explicitly present in the CV.
    """
    # Was `email: str = Field(None, ...)` — a non-optional str annotated with
    # a None default; Optional[str] makes the type and the default agree.
    email: Optional[str] = Field(None, description="Email address.")
    phone: Optional[str] = Field(None, description='mobile number eg. +92 03011234567')
    linkedin: Optional[str] = None
    github: Optional[str] = None
    hugging_face: Optional[str] = None
    kaggle: Optional[str] = None
13
+
14
+
15
class Education(BaseModel):
    """A single educational entry (degree at an institution)."""
    institution: str
    degree: str
    # Dates are kept as free-form strings to tolerate LLM output like
    # "2021", "Jan 2021" or "Present" without parsing errors.
    start_date: Optional[str] = None
    end_date: Optional[str] = None
20
+
21
+
22
class Experience(BaseModel):
    """A single work-experience entry."""
    title: str = Field(description="Job role/title.")
    company: str = Field(description="Name of the company or organization.")
    # Free-form date strings, mirroring Education (avoids parse failures).
    start_date: Optional[str] = None
    end_date: Optional[str] = None
27
+
28
+
29
class Project(BaseModel):
    """A portfolio project with an AI-engineering difficulty rating."""
    name: str = Field(description="Name of a project.")
    description: str = Field(description="Project Description")
    # Was `List[str] = None` — a None default on a non-optional list, which
    # made downstream `", ".join(p["technologies"])` crash. default_factory
    # yields a fresh empty list per instance (never a shared mutable default).
    technologies: List[str] = Field(default_factory=list)
    url: Optional[str] = None
    difficulty_score: int = Field(
        ...,
        ge=1,
        le=10,
        description=(
            "Strictly evaluate AI engineering complexity. "
            "1-3: Simple 'wrapper' apps, basic prompting, or out-of-the-box RAG with a single data source. "
            "4-6: Production-grade apps with persistent memory, multi-step tool use (agents), "
            "complex data parsing (PDFs/Tables), or basic fine-tuning for style. "
            "7-8: Advanced architectures featuring multi-agent orchestration, self-healing loops, "
            "complex hybrid search (vector + keyword), or custom evaluation frameworks (LLM-as-a-judge). "
            "9-10: Highly complex, mission-critical systems with real-time streaming, "
            "multi-modal integration, or heavy optimization for cost and latency at scale. "
            "If the project only uses a single API call without complex logic, it must not exceed 3."
        )
    )
50
+
51
# Main AI Developer Resume Schema
class Resume(BaseModel):
    """Top-level structured resume extracted by the LLM agent.

    NOTE(review): education/experience use the required-but-nullable pattern
    (`Optional[...] = Field(...)`): the key must appear in the LLM output but
    may be null. This appears deliberate (forces the model to emit the key
    explicitly) — confirm before relaxing to a default.
    """
    full_name: str = Field(..., description="Full name of the applicant.")
    contact: ContactInformation
    summary: str = Field(..., description="Professional summary focusing on AI/ML.")
    education: Optional[List[Education]] = Field(
        ..., description="List of educational degrees. Return null if not explicitly present."
    )
    experience: Optional[List[Experience]] = Field(
        ..., description="List of experiences. Return null if not explicitly present."
    )
    ai_ml_skills: List[str] = Field(..., description="Specific AI/ML skills (e.g., LLMs, Computer Vision).")
    technical_skills: List[str] = Field(..., description="Programming languages and tools.")
    projects: Optional[List[Project]] = None
    certifications: Optional[List[str]] = None
core/processing/dataframe.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+
3
def resume_to_df(resume):
    """
    Flatten a Resume model into a pandas DataFrame.

    Scalar fields (name, summary, contact, joined skill lists) are repeated
    on every row; row i additionally carries the i-th education, experience
    and project entries. The row count is the longest of those three lists
    (minimum 1, so an all-empty resume still yields one row).

    Args:
        resume: a pydantic model (or any object exposing ``model_dump()`` or
            ``dict()``) matching the Resume schema.

    Returns:
        pd.DataFrame: one row per aligned education/experience/project index.
    """
    # Prefer pydantic v2's model_dump(); fall back to v1's dict() so older
    # model objects keep working.
    r = resume.model_dump() if hasattr(resume, "model_dump") else resume.dict()

    base = {
        "full_name": r["full_name"],
        "summary": r["summary"],
        **{f"contact_{k}": v for k, v in r["contact"].items()},
        # `or []` guards against null sections (no-hallucination policy).
        "ai_ml_skills": ", ".join(r.get("ai_ml_skills", []) or []),
        "technical_skills": ", ".join(r.get("technical_skills", []) or []),
        "certifications": ", ".join(r.get("certifications", []) or [])
    }

    educations = r.get("education") or []
    experiences = r.get("experience") or []
    projects = r.get("projects") or []

    # At least one row even when all list sections are empty/null.
    max_len = max(len(educations), len(experiences), len(projects), 1)

    rows = []
    for i in range(max_len):
        row = base.copy()

        # education
        if i < len(educations):
            e = educations[i]
            row.update({
                "edu_institution": e["institution"],
                "edu_degree": e["degree"],
                "edu_start": e["start_date"],
                "edu_end": e["end_date"],
            })

        # experience
        if i < len(experiences):
            ex = experiences[i]
            row.update({
                "exp_title": ex["title"],
                "exp_company": ex["company"],
                "exp_start": ex["start_date"],
                "exp_end": ex["end_date"],
            })

        # projects
        if i < len(projects):
            p = projects[i]
            row.update({
                "proj_name": p["name"],
                "proj_desc": p["description"],
                # technologies may be null in LLM output; guard before joining.
                "proj_tech": ", ".join(p.get("technologies") or []),
                "proj_score": p["difficulty_score"],
            })

        rows.append(row)

    return pd.DataFrame(rows)
core/utils/helpers.py ADDED
File without changes
requirements.txt ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit==1.37.1
2
+ pandas==2.2.2
3
+ python-dotenv==1.0.1
4
+
5
+ # Pydantic AI stack
6
+ pydantic==2.7.4
7
+ # NOTE(review): core/parsing/extractor.py imports pydantic_ai.models.huggingface —
+ # verify this pin actually provides that module (it may require a newer release).
+ pydantic-ai==0.0.14
8
+
9
+ # HF ecosystem (important: keep older compatible versions)
10
+ transformers==4.41.2
11
+ huggingface-hub==0.34.0
12
+ tokenizers==0.19.1
13
+
14
+ # Docling
15
+ docling==2.28.0
16
+
17
+ # Optional but often required by docling
18
+ torch==2.3.1
19
+ numpy==1.26.4