Sher1988 committed on
Commit
626428e
·
1 Parent(s): 0ceb825

initial commit

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ flickr8k/images/101654506_8eb26cfb60.jpg filter=lfs diff=lfs merge=lfs -text
37
+ flickr8k/images/109202756_b97fcdc62c.jpg filter=lfs diff=lfs merge=lfs -text
38
+ flickr8k/images/136644343_0e2b423829.jpg filter=lfs diff=lfs merge=lfs -text
39
+ flickr8k/images/47870024_73a4481f7d.jpg filter=lfs diff=lfs merge=lfs -text
40
+ flickr8k/images/47871819_db55ac4699.jpg filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Environment variables
2
+ .env
3
+
4
+ # Python cache
5
+ __pycache__/
6
+ *.pyc
7
+
8
+ # Virtual environment
9
+ .venv/
10
+
11
+ # OS files
12
+ .DS_Store
13
+ Thumbs.db
14
+
15
+ # Logs
16
+ *.log
17
+
18
+ # Data (optional)
19
+ data/
20
+
21
+ # Jupyter
22
+ .ipynb_checkpoints/
23
+
24
+ # Build
25
+ dist/
26
+ build/
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
31
+
32
+
33
+ # ignore dockerfile for github upload.
34
+ Dockerfile
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Sher Alam
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,12 +1,160 @@
1
  ---
2
- title: Resume Intelligence Chat
3
- emoji: 🏆
4
- colorFrom: blue
5
- colorTo: indigo
6
- sdk: docker
7
- pinned: false
8
- license: mit
9
- short_description: LLM Powered Resume Data Extraction and Intelligence Chat app
10
  ---
 
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Resume-Intelligence-Chat
3
+ emoji: 📸
4
+ sdk: streamlit
5
+ sdk_version: 1.37.1
6
+ app_file: app.py
 
 
 
7
  ---
8
+ ## Resume Intelligence Chat
9
 
10
+ A Streamlit app that ingests resumes, structures them into a schema, stores them in a SQLite database, and enables natural language querying using LLM-generated SQL.
11
+
12
+ ---
13
+
14
+ ## Features
15
+
16
+ * **Resume Parsing**: Uses Docling to extract raw text from PDF resumes
17
+ * **Structured Extraction**: Pydantic AI agent converts text into a typed `Resume` schema
18
+ * **Database Storage**: Extracted data is stored in SQLite
19
+ * **Natural Language Queries**:
20
+
21
+ * User asks a question
22
+ * LLM generates SQL query
23
+ * Query runs on database
24
+ * Result is fed back to LLM for final answer
25
+ * **Chat Interface**: Streamlit-based conversational UI
26
+ * **Database Control**: Option to delete/reset database
27
+
28
+ ---
29
+
30
+ ## Architecture
31
+
32
+ ```
33
+ PDF Resume
34
+
35
+ Docling Parser
36
+
37
+ Pydantic AI Agent → Resume Schema
38
+
39
+ SQLite Database
40
+
41
+ User Query (NL)
42
+
43
+ LLM → SQL Query
44
+
45
+ Execute on DB
46
+
47
+ LLM → Final Answer
48
+ ```
49
+
50
+ ---
51
+
52
+ ## Project Structure
53
+
54
+ ```
55
+ .
56
+ ├── app.py
57
+ ├── core/
58
+ │ ├── ingestion/
59
+ │ │ └── docling_loader.py
60
+ │ ├── parsing/
61
+ │ │ ├── extractor.py
62
+ │ │ └── schema.py
63
+ │ ├── processing/
64
+ │ │ └── database.py
65
+ │ └── chains.py
66
+ │ ├── generate_sql_query()
67
+ │ └── generate_nl_answer()
68
+ ├── data/db/
69
+ │ └── resumes.db
70
+ ```
71
+
72
+ ---
73
+
74
+ ## Setup
75
+
76
+ ### 1. Install dependencies
77
+
78
+ ```bash
79
+ pip install -r requirements.txt
80
+ ```
81
+
82
+ ### 2. Run app
83
+
84
+ ```bash
85
+ streamlit run app.py
86
+ ```
87
+
88
+ ---
89
+
90
+ ## Usage
91
+
92
+ ### Upload & Index
93
+
94
+ 1. Upload one or more PDF resumes
95
+ 2. Click **"Process & Index Resumes"**
96
+ 3. Data is parsed, structured, and stored
97
+
98
+ ### Query
99
+
100
+ * Ask questions like:
101
+
102
+ * `List all candidate names`
103
+ * `List complex projects with candidate name`
104
+ * `Show candidates with 5+ years experience`
105
+
106
+ ### Database Reset
107
+
108
+ * Check **Confirm delete database**
109
+ * Click **Delete Database**
110
+
111
+ ---
112
+
113
+ ## Core Components
114
+
115
+ ### 1. Docling Loader
116
+
117
+ Extracts clean text from PDF resumes.
118
+
119
+ ### 2. Resume Extractor
120
+
121
+ Uses Pydantic AI agent to map text → structured `Resume` object.
122
+
123
+ ### 3. SQLite Storage
124
+
125
+ Stores structured resume data for querying.
126
+
127
+ ### 4. SQL Generator
128
+
129
+ LLM converts user query → SQL statement.
130
+
131
+ ### 5. Answer Generator
132
+
133
+ LLM converts DB results → natural language response.
134
+
135
+ ---
136
+
137
+ ## Safety Checks
138
+
139
+ * Only allows `SELECT` / `WITH` queries
140
+ * Rejects irrelevant or unsafe queries
141
+ * Handles empty results (`NO_DATA`)
142
+
143
+ ---
144
+
145
+ ## Notes
146
+
147
+ * Minimizes LLM calls by separating SQL generation and response generation
148
+ * Works with multiple resumes
149
+ * Can be pointed at local/offline LLM setups (e.g., Ollama); the default configuration in `core/llm_model.py` uses GitHub Models
150
+
151
+ ---
152
+
153
+ ## Future Improvements
154
+
155
+ * Human-in-the-loop SQL approval
156
+ * Multi-table schema support
157
+ * Better handling of missing/empty fields
158
+ * Query caching for performance
159
+
160
+ ---
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import ast
2
+
3
+ import streamlit as st
4
+ import tempfile
5
+ import os
6
+
7
+ from core.chains import generate_sql_query, generate_nl_answer
8
+ from core.ingestion.docling_loader import load_and_convert_cv
9
+ from core.parsing.extractor import extract_resume
10
+ from core.parsing.schema import Resume
11
+ from core.processing.database import resume_to_sqlite
12
+ from langchain_community.utilities import SQLDatabase
13
+
14
+
15
@st.cache_resource
def get_db():
    """Return the LangChain ``SQLDatabase`` handle for the resume database.

    The handle is cached with ``st.cache_resource`` so it is created once and
    reused across Streamlit reruns.

    Returns:
        SQLDatabase: wrapper around ``data/db/resumes.db``.
    """
    # SQLite creates a missing database file but not a missing parent
    # directory. Without this, chatting or deleting before the first upload
    # fails with "unable to open database file".
    os.makedirs("data/db", exist_ok=True)
    return SQLDatabase.from_uri("sqlite:///data/db/resumes.db")
18
+
19
st.set_page_config(page_title="Resume AI Assistant", layout="wide")
st.title("🤖 Resume Intelligence Chat")

# Initialize chat history (kept in the session so it survives Streamlit reruns)
if "messages" not in st.session_state:
    st.session_state.messages = []

# Sidebar: resume upload/indexing and database reset
with st.sidebar:
    st.header("Upload Center")
    uploaded_files = st.file_uploader(
        "Upload PDF Resumes",
        type=["pdf"],
        accept_multiple_files=True
    )

    if uploaded_files and st.button("Process & Index Resumes"):
        os.makedirs("data/db", exist_ok=True)
        with st.spinner(f"Indexing {len(uploaded_files)} resumes..."):
            for uploaded_file in uploaded_files:
                # The loader takes a file path, so spill the upload to a temp file.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                    tmp.write(uploaded_file.getbuffer())
                    pdf_path = tmp.name
                # The temp file is closed (and flushed) here, before it is read back.
                try:
                    text = load_and_convert_cv(pdf_path)
                    data: Resume = extract_resume(text)
                    resume_to_sqlite(data, "data/db/resumes.db")
                finally:
                    # delete=False above means we are responsible for cleanup.
                    os.remove(pdf_path)
        st.success("Indexing complete!")

    st.divider()

    confirm = st.checkbox("Confirm delete database")
    # DELETE DB BUTTON
    if st.button("🗑️ Delete Database"):
        if confirm:
            raw_tables = get_db().run("select name from sqlite_master where type='table';")
            # Example of the returned string:
            # "[('resume_base',), ('contact',), ('certifications',), ('education',), ('experience',), ('projects',)]"
            # When there are no tables the result is an empty string, and
            # ast.literal_eval("") raises SyntaxError, so guard that case.
            table_rows = ast.literal_eval(raw_tables) if raw_tables else []
            for (table_name,) in table_rows:
                # Quote the identifier so unusual table names cannot break
                # (or alter) the DROP statement.
                quoted_name = '"' + str(table_name).replace('"', '""') + '"'
                get_db().run(f"drop table if exists {quoted_name};")
            st.success("All tables dropped successfully.")
        else:
            st.warning("Please confirm deletion first.")

# Display chat history
for message in st.session_state.messages:
    with st.chat_message(message["role"]):
        st.markdown(message["content"])

# Chat Input
if prompt := st.chat_input("Ask about your resumes (e.g., 'List all candidates names')"):
    # Add user message to history
    st.session_state.messages.append({"role": "user", "content": prompt})
    with st.chat_message("user"):
        st.markdown(prompt)

    # Generate response
    with st.chat_message("assistant"):
        with st.spinner("Analyzing database..."):
            try:
                # create sql query for the user query
                sql_query = generate_sql_query(prompt).strip()
                # sql_query = "select name from resume_base;" # fake query
                print('sql_query: ', sql_query)
                if sql_query == "IRRELEVANT QUERY":
                    response = "This question is outside the resume database scope."
                elif not sql_query.upper().startswith(("SELECT", "WITH")):
                    # Safety check: only read-style statements are executed.
                    raise ValueError("Invalid SQL generated")
                else:
                    db_result = get_db().run(sql_query)
                    if not db_result:
                        # Sentinel handed to the answer chain for empty results.
                        db_result = "NO_DATA"
                    # create a natural language response based on db results.
                    response = generate_nl_answer(prompt, db_result)
                    # response = db_result # fake response
                    print('response generated')
                st.markdown(response)
                st.code(sql_query, language="sql", width="content")  # show query
                st.session_state.messages.append({"role": "assistant", "content": response})
            except Exception as e:
                # Top-level UI boundary: surface any failure to the user.
                error_msg = f"Sorry, I ran into an error: {str(e)}"
                st.error(error_msg)
core/chains.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain_core.prompts import ChatPromptTemplate
3
+ from langchain_core.runnables import RunnableLambda
4
+ from core.llm_model import model2 as llm
5
+
6
+
7
# System prompt for the text-to-SQL step. SCHEMA and RELATIONS must mirror the
# tables written by core/processing/database.py.
system_prompt_sql = '''
You are a SQLite SQL generator for a resume database.

SCHEMA:
resume_base(resume_id, name, summary)
contact(resume_id, email, phone, linkedin, github, hugging_face, kaggle)
certifications(resume_id, certification_name)
education(resume_id, institution, degree, start_date, end_date)
experience(resume_id, title, company, start_date, end_date)
projects(resume_id, name, description, technologies, url, difficulty_score)

RELATIONS:
resume_base.resume_id = contact.resume_id
resume_base.resume_id = certifications.resume_id
resume_base.resume_id = education.resume_id
resume_base.resume_id = experience.resume_id
resume_base.resume_id = projects.resume_id

TASK:
Convert user query → valid SQLite SQL.

OUTPUT RULES:
- One line only
- Only SQL
- No markdown, no backticks, no text

RELEVANCE:
- If not answerable from schema → IRRELEVANT QUERY

QUERY RULES:
- Use COLLATE NOCASE on searches
- Use only listed tables/columns
- Use explicit JOIN when needed
- Always join on resume_id
- No invented schema
'''

# Prompt template: fixed system instructions plus the user's question.
primary_template = ChatPromptTemplate.from_messages([
    ("system", system_prompt_sql),
    ("human", "Query: {user_query}")
])

# prompt -> LLM -> text content of the model's reply
primary_chain = primary_template | llm | RunnableLambda(lambda response: response.content)
49
+
50
def generate_sql_query(user_query):
    """Run the SQL-generation chain for a natural-language question.

    Args:
        user_query: The question typed by the user.

    Returns:
        The text content produced by ``primary_chain`` (per its system prompt,
        either a SQL statement or the marker ``IRRELEVANT QUERY``).
    """
    chain_input = {"user_query": user_query}
    return primary_chain.invoke(chain_input)
54
+
55
+
56
+
57
# System prompt for the answer-generation step. app.py substitutes the literal
# sentinel NO_DATA when the SQL query returns nothing, so the prompt names it.
system_prompt_analyst = '''
You are a data analyst.

INPUT:
- User question
- SQL query result (from database)

TASK:
- Generate a clear natural language answer based ONLY on the SQL result
- If result is empty or is NO_DATA, say no matching data found
- Generate short and concise answer in markdown format
- Do NOT generate SQL
- Do NOT hallucinate missing data
'''


# Secondary Chain
secondary_template = ChatPromptTemplate.from_messages([
    ("system", system_prompt_analyst),
    ("human", "Query: {user_query}\nResults: {db_results}")
])

# prompt -> LLM -> text content of the model's reply
secondary_chain = secondary_template | llm | RunnableLambda(lambda response: response.content)
80
+
81
def generate_nl_answer(user_query, db_results):
    """Turn a database result into a natural-language answer.

    Args:
        user_query: The question typed by the user.
        db_results: The query result (or sentinel) to base the answer on.

    Returns:
        The text content produced by ``secondary_chain``.
    """
    chain_input = {"user_query": user_query, "db_results": db_results}
    return secondary_chain.invoke(chain_input)
86
+
87
+
core/ingestion/docling_loader.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from pathlib import Path
3
+ from docling.document_converter import DocumentConverter, PdfFormatOption
4
+
5
+
6
@st.cache_resource
def get_converter():
    """
    Initializes and caches the Docling DocumentConverter.
    This ensures models are only loaded once across app reruns.

    Returns:
        DocumentConverter: converter configured to process PDFs without OCR.
    """
    # Imported locally so this block is self-contained; both names come from
    # the docling package this module already depends on.
    from docling.datamodel.base_models import InputFormat
    from docling.datamodel.pipeline_options import PdfPipelineOptions

    # Docling's documented configuration keys format_options by the
    # InputFormat enum and switches OCR off through the pipeline options
    # (do_ocr). The previous string key "pdf" / enable_ocr=False form does not
    # match that API, so OCR was likely still enabled.
    pdf_pipeline_options = PdfPipelineOptions(do_ocr=False)
    return DocumentConverter(
        format_options={
            InputFormat.PDF: PdfFormatOption(pipeline_options=pdf_pipeline_options)
        }
    )
17
+
18
+
19
def load_and_convert_cv(file_path: str) -> str:
    """
    Convert a document file to plain text using Docling.

    Args:
        file_path (str): The local path to the uploaded CV file.

    Returns:
        str: The document content as produced by Docling's ``export_to_text()``.
    """
    conversion = get_converter().convert(file_path)
    return conversion.document.export_to_text()
core/llm_model.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from langchain_openai import ChatOpenAI
3
+
4
+ from pydantic_ai.models.openai import OpenAIChatModel
5
+ from pydantic_ai.providers.openai import OpenAIProvider
6
+ import os
7
+ from dotenv import load_dotenv
8
+
9
+
10
# Load variables from a local .env file. Unnecessary when deployed on a
# Hugging Face Space, where the secret is supplied by the platform.
load_dotenv()

# Indexing os.environ raises KeyError when the token is missing.
# (Alternative secret name used previously: HF_TOKEN.)
api_key = os.environ["GITHUB_TOKEN"]


# OpenAI-compatible provider pointed at the GitHub Models endpoint,
# authenticated with the GitHub personal access token.
github_provider = OpenAIProvider(
    api_key=api_key,
    base_url="https://models.inference.ai.azure.com",
)

# Pydantic AI model (used for structured resume extraction).
# Other model IDs: 'meta-llama-3.1-70b-instruct', 'DeepSeek-R1', ...
model1 = OpenAIChatModel("gpt-4o", provider=github_provider)


# LangChain chat model (used by the SQL / answer chains).
# Other models such as "meta-llama-3.1-70b-instruct" can be used as well.
model2 = ChatOpenAI(
    base_url="https://models.github.ai/inference",
    openai_api_key=api_key,
    model="gpt-4o",
)
37
+
38
+
39
+
40
+
41
+ # from pydantic_ai import Agent
42
+ # from pydantic_ai.models.openai import OpenAIChatModel
43
+ # from pydantic_ai.providers.openai import OpenAIProvider
44
+
45
+ # model1 = OpenAIChatModel(
46
+ # # model="qwen2.5:7b-instruct",
47
+ # "qwen2.5-7b-instruct-q4_k_m",
48
+ # provider=OpenAIProvider(
49
+ # base_url="http://localhost:11434/v1", # 👈 Ollama
50
+ # api_key="ollama" # dummy
51
+ # )
52
+ # )
53
+
54
+
55
+ # from langchain_ollama import ChatOllama
56
+
57
+ # model2 = ChatOllama(
58
+ # model="Dolphin_SQL"
59
+ # )
60
+
61
+
62
+
63
+ # from pydantic_ai.models.huggingface import HuggingFaceModel
64
+ # from pydantic_ai.providers.openai import OpenAIProvider
65
+ # from dotenv import load_dotenv
66
+ # import os
67
+ # from langchain_openai import ChatOpenAI
68
+
69
+
70
+ # load_dotenv() # unnecessary if deployed on huggingface space.
71
+ # api_key = os.environ["HF_TOKEN"] # raises error if missing
72
+
73
+
74
+ # model1 = HuggingFaceModel(
75
+ # 'Qwen/Qwen2.5-7B-Instruct',
76
+ # provider=OpenAIProvider(
77
+ # base_url="https://router.huggingface.co/v1",
78
+ # api_key=api_key
79
+ # )
80
+ # )
81
+
82
+
83
+ # # Initialize using the OpenAI-compatible router
84
+ # model2 = ChatOpenAI(
85
+ # model='Qwen/Qwen2.5-7B-Instruct',
86
+ # openai_api_key=api_key,
87
+ # openai_api_base="https://router.huggingface.co/v1"
88
+ # )
core/parsing/extractor.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic_ai import Agent
2
+
3
+ from core.parsing.schema import Resume
4
+ from core.llm_model import model1
5
+
6
+
7
# Pydantic AI agent that maps raw resume text onto the Resume schema.
# The prompt is built with implicit string concatenation, so each fragment
# needs its own trailing space; without it the sentences run together
# ("...extractor.If the context...").
agent = Agent(
    model=model1,
    system_prompt=(
        'You are an expert resume extractor. '
        'If the context is not a Resume return null and DO NOT infer or hallucinate. '
        'Do NOT infer or hallucinate missing sections. '
        'If a section is not explicitly present, return null or empty list.'
    ),
    output_type=Resume
)
17
+
18
+
19
def extract_resume(text: str) -> Resume:
    '''
    Run the extraction agent synchronously on resume text.

    Args:
        text (str): Text extracted from a resume (e.g. by the Docling loader)

    Returns:
        Resume: The agent's structured output
    '''
    return agent.run_sync(text).output
core/parsing/schema.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pydantic import BaseModel, Field
2
+ from typing import List, Optional
3
+
4
+
5
+ # Nested models for detailed resume sections
6
class ContactInformation(BaseModel):
    # Contact details of the applicant; every field may be absent.
    # (Comments are used instead of a class docstring so the JSON schema
    # generated from this model is not given an extra description.)

    # Optional[...] matches the None default: with a plain `str` annotation
    # an explicit null email from the extractor fails validation.
    email: Optional[str] = Field(None, description="Email address.")
    phone: Optional[str] = Field(None, description='mobile number eg. +92 03011234567')
    linkedin: Optional[str] = None
    github: Optional[str] = None
    hugging_face: Optional[str] = None
    kaggle: Optional[str] = None
13
+
14
+
15
class Education(BaseModel):
    # One education entry: institution and degree are required,
    # the dates are optional free-form strings.
    institution: str
    degree: str
    start_date: Optional[str] = Field(default=None)
    end_date: Optional[str] = Field(default=None)
20
+
21
+
22
class Experience(BaseModel):
    # One work-experience entry: title and company are required,
    # the dates are optional free-form strings.
    title: str = Field(..., description="Job role/title.")
    company: str = Field(..., description="Name of the company or organization.")
    start_date: Optional[str] = Field(default=None)
    end_date: Optional[str] = Field(default=None)
27
+
28
+
29
class Project(BaseModel):
    # One project entry, including a model-assigned complexity score (1-10).
    name: str = Field(description="Name of a project.")
    description: str = Field(description="Project Description")
    # Optional[...] matches the None default: with a plain List[str]
    # annotation an explicit null from the extractor fails validation.
    technologies: Optional[List[str]] = None
    url: Optional[str] = None
    difficulty_score: int = Field(
        ...,
        ge=1,
        le=10,
        description=(
            "Strictly evaluate AI engineering complexity. "
            "1-3: Simple 'wrapper' apps, basic prompting, or out-of-the-box RAG with a single data source. "
            "4-6: Production-grade apps with persistent memory, multi-step tool use (agents), "
            "complex data parsing (PDFs/Tables), or basic fine-tuning for style. "
            "7-8: Advanced architectures featuring multi-agent orchestration, self-healing loops, "
            "complex hybrid search (vector + keyword), or custom evaluation frameworks (LLM-as-a-judge). "
            "9-10: Highly complex, mission-critical systems with real-time streaming, "
            "multi-modal integration, or heavy optimization for cost and latency at scale. "
            "If the project only uses a single API call without complex logic, it must not exceed 3."
        )
    )
50
+
51
+ # Main AI Developer Resume Schema
52
class Resume(BaseModel):
    # Top-level resume record produced by the extraction agent.
    # Fields declared without a default are required; education and
    # experience are required but may be null.
    full_name: str = Field(description="Full name of the applicant.")
    contact: ContactInformation
    summary: str = Field(description="Professional summary focusing on AI/ML.")
    education: Optional[List[Education]] = Field(
        description="List of educational degrees. Return null if not explicitly present."
    )
    experience: Optional[List[Experience]] = Field(
        description="List of experiences. Return null if not explicitly present."
    )
    ai_ml_skills: List[str] = Field(description="Specific AI/ML skills (e.g., LLMs, Computer Vision).")
    technical_skills: List[str] = Field(description="Programming languages and tools.")
    projects: Optional[List[Project]] = Field(default=None)
    certifications: Optional[List[str]] = Field(default=None)
core/processing/database.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import sqlite3
3
+ import uuid
4
+ from core.parsing.schema import Resume
5
+
6
def resume_to_sqlite(resume: "Resume", db_path: str = "resumes.db"):
    """Append one parsed resume to the SQLite database.

    A short random ``resume_id`` is generated and written as the first column
    of every table so rows belonging to the same resume can be joined.

    Tables written: ``resume_base`` (always) and, when non-empty, ``contact``,
    ``certifications``, ``education``, ``experience`` and ``projects``.

    Args:
        resume: Object exposing ``model_dump()`` that returns the resume as a dict.
        db_path: Path of the SQLite file; tables are created on first use.

    Returns:
        str: The generated 8-character resume id.
    """
    from contextlib import closing

    r = resume.model_dump()
    resume_id = str(uuid.uuid4())[:8]

    # 1. Prepare Data
    base_data = {
        "resume_id": resume_id,
        "name": r.get("full_name"),
        "summary": r.get("summary"),
    }

    # Helper to create a DataFrame for one section, prefixed with resume_id
    def create_df(key):
        data = r.get(key) or []
        if key == "projects":
            # SQLite cannot store a list: flatten technologies into one string.
            rows = [
                {**p, "technologies": ", ".join(p["technologies"])}
                if isinstance(p.get("technologies"), list)
                else p
                for p in data
            ]
            df = pd.DataFrame(rows)
        elif key == "contact":
            # contact is a single dict -> one row
            df = pd.DataFrame([data])
        elif key == "certifications":
            # certifications is a plain list of strings. Name the column
            # explicitly; otherwise pandas labels it 0 and the table does not
            # match certifications(resume_id, certification_name) that the
            # SQL-generation prompt queries.
            df = pd.DataFrame({"certification_name": list(data)})
        else:
            df = pd.DataFrame(data)
        if not df.empty:
            df.insert(0, 'resume_id', resume_id)
        return df

    # 2. Write to SQLite
    # sqlite3's own context manager only commits/rolls back; closing() makes
    # sure the connection itself is released.
    with closing(sqlite3.connect(db_path)) as conn:
        with conn:
            # Save base info
            pd.DataFrame([base_data]).to_sql("resume_base", conn, if_exists="append", index=False)
            # Save nested sections
            # NOTE(review): the model dump has no "skills" key (only
            # ai_ml_skills / technical_skills), so the "skills" entry never
            # produces a table - confirm whether skills should be stored.
            tables = ["contact", "skills", "certifications", "education", "experience", "projects"]
            for table in tables:
                df = create_df(table)
                if not df.empty:
                    df.to_sql(table, conn, if_exists="append", index=False)

    return resume_id
core/processing/dataframe.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from core.parsing.schema import Resume
3
+ import uuid
4
+
5
+
6
def resume_to_dfs(resume: "Resume"):
    """Convert a parsed resume into a dict of pandas DataFrames.

    Args:
        resume: Object exposing ``model_dump()`` that returns the resume as a dict.

    Returns:
        dict[str, pandas.DataFrame]: frames keyed ``base``, ``skills``,
        ``certifications``, ``education``, ``experience`` and ``projects``.
        Every frame has ``resume_id`` as its first column, holding the same
        short random id.
    """
    r = resume.model_dump()

    # Flattens the top-level fields and the nested 'contact' dict
    base_data = {
        "name": r.get("full_name"),
        **{f"contact_{k}": v for k, v in (r.get("contact") or {}).items()},
        "summary": r.get("summary"),
    }

    # The 'technologies' list is joined into one string for the table view.
    # New dicts are built so the dumped data is not mutated while iterating.
    projects = [
        {**p, "technologies": ", ".join(p["technologies"])}
        if isinstance(p.get("technologies"), list)
        else p
        for p in (r.get("projects") or [])
    ]

    dfs = {
        "base": pd.DataFrame([base_data]),
        # NOTE(review): the model dump has no "skills" key (only ai_ml_skills /
        # technical_skills), so this frame is always empty - confirm intent.
        "skills": pd.DataFrame(r.get("skills") or []),
        # certifications is a plain list of strings: name the column
        # explicitly instead of letting pandas label it 0.
        "certifications": pd.DataFrame(
            {"certification_name": list(r.get("certifications") or [])}
        ),
        "education": pd.DataFrame(r.get("education") or []),
        "experience": pd.DataFrame(r.get("experience") or []),
        "projects": pd.DataFrame(projects),
    }

    resume_id = str(uuid.uuid4())[:8]  # Short unique ID
    # Iterate the frames, not the dict keys: iterating the dict itself yields
    # the string keys, and str has no .insert().
    for df in dfs.values():
        df.insert(0, 'resume_id', resume_id)

    return dfs
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ docling==2.82.0
2
+ langchain_community==0.4.1
3
+ langchain_core==1.2.31
4
+ langchain_openai==1.1.14
5
+ pandas==3.0.2
6
+ pydantic==2.13.1
7
+ pydantic_ai==1.83.0
8
+ python-dotenv==1.2.2
9
+ streamlit==1.55.0