earlsab committed on
Commit
b34a619
·
1 Parent(s): c04d3f7

Broken; Committed for testing

Browse files
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ models_lfs
ThesisFinal_(Added_Implementation_of_Quality)-10.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
__pycache__/date_extraction_model.cpython-310.pyc ADDED
Binary file (666 Bytes). View file
 
__pycache__/model_date_extraction.cpython-310.pyc ADDED
Binary file (666 Bytes). View file
 
__pycache__/model_section_segmentation.cpython-310.pyc ADDED
Binary file (8.48 kB). View file
 
__pycache__/model_section_sementation.cpython-310.pyc ADDED
Binary file (2.56 kB). View file
 
__pycache__/model_skill_extraction.cpython-310.pyc ADDED
Binary file (1.58 kB). View file
 
__pycache__/model_skill_quality_extraction.cpython-310.pyc ADDED
Binary file (1.39 kB). View file
 
__pycache__/resume_analysis_model.cpython-310.pyc ADDED
Binary file (2.25 kB). View file
 
__pycache__/skill_extraction_model.cpython-310.pyc ADDED
Binary file (1.58 kB). View file
 
__pycache__/skill_quality_extraction_model.cpython-310.pyc ADDED
Binary file (1.39 kB). View file
 
app.py CHANGED
@@ -1,8 +1,143 @@
1
  import gradio as gr
2
 
3
- def greet(name):
4
- return "Hello " + name + "!!"
 
 
5
 
6
- demo = gr.Interface(fn=greet, inputs="text", outputs="text")
7
- demo.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
 
1
  import gradio as gr
2
 
3
+ from model_skill_extraction import SkillExtractionModel
4
+ from model_section_segmentation import SegmentationModelJobDescription, SegmentationModelResume
5
+ from model_skill_quality_extraction import SkillQualityExtractionModel
6
+ from model_date_extraction import DateExtractionModel
7
 
8
+ import json
9
+ from typing import List, Dict
10
+ import time
11
+
12
# Build one shared instance of every model at import time so that all
# requests handled by this process reuse them.
skill_model = SkillExtractionModel()
skill_quality_model = SkillQualityExtractionModel()
date_model = DateExtractionModel()
segmentation_model_resume = SegmentationModelResume()
segmentation_model_job_description = SegmentationModelJobDescription()
18
+
19
def process_job_description(job_description: str) -> Dict:
    """Run skill extraction over a job description.

    Args:
        job_description: Raw job-description text.

    Returns:
        The dictionary produced by ``skill_model.process_text`` for that text.
    """
    return skill_model.process_text(job_description)
23
+
24
def process_resume(resume_text: str, job_skills: List[str]) -> Dict:
    """Run skill extraction over a single resume.

    Args:
        resume_text: Raw resume text.
        job_skills: Skills required by the job. Accepted for interface
            stability but not used yet: the resume is only passed through
            the skill extractor, it is not compared against these skills.

    Returns:
        The dictionary produced by ``skill_model.process_text`` for the resume.
    """
    # TODO: compare against job_skills once a resume-analysis model is wired in.
    extracted = skill_model.process_text(resume_text)
    return extracted
29
+
30
def create_html_output(job_result: Dict, resume_results: List[Dict]) -> str:
    """Render the analysis results as a single HTML fragment.

    Args:
        job_result: Result for the job description. Must provide
            ``'total_skills'`` and ``'skills'``, where each skill is a mapping
            with a ``'text'`` entry.
        resume_results: One result per resume. Only the number of entries is
            used for now: each resume gets a numbered card whose
            "Matched Skills" and "View Full Resume" areas are still empty
            placeholders.

    Returns:
        The HTML markup as one string.
    """
    # Imported locally because the module does not import ``html`` at the top.
    from html import escape

    box_style = "background-color: #f0f0f0; padding: 10px; border-radius: 5px;"
    chip_style = (
        "background-color: #e0e0e0; padding: 2px 5px; margin: 2px; "
        "border-radius: 3px; display: inline-block;"
    )
    card_style = (
        "margin-bottom: 20px; padding: 10px; border: 1px solid #ddd; "
        "border-radius: 5px;"
    )

    # Collect fragments and join once instead of growing a string with +=.
    parts = ["<div style='font-family: Arial, sans-serif;'>"]

    # Job Description Section
    parts.append("<h2>Job Description Analysis</h2>")
    parts.append(
        "<p><strong>Total Skills Found:</strong> "
        f"{escape(str(job_result['total_skills']))}</p>"
    )
    parts.append("<p><strong>Skills:</strong></p>")
    parts.append(f"<div style='{box_style}'>")
    # Skill text originates from user-supplied documents, so escape it before
    # embedding it in markup.
    parts.extend(
        f"<span style='{chip_style}'>{escape(str(skill['text']))}</span>"
        for skill in job_result['skills']
    )
    parts.append("</div>")

    # Resume Analysis Section
    parts.append("<h2>Resume Analysis</h2>")
    for number, _resume_result in enumerate(resume_results, 1):
        parts.append(f"<div style='{card_style}'>")
        parts.append(f"<h3>Resume {number}</h3>")
        parts.append("<p><strong>Matched Skills:</strong></p>")
        parts.append(f"<div style='{box_style}'>")
        parts.append("</div>")
        parts.append("<details><summary>View Full Resume</summary>")
        parts.append("</details>")
        parts.append("</div>")

    parts.append("</div>")
    return "".join(parts)
62
+
63
def process_inputs(job_description: str, input_type: str, resume_text: str, resume_files: List[str]) -> str:
    """Analyse a job description together with one or more resumes.

    Args:
        job_description: Raw job-description text.
        input_type: ``"Paste Text"`` to analyse ``resume_text``; any other
            value analyses the uploaded files instead.
        resume_text: Resume text pasted by the user.
        resume_files: Paths of uploaded UTF-8 text files. ``None`` or an empty
            list is treated as "no resumes uploaded".

    Returns:
        The HTML report built by ``create_html_output``.
    """
    job_result = process_job_description(job_description)

    # The required-skill list is identical for every resume, so build it once.
    job_skills = [skill['text'] for skill in job_result['skills']]

    if input_type == "Paste Text":
        resume_texts = [resume_text]
    else:
        resume_texts = []
        # The upload component may hand over None when nothing was uploaded;
        # fall back to an empty list instead of failing on iteration.
        for file_path in resume_files or []:
            with open(file_path, 'r', encoding='utf-8') as f:
                resume_texts.append(f.read())

    resume_results = [process_resume(text, job_skills) for text in resume_texts]
    return create_html_output(job_result, resume_results)
84
+
85
# Create Gradio interface
with gr.Blocks(title="Resume Analysis System") as demo:
    gr.Markdown("# Beyond Keywords: Job Description and Resume Analyzer")
    gr.Markdown("Upload a job description and resume(s) to analyze skill matches and quality.")

    with gr.Row():
        with gr.Column():
            job_description = gr.Textbox(
                label="Job Description",
                placeholder="Paste the job description here...",
                # Row counts are whole numbers; the previous 13.10 was a float.
                lines=13,
            )

        with gr.Column():
            resume_input = gr.Group()
            with resume_input:
                input_type = gr.Radio(
                    choices=["Paste Text", "Upload File"],
                    label="Input Method",
                    value="Paste Text",
                )
                resume_text = gr.Textbox(
                    label="Resume Text",
                    placeholder="Paste the resume text here...",
                    # Previously 8.85; rounded to the nearest whole row.
                    lines=9,
                    visible=True,
                )
                resume_file = gr.Files(
                    label="Upload Resume(s) (.txt files)",
                    file_types=[".txt"],
                    visible=False,
                    interactive=True,
                    type="filepath",
                )

    def toggle_input(choice):
        """Show only the resume input that matches the selected input method."""
        return {
            resume_text: gr.update(visible=choice == "Paste Text"),
            resume_file: gr.update(visible=choice == "Upload File"),
        }

    input_type.change(
        fn=toggle_input,
        inputs=input_type,
        outputs=[resume_text, resume_file],
    )

    submit_btn = gr.Button("Analyze")
    output = gr.HTML(label="Analysis Results")

    submit_btn.click(
        fn=process_inputs,
        inputs=[job_description, input_type, resume_text, resume_file],
        outputs=output,
    )

if __name__ == "__main__":
    demo.launch()
143
 
model_date_extraction.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
class DateExtractionModel:
    """Placeholder for the date-extraction model; nothing is loaded yet."""

    def __init__(self):
        # Every handle starts unset; load_model() is the hook that is meant
        # to populate them once an implementation exists.
        self.model = self.tokenizer = self.device = None
        self.load_model()

    def load_model(self):
        """Load the model resources (not implemented; currently a no-op)."""
        return None
model_section_segmentation.py ADDED
@@ -0,0 +1,243 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import List, Tuple
3
+ import torch.nn as nn
4
+ from transformers import LongformerModel
5
+ from transformers import LongformerTokenizer
6
+ import torch.nn.functional as F
7
+
8
class LongformerSentenceClassifier(nn.Module):
    """Longformer encoder with a linear head applied at selected token positions."""

    def __init__(self, model_name="allenai/longformer-base-4096", num_labels=13):
        """Build the encoder and the classification head.

        Args:
            model_name (str): Hugging Face identifier of the Longformer model.
            num_labels (int): Number of possible sentence labels.
        """
        super().__init__()
        self.longformer = LongformerModel.from_pretrained(model_name)
        hidden_size = self.longformer.config.hidden_size
        self.classifier = nn.Linear(hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, global_attention_mask, cls_positions):
        """Classify the hidden state found at each requested position.

        Args:
            input_ids (Tensor): Tokenized input IDs, shape (batch_size, max_length).
            attention_mask (Tensor): Attention mask, shape (batch_size, max_length).
            global_attention_mask (Tensor): Global attention mask,
                shape (batch_size, max_length).
            cls_positions (Tensor): Indices of the sentence-marker tokens; it is
                reshaped to (batch_size, num_positions) before use.

        Returns:
            Tensor: Logits of shape (batch_size, num_positions, num_labels).
        """
        batch_size = input_ids.shape[0]
        hidden = self.longformer(
            input_ids=input_ids,
            attention_mask=attention_mask,
            global_attention_mask=global_attention_mask,
        ).last_hidden_state

        positions = cls_positions.view(batch_size, -1)
        # Repeat each index across the hidden dimension so gather() pulls the
        # whole hidden vector at that sequence position.
        gather_index = positions.unsqueeze(-1).expand(-1, -1, hidden.size(-1))
        marker_embeddings = hidden.gather(1, gather_index)
        return self.classifier(marker_embeddings)
42
+
43
+
44
class SegmentationModelJobDescription:
    """Line-level section classifier for job descriptions.

    Each non-blank line of a job description is prefixed with the tokenizer's
    CLS token, the lines are joined into one sequence, and a
    ``LongformerSentenceClassifier`` predicts one section label per CLS token.
    """

    def __init__(self, weights_path: str = "models_lfs/JobSegmentClassifier3rdEpoch_v2.pth"):
        """Load the tokenizer, build the classifier and load its trained weights.

        Args:
            weights_path: Location of the trained state dict. The default uses
                the ``models_lfs`` directory, matching the ``.gitignore`` entry
                and the paths used by the skill-quality model (the previous
                hard-coded value pointed at ``model_lfs``).
        """
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Label mapping as provided
        self.Job_label_map = {
            "JT": 0,    # Job Title
            "JS": 1,    # Job Summary
            "COT": 2,   # Title of Company Overview Section
            "COC": 3,   # Content of Company Overview Section
            "RT": 4,    # Title of Responsibilities Section
            "RC": 5,    # Content of Responsibilities Section
            "RQT": 6,   # Title of Required Qualifications Section
            "RQC": 7,   # Content of Required Qualifications Section
            "PQT": 8,   # Title of Preferred Qualifications Section
            "PQC": 9,   # Content of Preferred Qualifications Section
            "ET": 10,   # Employment Type
            "SBC": 11,  # Content of Salary and Benefits Section
            "SBT": 12,  # Title of Salary and Benefits Section
        }
        self.Job_num_labels = len(self.Job_label_map)
        # Same labels in index order, with human-readable descriptions.
        self.Job_labels = [
            {"value": "JT", "label": "Job Title"},
            {"value": "JS", "label": "Job Summary"},
            {"value": "COT", "label": "Title of Company Overview Section"},
            {"value": "COC", "label": "Content of Company Overview Section"},
            {"value": "RT", "label": "Title of Responsibilites Section"},
            {"value": "RC", "label": "Content of Responsibilites Section"},
            {"value": "RQT", "label": "Title of Required Qualifications Section"},
            {"value": "RQC", "label": "Content of Required Qualifications Section"},
            {"value": "PQT", "label": "Title of Preferred Qualifications Section"},
            {"value": "PQC", "label": "Content of Preferred Qualifications Section"},
            {"value": "ET", "label": "Employment Type"},
            {"value": "SBC", "label": "Content of Salary and Benefits Section"},
            {"value": "SBT", "label": "Title of Salary and Benefits Section"},
        ]

        # Keep tokenizer and model on the instance: the extract_* methods read
        # self.Job_tokenizer and self.Job_model.
        self.Job_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
        self.Job_model = LongformerSentenceClassifier(num_labels=self.Job_num_labels)
        # map_location lets weights saved on a GPU load on a CPU-only host.
        self.Job_model.load_state_dict(torch.load(weights_path, map_location=self.device))
        self.Job_model.to(self.device)

        # Set model to evaluation mode
        self.Job_model.eval()

    def _prepare(self, text: str) -> Tuple[List[str], str]:
        """Return the non-blank lines of *text* and their CLS-prefixed concatenation."""
        lines = [line for line in text.splitlines() if line.strip()]
        cls_token = self.Job_tokenizer.cls_token
        concatenated_text = " ".join(f"{cls_token} {sentence}" for sentence in lines)
        return lines, concatenated_text

    def segment(self, text: str) -> Tuple[List[int], List[str]]:
        """Segment a job description into labelled lines.

        Args:
            text (str): Text to segment

        Returns:
            Tuple containing:
                - List of predicted section labels as integers
                - List of non-blank text lines
        """
        lines, concatenated_text = self._prepare(text)
        return self._predict_sections(concatenated_text), lines

    def _predict_sections(self, text: str) -> List[int]:
        """Predict section labels for already CLS-prefixed text.

        Args:
            text (str): Text to make predictions on

        Returns:
            List of predicted section labels as plain integers
        """
        predictions = self.predict_job_sections(
            self.Job_model, text, self.Job_tokenizer, self.device
        )
        return [int(label) for label in predictions]

    @staticmethod
    def predict_job_sections(model, text, tokenizer, device):
        """Predict one label index per CLS token found in *text*.

        Declared static because it takes no ``self``; it can be called either
        through an instance or through the class.

        Args:
            model: Classifier called with ``input_ids``, ``attention_mask``,
                ``global_attention_mask`` and ``cls_positions``.
            text: Text in which every sentence is prefixed with the CLS token.
            tokenizer: Tokenizer used to encode *text*.
            device: Device on which the tensors are placed.

        Returns:
            NumPy array with the predicted label index for each CLS token.
        """
        model.eval()

        # Tokenize text and get input tensors
        encoding = tokenizer(
            text,
            return_tensors="pt",
            truncation=True,
            padding="max_length",
            max_length=4096,
        )

        input_ids = encoding["input_ids"].to(device)
        attention_mask = encoding["attention_mask"].to(device)

        # Column indices of every CLS token in the single encoded sequence.
        cls_columns = (input_ids == tokenizer.cls_token_id).nonzero(as_tuple=True)[1]

        # Give the CLS tokens global attention.
        global_attention_mask = torch.zeros_like(input_ids)
        global_attention_mask[:, cls_columns] = 1

        cls_positions = cls_columns.unsqueeze(0).to(device)  # Shape: (1, num_sentences)

        # Run the model
        with torch.no_grad():
            logits = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                global_attention_mask=global_attention_mask,
                cls_positions=cls_positions,
            )  # Shape: (1, num_sentences, num_labels)

        logits = logits.squeeze(0)  # Shape: (num_sentences, num_labels)
        probs = F.softmax(logits, dim=-1)  # Convert logits to probabilities
        predictions = torch.argmax(probs, dim=-1)  # Get predicted label indices

        return predictions.cpu().numpy()  # Convert to NumPy array for easy use

    def extract_job_sections(self, text):
        """Predict a section label for the lines of a job description.

        Args:
            text: Raw job-description text.

        Returns:
            Tuple ``(predictions, lines)``: the NumPy array of predicted label
            indices and the list of non-blank lines.
        """
        lines, concatenated_text = self._prepare(text)
        predictions = self.predict_job_sections(
            self.Job_model, concatenated_text, self.Job_tokenizer, self.device
        )
        return predictions, lines

    def extract_job_requirements(self, text):
        """Return the lines classified as required-qualification content.

        Args:
            text: Raw job-description text.

        Returns:
            List of the lines whose predicted label is ``"RQC"``.
        """
        predictions, lines = self.extract_job_sections(text)

        # NOTE(review): the last prediction is dropped and prediction i is
        # paired with line i, as before. Presumably this accounts for an extra
        # CLS token added by the tokenizer itself - confirm the alignment
        # against the training setup.
        candidates = zip(predictions[:len(predictions) - 1], lines)
        return [
            line
            for label_index, line in candidates
            if self.Job_labels[label_index]['value'] == "RQC"
        ]
202
+
203
class SegmentationModelResume:
    """Placeholder resume segmenter; no model is wired in yet."""

    def __init__(self):
        pass

    def segment(self, text: str) -> Tuple[List[int], List[str]]:
        """Split a resume into non-blank lines without predicting labels.

        Args:
            text: Raw resume text.

        Returns:
            Tuple ``(labels, lines)``. ``labels`` is always empty until a
            resume model exists; ``lines`` holds the non-blank lines of
            *text*, so the return value matches the annotated type instead
            of being ``None``.
        """
        lines = [line for line in text.splitlines() if line.strip()]
        return [], lines
209
+
210
+
211
if __name__ == "__main__":
    # Manual smoke test: run both extraction entry points on a sample posting.
    sample_job_text = """
Senior Software Engineer

We are looking for an experienced developer to join our team.

Requirements:
- 5+ years Python experience
- Strong knowledge of ML/AI
- Excellent communication skills

Benefits:
- Competitive salary
- Remote work options
- Health insurance
"""

    job_segmenter = SegmentationModelJobDescription()
    resume_segmenter = SegmentationModelResume()

    # Section label predicted for each line.
    job_sections, job_text = job_segmenter.extract_job_sections(sample_job_text)
    print("\nJob Sections:")
    for label_index, line in zip(job_sections, job_text):
        label_code = job_segmenter.Job_labels[label_index]['value']
        print(f"{label_code}: {line}")

    # Lines classified as required qualifications.
    requirements = job_segmenter.extract_job_requirements(sample_job_text)
    print("\nJob Requirements:")
    for requirement in requirements:
        print(f"- {requirement}")
model_skill_extraction.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import spacy
2
+ from typing import List, Dict
3
+ import time
4
+
5
class SkillExtractionModel:
    """Rule-based skill tagger built on a spaCy pipeline with an entity ruler."""

    def __init__(self):
        """Load the spaCy pipeline and register the skill patterns."""
        self.nlp = spacy.load("en_core_web_sm")

        skill_phrases = [
            "python", "java", "javascript", "sql", "aws", "docker",
            "kubernetes", "git", "agile", "scrum", "jira", "confluence",
            "machine learning", "deep learning", "data analysis",
            "data science", "project management", "leadership",
        ]
        # Token patterns match one token per dict, so a multi-word phrase needs
        # one {"LOWER": word} entry per word. A single-token "IN" list holding
        # "machine learning" can never match, because no token contains a space.
        self.skill_patterns = [
            {"label": "SKILL", "pattern": [{"LOWER": word} for word in phrase.split()]}
            for phrase in skill_phrases
        ]

        # Run the ruler before the statistical NER so that entities predicted
        # by NER (e.g. "AWS" tagged as an organisation) cannot block SKILL spans.
        placement = {"before": "ner"} if "ner" in self.nlp.pipe_names else {}
        self.ruler = self.nlp.add_pipe("entity_ruler", **placement)
        self.ruler.add_patterns(self.skill_patterns)

    def extract_skills(self, text: str) -> List[Dict]:
        """Return every SKILL entity found in *text*.

        Args:
            text: Text to analyse.

        Returns:
            One dict per skill with the matched ``text``, its ``start`` and
            ``end`` character offsets, and the entity ``label``.
        """
        doc = self.nlp(text)
        return [
            {
                "text": ent.text,
                "start": ent.start_char,
                "end": ent.end_char,
                "label": ent.label_,
            }
            for ent in doc.ents
            if ent.label_ == "SKILL"
        ]

    def process_text(self, text: str) -> Dict:
        """Extract skills from *text* and package them with summary data.

        The artificial two-second sleep that used to "simulate model loading"
        on every call has been removed; the pipeline is loaded once in
        ``__init__``.

        Args:
            text: Text to analyse.

        Returns:
            Dict with the original ``text``, the list of ``skills`` and the
            ``total_skills`` count.
        """
        skills = self.extract_skills(text)
        return {
            "text": text,
            "skills": skills,
            "total_skills": len(skills),
        }
model_skill_quality_extraction.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
class SkillQualityExtractionModel:
    """Placeholder skill-quality model using case-insensitive substring matching."""

    # Configuration for the planned models. These values used to be
    # locals in load_model() that were discarded immediately; keeping them on
    # the class makes them available to a future loader.
    MODEL_NAME = "roberta-large-mnli"
    LEADERSHIP_MODEL_PATH = (
        "models_lfs/pet-leadership-model-roberta-large-mnli_bs4_gas4_lr3e-05_ep2"
        "/checkpoint-742"
    )
    COLLAB_MODEL_PATH = (
        "models_lfs/pet-collaboration-model-roberta-large-mnli_bs4_gas4_lr1e-05_ep3"
        "/checkpoint-936"
    )
    LEADERSHIP_PATTERN = "Sentence: {} Question: Does this show leadership? Answer: <mask>"
    COLLAB_PATTERN = "Sentence: {} Question: Does this show teamwork and collaboration? Answer: <mask>"

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = None
        self.load_model()

    def load_model(self):
        """Initialize the model and tokenizer (not implemented; currently a no-op)."""
        return None

    def process_resume(self, resume_text: str, required_skills: list) -> dict:
        """Score how many required skills are mentioned in a resume.

        Placeholder implementation: a skill counts as matched when its
        lower-cased form occurs as a substring of the lower-cased resume.

        Args:
            resume_text: Raw resume text.
            required_skills: Skill names to look for.

        Returns:
            Dict with ``'skill_quality'`` (``quality_score`` as the matched
            fraction, or 0 when no skills are required; ``matched_skills``;
            ``matched_skills_count``; ``total_required_skills``) and
            ``'full_text'`` holding the unchanged resume text.
        """
        # Lower-case the resume once rather than once per skill.
        haystack = resume_text.lower()
        matched_skills = [skill for skill in required_skills if skill.lower() in haystack]
        total_required = len(required_skills)
        quality_score = len(matched_skills) / total_required if total_required else 0

        return {
            'skill_quality': {
                'quality_score': quality_score,
                'matched_skills': matched_skills,
                'matched_skills_count': len(matched_skills),
                'total_required_skills': total_required,
            },
            'full_text': resume_text,
        }
33
+
34
+
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ spacy>=3.7.0
3
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
4
+ typing-extensions>=4.5.0
5
+ python-dateutil>=2.8.2