Spaces:

ejqs
/

beyond-keywords

Sleeping

App Files Files Community

earlsab committed on Mar 28, 2025

Commit

b34a619

1 Parent(s): c04d3f7

Broken; Committed for testing

Browse files

Files changed (17) hide show

.gitignore +1 -0
ThesisFinal_(Added_Implementation_of_Quality)-10.ipynb +0 -0
__pycache__/date_extraction_model.cpython-310.pyc +0 -0
__pycache__/model_date_extraction.cpython-310.pyc +0 -0
__pycache__/model_section_segmentation.cpython-310.pyc +0 -0
__pycache__/model_section_sementation.cpython-310.pyc +0 -0
__pycache__/model_skill_extraction.cpython-310.pyc +0 -0
__pycache__/model_skill_quality_extraction.cpython-310.pyc +0 -0
__pycache__/resume_analysis_model.cpython-310.pyc +0 -0
__pycache__/skill_extraction_model.cpython-310.pyc +0 -0
__pycache__/skill_quality_extraction_model.cpython-310.pyc +0 -0
app.py +139 -4
model_date_extraction.py +14 -0
model_section_segmentation.py +243 -0
model_skill_extraction.py +37 -0
model_skill_quality_extraction.py +34 -0
requirements.txt +5 -0

.gitignore ADDED Viewed

	@@ -0,0 +1 @@


1	+ models_lfs

ThesisFinal_(Added_Implementation_of_Quality)-10.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

__pycache__/date_extraction_model.cpython-310.pyc ADDED Viewed

Binary file (666 Bytes). View file

__pycache__/model_date_extraction.cpython-310.pyc ADDED Viewed

Binary file (666 Bytes). View file

__pycache__/model_section_segmentation.cpython-310.pyc ADDED Viewed

Binary file (8.48 kB). View file

__pycache__/model_section_sementation.cpython-310.pyc ADDED Viewed

Binary file (2.56 kB). View file

__pycache__/model_skill_extraction.cpython-310.pyc ADDED Viewed

Binary file (1.58 kB). View file

__pycache__/model_skill_quality_extraction.cpython-310.pyc ADDED Viewed

Binary file (1.39 kB). View file

__pycache__/resume_analysis_model.cpython-310.pyc ADDED Viewed

Binary file (2.25 kB). View file

__pycache__/skill_extraction_model.cpython-310.pyc ADDED Viewed

Binary file (1.58 kB). View file

__pycache__/skill_quality_extraction_model.cpython-310.pyc ADDED Viewed

Binary file (1.39 kB). View file

app.py CHANGED Viewed

@@ -1,8 +1,143 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
-demo.launch()

 import gradio as gr
+from model_skill_extraction import SkillExtractionModel
+from model_section_segmentation import SegmentationModelJobDescription, SegmentationModelResume
+from model_skill_quality_extraction import SkillQualityExtractionModel
+from model_date_extraction import DateExtractionModel
+import json
+from typing import List, Dict
+import time
+# Initialize models
+# Each model is constructed once, at import time, and the instances are module-level globals.
+segmentation_model_resume = SegmentationModelResume()
+segmentation_model_job_description= SegmentationModelJobDescription()
+# skill_model is the only instance the processing functions in app.py call.
+skill_model = SkillExtractionModel()
+# NOTE(review): the quality and date models (like the two segmentation models above)
+# are created but not referenced by the app.py code shown in this commit.
+skill_quality_model = SkillQualityExtractionModel()
+date_model = DateExtractionModel()
+def process_job_description(job_description: str) -> Dict:
+    """Process job description and extract skills"""
+    result = skill_model.process_text(job_description)
+    return result
+def process_resume(resume_text: str, job_skills: List[str]) -> Dict:
+    """Process resume and analyze against job skills"""
+    # result = resume_model.process_resume(resume_text, job_skills)
+    result = skill_model.process_text(resume_text)
+    return result
+def create_html_output(job_result: Dict, resume_results: List[Dict]) -> str:
+    """Create HTML output for the interface"""
+    html = "<div style='font-family: Arial, sans-serif;'>"
+    # Job Description Section
+    html += "<h2>Job Description Analysis</h2>"
+    html += f"<p><strong>Total Skills Found:</strong> {job_result['total_skills']}</p>"
+    html += "<p><strong>Skills:</strong></p>"
+    html += "<div style='background-color: #f0f0f0; padding: 10px; border-radius: 5px;'>"
+    for skill in job_result['skills']:
+        html += f"<span style='background-color: #e0e0e0; padding: 2px 5px; margin: 2px; border-radius: 3px; display: inline-block;'>{skill['text']}</span>"
+    html += "</div>"
+    # Resume Analysis Section
+    html += "<h2>Resume Analysis</h2>"
+    for i, resume_result in enumerate(resume_results, 1):
+        html += f"<div style='margin-bottom: 20px; padding: 10px; border: 1px solid #ddd; border-radius: 5px;'>"
+        html += f"<h3>Resume {i}</h3>"
+        # html += f"<p><strong>Skill Match Quality:</strong> {resume_result['skill_quality']['quality_score']:.2%}</p>"
+        # html += f"<p><strong>Matched Skills:</strong> {resume_result['skill_quality']['matched_skills_count']}/{resume_result['skill_quality']['total_required_skills']}</p>"
+        html += "<p><strong>Matched Skills:</strong></p>"
+        html += "<div style='background-color: #f0f0f0; padding: 10px; border-radius: 5px;'>"
+        # for skill in resume_result['skill_quality']['matched_skills']:
+        #     html += f"<span style='background-color: #e0e0e0; padding: 2px 5px; margin: 2px; border-radius: 3px; display: inline-block;'>{skill}</span>"
+        html += "</div>"
+        html += "<details><summary>View Full Resume</summary>"
+        # html += f"<pre style='white-space: pre-wrap;'>{resume_result['full_text']}</pre>"
+        html += "</details>"
+        html += "</div>"
+    html += "</div>"
+    return html
+def process_inputs(job_description: str, input_type: str, resume_text: str, resume_files: List[str]) -> str:
+    """Main processing function"""
+    # Process job description
+    job_result = process_job_description(job_description)
+    # Process resumes based on input type
+    resume_results = []
+    if input_type == "Paste Text":
+        # Process single resume from text input
+        resume_result = process_resume(resume_text, [skill['text'] for skill in job_result['skills']])
+        resume_results.append(resume_result)
+    else:
+        # Process multiple resumes from file uploads
+        for file_path in resume_files:
+            with open(file_path, 'r', encoding='utf-8') as f:
+                resume_content = f.read()
+            resume_result = process_resume(resume_content, [skill['text'] for skill in job_result['skills']])
+            resume_results.append(resume_result)
+    # Create HTML output
+    return create_html_output(job_result, resume_results)
+# Create Gradio interface
+with gr.Blocks(title="Resume Analysis System") as demo:
+    gr.Markdown("# Beyond Keywords: Job Description and Resume Analyzer")
+    gr.Markdown("Upload a job description and resume(s) to analyze skill matches and quality.")
+    with gr.Row():
+        with gr.Column():
+            job_description = gr.Textbox(
+                label="Job Description",
+                placeholder="Paste the job description here...",
+                lines=13.10
+            )
+        with gr.Column():
+            resume_input = gr.Group()
+            with resume_input:
+                input_type = gr.Radio(
+                    choices=["Paste Text", "Upload File"],
+                    label="Input Method",
+                    value="Paste Text"
+                )
+                resume_text = gr.Textbox(
+                    label="Resume Text",
+                    placeholder="Paste the resume text here...",
+                    lines=8.85,
+                    visible=True
+                )
+                resume_file = gr.Files(
+                    label="Upload Resume(s) (.txt files)",
+                    file_types=[".txt"],
+                    visible=False,
+                    interactive=True,
+                    type="filepath"
+                )
+                def toggle_input(choice):
+                    return {
+                        resume_text: gr.update(visible=choice=="Paste Text"),
+                        resume_file: gr.update(visible=choice=="Upload File")
+                    }
+                input_type.change(
+                    fn=toggle_input,
+                    inputs=input_type,
+                    outputs=[resume_text, resume_file]
+                )
+    submit_btn = gr.Button("Analyze")
+    output = gr.HTML(label="Analysis Results")
+    submit_btn.click(
+        fn=process_inputs,
+        inputs=[job_description, input_type, resume_text, resume_file],
+        outputs=output
+    )
+if __name__ == "__main__":
+    demo.launch()

model_date_extraction.py ADDED Viewed

	@@ -0,0 +1,14 @@

+class DateExtractionModel:
+    def __init__(self):
+        self.model = None
+        self.tokenizer = None
+        self.device = None
+        self.load_model()
+    def load_model(self):
+        """
+        Init
+        """
+        pass

model_section_segmentation.py ADDED Viewed

	@@ -0,0 +1,243 @@

+import torch
+from typing import List, Tuple
+import torch.nn as nn
+from transformers import LongformerModel
+from transformers import LongformerTokenizer
+import torch.nn.functional as F
+class LongformerSentenceClassifier(nn.Module):
+    def __init__(self, model_name="allenai/longformer-base-4096", num_labels=13):
+        """
+        Custom Longformer model for sentence classification.
+        Args:
+            model_name (str): Hugging Face Longformer model.
+            num_labels (int): Number of possible sentence labels.
+        """
+        super(LongformerSentenceClassifier, self).__init__()
+        self.longformer = LongformerModel.from_pretrained(model_name)
+        self.classifier = nn.Linear(self.longformer.config.hidden_size, num_labels)
+    def forward(self, input_ids, attention_mask, global_attention_mask, cls_positions):
+        """
+        Forward pass for sentence classification.
+        Args:
+            input_ids (Tensor): Tokenized input IDs, shape (batch_size, max_length)
+            attention_mask (Tensor): Attention mask, shape (batch_size, max_length)
+            global_attention_mask (Tensor): Global attention mask, shape (batch_size, max_length)
+            cls_positions (List[Tensor]): Indices of `[CLS]` tokens for each batch element.
+        """
+        outputs = self.longformer(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            global_attention_mask=global_attention_mask
+        )
+        last_hidden_state = outputs.last_hidden_state
+        cls_positions = cls_positions.view(input_ids.shape[0], -1)
+        cls_embeddings = last_hidden_state.gather(1, cls_positions.unsqueeze(-1).expand(-1, -1, last_hidden_state.size(-1)))
+        logits = self.classifier(cls_embeddings)
+        return logits
+class SegmentationModelJobDescription:
+    def __init__(self):
+        """Initialize segmentation model for either resume or job description.
+        Args:
+            model_type (str): Either "resume" or "job" to specify type of segmentation
+        """
+        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        # Label mapping as provided
+        self.Job_label_map = {
+            "JT": 0,    # Job Title
+            "JS": 1,    # Job Summary
+            "COT": 2,   # Title of Company Overview Section
+            "COC": 3,   # Content of Company Overview Section
+            "RT": 4,    # Title of Responsibilites Section
+            "RC": 5,    # Content of Responsibilites Section
+            "RQT": 6,   # Title of Required Qualifications Section
+            "RQC": 7,   # Content of Required Qualifications Section
+            "PQT": 8,   # Title of Preferred Qualifications Section
+            "PQC": 9,   # Content of Preferred Qualifications Section
+            "ET": 10,   # Employment Type
+            "SBC": 11,  # Content of Salary and Benefits Section
+            "SBT": 12   # Title of Salary and Benefits Section
+        }
+        self.Job_num_labels = len(self.Job_label_map)
+        self.Job_labels = [
+            {"value": "JT", "label": "Job Title"},
+            {"value": "JS", "label": "Job Summary"},
+            {"value": "COT", "label": "Title of Company Overview Section"},
+            {"value": "COC", "label": "Content of Company Overview Section"},
+            {"value": "RT", "label": "Title of Responsibilites Section"},
+            {"value": "RC", "label": "Content of Responsibilites Section"},
+            {"value": "RQT", "label": "Title of Required Qualifications Section"},
+            {"value": "RQC", "label": "Content of Required Qualifications Section"},
+            {"value": "PQT", "label": "Title of Preferred Qualifications Section"},
+            {"value": "PQC", "label": "Content of Preferred Qualifications Section"},
+            {"value": "ET", "label": "Employment Type"},
+            {"value": "SBC", "label": "Content of Salary and Benefits Section"},
+            {"value": "SBT", "label": "Title of Salary and Benefits Section"},
+        ]
+        # Load tokenizer
+        Job_tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
+        Job_tokenizer.cls_token
+        # Load model architecture
+        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        Job_model = LongformerSentenceClassifier(num_labels=self.Job_num_labels)
+        Job_model.to(device)
+        # Load trained weights
+        Job_model.load_state_dict(torch.load("model_lfs/JobSegmentClassifier3rdEpoch_v2.pth"))
+        # Set model to evaluation mode
+        Job_model.eval()
+    def segment(self, text: str) -> Tuple[List[int], List[str]]:
+        """Segment text into sections based on model type.
+        Args:
+            text (str): Text to segment
+        Returns:
+            Tuple containing:
+                - List of predicted section labels as integers
+                - List of text lines
+        """
+        # Split text into lines and remove empty lines
+        lines = [line for line in text.splitlines() if line.strip()]
+        if self.model_type == "job":
+            # Job description segmentation logic
+            concatenated_text = " ".join(f"[CLS] {sentence}" for sentence in lines)
+            predictions = self._predict_sections(concatenated_text)
+            return predictions, lines
+        else:
+            # Resume segmentation logic would go here
+            return [], lines
+    def _predict_sections(self, text: str) -> List[int]:
+        """Make predictions on the text using appropriate model.
+        Args:
+            text (str): Text to make predictions on
+        Returns:
+            List of predicted section labels as integers
+        """
+        # Model prediction logic would go here
+        return []
+    def predict_job_sections(model, text, tokenizer, device):
+        model.eval()
+        # Tokenize text and get input tensors
+        encoding = tokenizer(
+            text,
+            return_tensors="pt",
+            truncation=True,
+            padding="max_length",
+            max_length=4096
+        )
+        input_ids = encoding["input_ids"].to(device)
+        attention_mask = encoding["attention_mask"].to(device)
+        # Identify `[CLS]` positions (assuming each sentence starts with `[CLS]`)
+        cls_positions = (input_ids == tokenizer.cls_token_id).nonzero(as_tuple=True)[1]
+        cls_positions = cls_positions.unsqueeze(0).to(device)  # Shape: (1, num_sentences)
+        # Create global attention mask (Longformer requires at least 1 global attention token)
+        global_attention_mask = torch.zeros_like(input_ids)
+        global_attention_mask[:, cls_positions] = 1  # Assign global attention to `[CLS]` tokens
+        # Run the model
+        with torch.no_grad():
+            logits = model(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                global_attention_mask=global_attention_mask,
+                cls_positions=cls_positions
+            )  # Shape: (1, num_sentences, num_labels)
+        logits = logits.squeeze(0)  # Shape: (num_sentences, num_labels)
+        probs = F.softmax(logits, dim=-1)  # Convert logits to probabilities
+        predictions = torch.argmax(probs, dim=-1)  # Get predicted label indices
+        return predictions.cpu().numpy()  # Convert to NumPy array for easy use
+    def extract_job_sections(self, text):
+        lines = text.splitlines()
+        lines = [line for line in text.splitlines() if line.strip()]
+        text = lines
+        concatenated_text = " ".join(f"{self.Job_tokenizer.cls_token} {sentence}" for sentence in text)
+        predictions = self.predict_job_sections(self.Job_model, concatenated_text, self.Job_tokenizer, self.device)
+        return predictions, text
+    def extract_job_requirements(self, text):
+        lines = text.splitlines()
+        lines = [line for line in text.splitlines() if line.strip()]
+        text = lines
+        concatenated_text = " ".join(f"{self.Job_tokenizer.cls_token} {sentence}" for sentence in text)
+        predictions = self.predict_job_sections(self.Job_model, concatenated_text, self.Job_tokenizer, self.device)
+        requirements = []
+        i = 0
+        for item in predictions[:len(predictions) - 1]:
+            if self.Job_labels[item]['value'] == "RQC":
+                requirements.append(lines[i])
+                i += 1
+        return requirements
+class SegmentationModelResume:
+    def __init__(self):
+        pass
+    def segment(self, text: str) -> Tuple[List[int], List[str]]:
+        pass
+if __name__ == "__main__":
+    # Example usage
+    job_segmenter = SegmentationModelJobDescription()
+    resume_segmenter = SegmentationModelResume()
+    # Example job text
+    sample_job_text = """
+    Senior Software Engineer
+    We are looking for an experienced developer to join our team.
+    Requirements:
+    - 5+ years Python experience
+    - Strong knowledge of ML/AI
+    - Excellent communication skills
+    Benefits:
+    - Competitive salary
+    - Remote work options
+    - Health insurance
+    """
+    # Test job section extraction
+    job_sections, job_text = job_segmenter.extract_job_sections(sample_job_text)
+    print("\nJob Sections:")
+    for section, text in zip(job_sections, job_text):
+        print(f"{job_segmenter.Job_labels[section]['value']}: {text}")
+    # Test requirements extraction
+    requirements = job_segmenter.extract_job_requirements(sample_job_text)
+    print("\nJob Requirements:")
+    for req in requirements:
+        print(f"- {req}")

model_skill_extraction.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import spacy
+from typing import List, Dict
+import time
+class SkillExtractionModel:
+    def __init__(self):
+        self.nlp = spacy.load("en_core_web_sm")
+        # Add custom skill patterns
+        self.skill_patterns = [
+            {"label": "SKILL", "pattern": [{"LOWER": {"IN": ["python", "java", "javascript", "sql", "aws", "docker", "kubernetes", "git", "agile", "scrum", "jira", "confluence"]}}]},
+            {"label": "SKILL", "pattern": [{"LOWER": {"IN": ["machine learning", "deep learning", "data analysis", "data science", "project management", "leadership"]}}]},
+        ]
+        self.ruler = self.nlp.add_pipe("entity_ruler")
+        self.ruler.add_patterns(self.skill_patterns)
+    def extract_skills(self, text: str) -> List[Dict]:
+        doc = self.nlp(text)
+        skills = []
+        for ent in doc.ents:
+            if ent.label_ == "SKILL":
+                skills.append({
+                    "text": ent.text,
+                    "start": ent.start_char,
+                    "end": ent.end_char,
+                    "label": ent.label_
+                })
+        return skills
+    def process_text(self, text: str) -> Dict:
+        # Simulate model loading time
+        time.sleep(2)
+        skills = self.extract_skills(text)
+        return {
+            "text": text,
+            "skills": skills,
+            "total_skills": len(skills)
+        }

model_skill_quality_extraction.py ADDED Viewed

	@@ -0,0 +1,34 @@

+class SkillQualityExtractionModel:
+    def __init__(self):
+        self.model = None
+        self.tokenizer = None
+        self.device = None
+        self.load_model()
+    def load_model(self):
+        """Initialize the model and tokenizer"""
+        model_name = "roberta-large-mnli"
+        leadership_model_path = "models_lfs/pet-leadership-model-roberta-large-mnli_bs4_gas4_lr3e-05_ep2" + "/checkpoint-742"
+        collab_model_path = "models_lfs/pet-collaboration-model-roberta-large-mnli_bs4_gas4_lr1e-05_ep3" + "/checkpoint-936"
+        leadership_pattern = "Sentence: {} Question: Does this show leadership? Answer: <mask>"
+        collab_pattern = "Sentence: {} Question: Does this show teamwork and collaboration? Answer: <mask>"
+        pass
+    def process_resume(self, resume_text: str, required_skills: list) -> dict:
+        """Process resume and calculate skill match quality"""
+        # Placeholder implementation
+        matched_skills = [skill for skill in required_skills if skill.lower() in resume_text.lower()]
+        quality_score = len(matched_skills) / len(required_skills) if required_skills else 0
+        return {
+            'skill_quality': {
+                'quality_score': quality_score,
+                'matched_skills': matched_skills,
+                'matched_skills_count': len(matched_skills),
+                'total_required_skills': len(required_skills)
+            },
+            'full_text': resume_text
+        }

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+gradio>=4.0.0
+spacy>=3.7.0
+en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.0/en_core_web_sm-3.7.0-py3-none-any.whl
+# torch and transformers are imported by model_section_segmentation.py
+torch
+transformers
+typing-extensions>=4.5.0
+python-dateutil>=2.8.2