File size: 5,085 Bytes
6db7601
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
import os
import re
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from src.groq_client import analyze_resume

# Load spaCy model at import time so every call to extract_keywords can
# reuse the same pipeline instead of re-loading it per request.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # If model is not installed, provide instructions
    print("The spaCy model 'en_core_web_sm' is not installed.")
    print("Please install it using: python3 -m spacy download en_core_web_sm")
    # Create a simple placeholder model for basic functionality
    # NOTE(review): a blank pipeline has no tagger, so token.pos_ will be
    # empty and extract_keywords' NOUN/PROPN filter matches nothing —
    # keyword extraction then relies solely on CountVectorizer.
    nlp = spacy.blank("en")

def preprocess_text(text):
    """Normalize raw resume text for downstream keyword extraction.

    Replaces punctuation/special characters with spaces, collapses runs
    of whitespace into single spaces, trims the ends, and lowercases.

    Args:
        text (str): Raw text extracted from resume

    Returns:
        str: Preprocessed text
    """
    # Strip punctuation first, then collapse the resulting whitespace runs.
    without_punct = re.sub(r'[^\w\s]', ' ', text)
    normalized = re.sub(r'\s+', ' ', without_punct).strip()
    return normalized.lower()

def extract_keywords(text, job_role):
    """Extract candidate keywords from resume text.

    Combines spaCy part-of-speech filtering (nouns and proper nouns) with
    the most frequent uni-/bi-grams found by CountVectorizer.

    Args:
        text (str): Preprocessed resume text
        job_role (str): Target job role (currently unused; reserved for
            role-specific keyword weighting — TODO confirm intent)

    Returns:
        list: Deduplicated keywords (order not guaranteed, since they are
            passed through a set)
    """
    # Process the text with spaCy
    doc = nlp(text)

    # Extract nouns and proper nouns longer than 2 characters as skill candidates
    keywords = [token.text for token in doc
                if token.pos_ in ("NOUN", "PROPN") and len(token.text) > 2]

    # Use CountVectorizer to get the most common terms. Guard against the
    # ValueError it raises on an empty vocabulary (empty text, or text made
    # up entirely of English stop words) so a sparse resume doesn't crash
    # the whole analysis.
    try:
        vectorizer = CountVectorizer(max_features=50, stop_words='english',
                                     ngram_range=(1, 2))
        vectorizer.fit_transform([text])
        common_terms = list(vectorizer.get_feature_names_out())
    except ValueError:
        common_terms = []

    # Combine both sources and remove duplicates
    return list(set(keywords + common_terms))

def analyze_resume_local(resume_text, job_role):
    """Run lightweight local checks on a resume before calling the Groq API.

    Args:
        resume_text (str): Raw text extracted from resume
        job_role (str): Target job role

    Returns:
        dict: Local analysis results with keys ``local_keywords``,
            ``local_format_score`` and ``local_readability_score``
    """
    # Keywords come from the normalized text; the format and readability
    # heuristics need the raw text (line breaks, bullets, punctuation).
    cleaned = preprocess_text(resume_text)
    return {
        "local_keywords": extract_keywords(cleaned, job_role),
        "local_format_score": calculate_format_score(resume_text),
        "local_readability_score": calculate_readability_score(resume_text),
    }

def calculate_format_score(text):
    """Calculate a basic format score for the resume.

    Heuristic: start from a base of 70, add up to 20 points for
    recognizable section headers and up to 10 for bullet-point usage,
    capping the total at 100.

    Args:
        text (str): Resume text

    Returns:
        int: Format score (0-100)
    """
    score = 70  # Base score

    # Check for standard resume section headers (whole-word match)
    section_patterns = ["experience", "education", "skills", "projects",
                        "certifications", "summary"]
    lowered = text.lower()
    found_sections = sum(
        1 for pattern in section_patterns
        if re.search(r'\b' + pattern + r'\b', lowered)
    )

    # 5 points per section found, capped at 20
    score += min(found_sections * 5, 20)

    # Count bullet markers. '•' and '·' are unambiguous; a '-' only counts
    # when it starts a line, so hyphens inside words or date ranges
    # (e.g. "2020-2023", "e-mail") are not mistaken for bullets.
    bullet_count = text.count('•') + text.count('·')
    bullet_count += len(re.findall(r'^\s*-', text, flags=re.MULTILINE))
    score += min(bullet_count, 10)  # Up to +10 for bullet usage

    return min(score, 100)  # Cap at 100

def calculate_readability_score(text):
    """Calculate a basic readability score for the resume.

    Starts from a base of 70 and adjusts for average sentence length:
    very long sentences are penalized, concise ones rewarded.

    Args:
        text (str): Resume text

    Returns:
        int: Readability score (0-100)
    """
    score = 70  # Base score

    # Sentences are non-empty chunks between terminal punctuation marks
    sentences = [chunk.strip() for chunk in re.split(r'[.!?]+', text)
                 if chunk.strip()]

    if sentences:
        total_words = sum(len(sentence.split()) for sentence in sentences)
        avg_sentence_length = total_words / len(sentences)

        if avg_sentence_length > 25:
            score -= 10  # Penalize run-on sentences
        elif avg_sentence_length < 10:
            score += 5  # Reward concise writing

    return min(max(score, 0), 100)  # Clamp to the 0-100 range

def get_resume_analysis(resume_text, job_role, job_description=None):
    """Main entry point: analyze a resume locally and via the Groq API.

    Args:
        resume_text (str): Text extracted from resume
        job_role (str): Target job role
        job_description (str, optional): Specific job description for
            enhanced analysis

    Returns:
        dict: Groq analysis results augmented with the locally extracted
            keywords under the ``local_keywords`` key
    """
    # Local heuristics first, then the remote Groq analysis
    local_results = analyze_resume_local(resume_text, job_role)
    groq_results = analyze_resume(resume_text, job_role, job_description)

    # Groq output forms the base; only the local keywords are merged in
    # (the local format/readability scores are intentionally dropped here)
    return {**groq_results, "local_keywords": local_results["local_keywords"]}