Imarticuslearning commited on
Commit
227f173
·
verified ·
1 Parent(s): 57ba822

Upload 6 files

Browse files
Files changed (6) hide show
  1. src/config.py +223 -0
  2. src/logging_config.py +42 -0
  3. src/main.py +61 -0
  4. src/resume_parser.py +1274 -0
  5. src/reviewer.py +33 -0
  6. src/submitter.py +32 -0
src/config.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

# All paths are resolved against the process working directory.
# NOTE(review): os.getcwd() depends on where the app is launched from —
# presumably the src/ directory; confirm against the deployment setup.
BASE_DIR = os.getcwd()
UPLOAD_FOLDER = os.path.join(BASE_DIR, '..', 'data', 'uploads')
ALLOWED_EXTENSIONS = {'pdf'}

# Domain patterns matched against hyperlink URIs extracted from resumes.
# NOTE(review): each pattern's trailing [^\s<>"] matches exactly ONE
# character after the domain (there is no '+'), so a match covers the domain
# plus the first path character only.  resume_parser's GitHub extractor
# relies on this exact shape when it re.sub()s the match away — do not widen
# these patterns without updating that caller.
linkedin_domain = (r'https?://(www\.)?linkedin\.com/[^\s<>"]')
github_domain = (r'https?://(www\.)?github\.com/[^\s<>"]')
kaggle_domain = (r'https?://(www\.)?kaggle\.com/[^\s<>"]')
medium_domain = (r'https?://(www\.)?medium\.com/[^\s<>"]')
hackerrank_domain = (r'https?://(www\.)?hackerrank\.com/[^\s<>"]')
leetcode_domain = (r'https?://(www\.)?leetcode\.com/[^\s<>"]')
15
# Section headings every resume is expected to contain (matched case-insensitively,
# but the parser also flags headings that are not fully capitalized).
required_sections = ['PROFILE SUMMARY','ACADEMIC PROFILE','TECHNICAL SKILLS','CERTIFICATIONS','PROJECTS','CAREER OBJECTIVE']

# Keys of the parsed-resume dict that must be non-empty (see check_missing_sections).
basic_informations = ["name", "contact_number", "email", "linkedin_urls", "github_urls"]

# Broad vocabulary of data-science-adjacent terms scanned for in resume text.
data_science_skills = ['queries', 'beautifulsoup', 'ms excel', 'mathematics', 'selenium',
                       'html', 'analytical skills', 'statsmodels','ai', 'improvement',
                       'analyze', 'metrics', 'forecasting', 'analytics', 'analytical',
                       'mysql', 'postgresql', 'database', 'writing', 'excel','regulations',
                       'algorithms', 'scipy', 'opencv', 'reports', 'eda', 'jupyter',
                       'presentations', 'modeling', 'audit', 'technical skills',
                       'schedule', 'nltk', 'iso', 'xgboost', 'segmentation', 'github',
                       'seaborn', 'keras', 'distribution', 'investigation', 'tableau',
                       'probability', 'analysis', 'r', 'technical', 'programming',
                       'web scraping', 'research', 'pandas', 'statistical analysis',
                       'numpy', 'predictive analysis', 'tensorflow', 'hypothesis',
                       'matplotlib', 'scikit-learn', 'information technology',
                       'machine learning', 'cloud', 'streamlit', 'mining', 'python',
                       'data analytics', 'deep learning', 'testing', 'training',
                       'clustering & classification', 'data analysis', 'engineering',
                       'data visualization', 'quantitative analysis', 'statistics',
                       'flask', 'statistical modeling', 'pytorch', 'data mining',
                       'aws', 'sql']

# Core skills checked verbatim (word-boundary, case-insensitive) by the parser.
essential_skills = ["Python", "SQL", "MySQL", "Tableau", "NumPy",
                    "Statsmodels", "CNN", "ANN",
                    "RNN", "Machine Learning", "Deep Learning", "SciKit Learn", "MS Excel",
                    "Data Visualization", "Power BI", "Data Analysis"]

# Verdict label -> numeric quality score used elsewhere in scoring.
# NOTE(review): 'The resume is bad' maps to 1.1, which is above the apparent
# 0..1 scale of the other entries — TODO confirm this is intentional.
quality_mapping = {
    'Resume needs significant improvement': 0.15,
    'Resume needs improvement': 0.35,
    'Resume is average': 0.55,
    'Resume is good': 0.75,
    'Resume is very good': 0.90,
    'Resume is excellent': 1,
    'The resume is bad': 1.1
}
53
# Canonical skill -> list of spelling/phrasing variations searched for in
# resume text (substring, case-insensitive; see extract_keyword_variations_*).
# NOTE(review): entries like " R " / ",CNN" embed spaces/commas on purpose, to
# avoid matching inside other words — they depend on the raw-text formatting.
keyword_variations = {
    "Python": ["Python", "Python_Language", "Python Programming"],
    "SQL": ["SQL", "SQL_Language", "Structured Query Language", "Structured_Query_Language"],
    "MySQL": ["MySQL", "MySQL_Database", "My_SQL", "My SQL"],
    "Pandas": ["Pandas", "Pandas_Library", "Pandas Data Analysis Library","Pandas_Data Analysis_Library"],
    "R": [" R ", "R_Programming", "R Language",",R "," R,", ",R,"],
    "Matplotlib": ["Matplotlib", "Matplotlib_Library", "Matplotlib Plotting Library","Matplotlib_Plotting_Library"],
    "Seaborn": ["Seaborn", "Seaborn_Library", "Seaborn Data Visualization Library"],
    "StatsModel": ["StatsModel", "StatsModel_Library", "StatsModel Statistical Library", "Statistical Modeling Library", "Statistics Modeling", "StatModelLib", "StatsMod", "SM Library", "SM"],
    "Tableau": ["Tableau", "Tableau_Software", "Tableau Data Visualization", "Tableau Analytics", "Tableau BI Tool", "Tableau Visualization Software", "Tableau Data Analysis", "Tableau BI","TableauBI"],
    "TensorFlow": ["TensorFlow", "TensorFlow_Library"],
    "NumPy": ["NumPy", "NumPy_Library", "Numerical Computing Library"],
    "PyTorch": ["PyTorch", "PyTorch_Library"],
    "Keras": ["Keras", "Keras_Library"],
    "Plotly": ["Plotly", "Plotly_Library",],
    "RFM": ["RFM", "RFM_Analysis", "Recency Frequency Monetary Analysis"],
    "ANOVA": ["ANOVA", "ANOVA_Test", "Analysis of Variance","Analysis_of_Variance"],
    "BeautifulSoup": ["BeautifulSoup", "BeautifulSoup_Library"],
    "Imputation": ["Imputation", "Data_Imputation","Data Imputation", "Missing Data Imputation"],
    "Scrappy": ["Scrappy", "Scrappy_Library"],
    "Selenium": ["Selenium", "Selenium_Library", "Selenium WebDriver", "Selenium Automation"],
    "TensorBoard": ["TensorBoard", "TensorBoard_Library", "TensorBoard Visualization Tool"],
    "SciPy": ["SciPy", "SciPy_Library", "Scientific Computing Library"],
    "OpenCV": ["OpenCV", "OpenCV_Library", "Computer Vision Library"],
    "NLTK": ["NLTK", "NLTK_Library", "Natural Language Toolkit"],
    "Hadoop": ["Hadoop", "Hadoop_Framework"],
    "Spark": ["Spark", "Spark_Framework", "Apache_Spark"],
    "spacy": ["spacy","Spacy_Library"],
    "AdaBoost": ["AdaBoost","Ada_Boost","Ada Boost", "AdaBoost_Algorithm", "Adaptive Boosting","Adaptive_Boosting"],
    "XGBoost": ["XGBoost","XG_Boost","XG Boost", "XGBoost_Algorithm", "Extreme Gradient Boosting"],
    "CNN": [" CNN ", "CNN,", ",CNN", "Convolutional Neural Network", "ConvNet", "CNN Algorithm","CNN"],
    "ANN": [" ANN ", "ANN,", ",ANN", "Artificial Neural Network", "ANN Algorithm","ANN"],
    "RNN": [" RNN ", "RNN,", ",RNN", "Recurrent Neural Network", "RNN Algorithm","RNN"],
    "KNN": [" kNN ", "kNN,", ",kNN","K-Nearest Neighbours", "K_Nearest_Neighbours", "K-Nearest-Neighbours", "K Nearest Neighbours", "KNN"],
    "LSTM": ["LSTM", "Long Short-Term Memory", "LSTM Network", "LSTM Algorithm"],
    "GAN": [" GAN ", "GAN,", ",GAN", "Generative Adversarial Network", "GAN Algorithm"," GAN "],
    "YOLO": ["YOLO", "You Only Look Once", "YOLO_Algorithm"],
    "Clustering": ["Clustering", "Clustering_Algorithms", "Data_Clustering"],
    "Classification": ["Classification", "Classification_Algorithms", "Data_Classification"],
    "Word2Vec": ["Word2Vec", "Word2Vec_Algorithm", "Word2Vec Word Embeddings","word2vector"],
    "Tf-idf": ["Tf-idf","Tf_idf","Tf idf", "Term Frequency-Inverse Document Frequency", "Tf_idf_Algorithm","Tf-idf_Algorithm"],
    "Tokenization": ["Tokenization", "Text_Tokenization", "Word_Tokenization"],
    "Machine Learning": ["Machine Learning", "Machine_Learning", "Machine Learning Algorithms", "Machine_Learning_Algorithms", "ML"],
    "Deep Learning": ["Deep Learning", "Deep_Learning", "Deep Learning Algorithms", "Deep_Learning_Algorithms", "DL"],
    "SciKit Learn": ["SciKit Learn", "SciKit_Learn", "Sci Kit Learn", "SciKit-Learn","Sci_Kit_Learn", "sklearn","sk_learn"],
    "Hugging Face": ["Hugging Face", "Hugging_Face", "HuggingFace"],
    "MS Excel": ["Excel", "MS Excel","MSExcel", "MS_Excel", "Microsoft_Excel", "Microsoft Excel", "advance_excel","advance_MS_excel","advance_MSexcel", "advance excel", "Advance_Microsoft_excel", "Advance Microsoft excel"],
    "Data Visualization": ["Data Visualization", "Data_Visualization", "Data_Viz", "Visualization"],
    "Power BI": ["Power BI", "Power_BI", "Microsoft_Power_BI", "Microsoft Power BI","PowerBI"],
    "Transfer Learning": ["Transfer Learning", "Transfer_Learning"],
    "Linear Regression": ["Linear Regression", "Linear_Regression"],
    "Logistic Regression": ["Logistic Regression", "Logistic_Regression"],
    "Decision Tree": ["Decision Tree", "Decision_Tree"],
    "Random Forest": ["Random Forest", "Random_Forest"],
    "K-Means Clustering": ["K-Means Clustering", "K_Means_Clustering", "K-Means-Clustering", "K Means Clustering", "K-means", "k_means","K-mean", "k_mean"],
    "T-test": ["T-test", "T_Test", "T Test"],
    "Z-test": ["Z-test", "Z_Test", "Z Test"],
    "Hypothesis Testing": ["Hypothesis Testing", "Hypothesis_Testing"],
    "Chi-square": ["Chi-square", "Chi_Square", "Chi2"],
    "Normal Distribution": ["Normal Distribution", "Normal_Distribution"],
    "Correlation Analysis": ["Correlation Analysis", "Correlation_Analysis"],
    "Feature Scaling": ["Feature Scaling", "Feature_Scaling"],
    "Dimensionality Reduction": ["Dimensionality Reduction", "Dimensionality_Reduction"],
    "Jupyter Notebook": ["Jupyter Notebook", "Jupyter_Notebook"],
    "Google Colab": ["Google Colab", "Google_Colab"],
    "Data Analysis": ["Data Analysis", "Data_Analysis"],
    "Big Data": ["Big Data", "Big_Data"],
    "Support Vector Machines (SVM)": ["Support Vector Machines (SVM)", "Support_Vector_Machines", "SVM", "Support Vector Machines", "Support_Vector_Machines_SVM"],
    "Natural Language Processing": ["Natural Language Processing", "Natural_Language_Processing", "NLP"],
    "Artificial Intelligence": ["Artificial Intelligence", "Artificial_Intelligence"," AI ",",AI "," AI,","AI"],
    "Naive Bayes": ["Naive Bayes", "Naive_Bayes"],
    "Principal Component Analysis (PCA)": ["Principal Component Analysis (PCA)", "Principal_Component_Analysis", "Principal Component Analysis", "PCA"],
    "Descriptive Statistics": ["Descriptive Statistics", "Descriptive_Statistics"],
    "Inferential Statistics": ["Inferential Statistics", "Inferential_Statistics"],
    "Gradient Boosting Machines (GBM)": ["Gradient Boosting Machines (GBM)", "Gradient_Boosting_Machines", "Gradient Boosting Machines", "GBM","Gradient Boosting","Gradient_Boosting"],
    "Association Rule Learning (Apriori)": ["Association Rule Learning (Apriori)", "Association_Rule_Learning", "Association Rule Learning", "Apriori"],
    "Hierarchical Clustering": ["Hierarchical Clustering", "Hierarchical_Clustering"],
    "Image Segmentation": ["Image Segmentation", "Image_Segmentation"],
    "Object Detection": ["Object Detection", "Object_Detection"],
    "Encoder Decoder": ["Encoder - Decoder", "Encoder_Decoder","Encoder Decoder","Encoder Decode",
                        "Sequence-to-Sequence Models", "Seq2Seq Models", "Language Encoding", "Language Decoding", "Text Encoding", "Text Decoding",
                        "Image Encoding", "Image Decoding", "Audio Encoding", "Audio Decoding", "Video Encoding", "Video Decoding", "Speech Encoding", "Speech Decoding", "Data Compression",
                        "Data Encryption", "Data Decryption","Encoder","Decoder"],
    "Word Embedding": ["Word Embedding", "Word_Embedding"],
    "Bag of Words": ["Bag of Words", "Bag_of_Words"],
    "Sentiment Analysis": ["Sentiment Analysis", "Sentiment_Analysis"],
    "Predictive Analysis": ["Predictive Analysis", "Predictive_Analysis"],
    "Statistical Modeling": ["Statistical Modeling", "Statistical_Modeling","Statistical_Analysis","Statistical Analysis"],
    "Data Preprocessing": ["Data Preprocessing", "Data_Preprocessing"],
    "Model Development": ["Model Development", "Model_Development"],
    "Time Series Analysis": ["Time Series Analysis", "Time_Series_Analysis","TimeSeries","TimeSeries_Analysis"],
    "Statistics Fundamentals": ["Statistics Fundamentals", "Statistics_Fundamentals"],
    "Advanced ML": ["Advanced ML", "Advanced_ML", "Advanced Machine Learning", "Advanced_Machine_Learning", "Advanced-ML"],
    "Advanced DL": ["Advanced DL", "Advanced_DL", "Advanced Deep Learning", "Advanced_Deep_Learning", "Advanced-DL"],
    "EDA": ["EDA","Exploratory_Data_Analysis","Exploratory Data Analysis"],
    "Data Mining":["Data Mining","Data_Mining"],
    "Outlier Detection": ["Outlier_Detection","Outlier Detection"],
    "Missing Values Handling": ["Missing Values Handling","Missing_Values_Handling","Missing Values"],
    "Scaling Techniques": ["Scaling Techniques","Feature Scaling","Feature_Scaling","Data Scaling","Data_Scaling","Data Normalization","Data_Normalization","Standardization","Min-Max Scaling","Min-Max_Scaling","Normalization"],
    "R2 and Adjusted R2": ["R2 Score","R2_Score","Adjusted_R2_Score","Adjusted R2 Score","R Squared Score","R_Squared_Score","R2 Accuracy","R2_Accuracy","Adjusted R2 Accuracy","R2 Metric","Adjusted R2 Metric"],
    "Accuracy, Recall, F1 Score": ["Accuracy","Classification_Accuracy","Accuracy_Metrics","Recall","Precision","Recall_Score","F1 Score","F1-Score","F1_Metric","F1_Score","Classification_F1-Score"],
    "MS Office": ["MS_Office","MS Office","Microsoft Office","MS Word","MS_Word","Microsoft_Office","Microsoft_Word","Microsoft Word"],
    "Subquery": ["Subquery","Sub-query","Nested Query","Inner Query"],
    "SQL Join": ["SQL Join","Join in SQL","Join"],
    "Stemming": ["Stemming","Stemming Algorithm","Word Stemming","Stemming Techniques","Stemming in NLP","Text_Stemming","Text Stemming"],
    "Stopwords": ["Stopwords","Stop Words","Common Words","Text Stopwords","Stopwords Removal","Removing Stopwords","Stopwords List","Stopwords in NLP"],
    "docker_variations" : ["Docker Integration","Docker", "Docker Automation", "Advanced Docker","Advanced_Docker", "Docker Tools"],
    "jenkins_variations" : ["Jenkins","Jenkins CI/CD","CI/CD", "Jenkins Automation", "Jenkins Pipeline", "Jenkins Plugins"],
    "prometheus_variations" : ["Prometheus Monitoring", "Prometheus Metrics", "PromQL", "Prometheus Alerting"],
    "cicd_variations" : ["Continuous Integration", "Continuous Deployment", "CI/CD Automation", "CI/CD Tools"],
    "flask_variations" : ["Flask","Flask Framework", "Flask RESTful", "Flask Deployment", "Flask Security"],
    "fastapi_variations" : ["FastAPI","FastAPI Framework", "FastAPI RESTful","FastAPI_RESTful", "FastAPI Deployment", "FastAPI Tools","FastAPI_Tools"],
    "django_variations" : ["Django Framework", "Django Web Development", "Django REST Framework", "Django Deployment","Django"],
    "aws_variations" : ["Amazon Web Services", "AWS Cloud", "AWS Services", "AWS Management","AWS","AWS_Cloud"],
    "statistics_variations" : ["Statistical Analysis", "Descriptive Statistics", "Inferential Statistics", "Probability Theory"],
    "hypothesis_testing_variations" : ["Null Hypothesis", "Alternative Hypothesis", "Significance Level", "Type I Error"],
    "smote_variations" : ["Synthetic Minority Over-sampling Technique", "SMOTE Algorithm", "SMOTE Python", "SMOTE Applications","SMOTE"],
    "mlflow_variations" : ["MLflow Framework", "MLflow Tracking", "MLflow Deployment", "MLflow Integration","MLflow"],
    "packaging_variations" : ["Software Packaging", "Package Management", "Python Packaging", "Packaging Best Practices"],
    "version_control_variations" : ["Git Version Control", "Git Commands", "Git Workflow", "Git Collaboration","Git"],
    "communication skills" : ["communication_skills" , "communication_skill" ,"communication skills"],
    "problem-solving": ["problem-solving","problem_solving", "problem-solving"],
    "decision making" : ["decision making" , "decision-making","decision_making"]

}
179
# Sections whose free text is pulled out for deeper analysis.
Extract_sections = ["CAREER OBJECTIVE", "PROFILE SUMMARY"]

# All heading strings recognized when segmenting a resume into sections.
section_headers = [
    "CAREER OBJECTIVE", "PROFILE SUMMARY", "WORK EXPERIENCE", "EDUCATION","ADDITIONAL INFORMATION AND HOBBIES",
    "ACADEMIC PROFILE", "PROJECTS", "CERTIFICATIONS","SKILLS",
    "PERSONAL SKILLS", "PERSONAL INFORMATION", "REFERENCES",
    "EXTRACURRICULAR ACTIVITIES", "TECHNICAL SKILLS", "KEY SKILLS",
    "ADDITIONAL INFORMATION", "CERTIFICATIONS & ACADEMIC ENDEAVOURS",
    "AWARDS & ACCOLADES", "SOFTWARE SKILLS", "AWARDS"
]

# Over-used beginner project titles; resumes listing these get flagged.
common_projects = ["Titanic","Iris","MNIST", "COVID-19", "Bank Churn",
                   "Spam","Handwritten Digit","Heart Disease","House Price",
                   "Diabetes","Twitter", "Churn",
                   "Wine Quality", "Loan","Titanic Survival Prediction",
                   "Iris Flower Classification",
                   "House Price Prediction",
                   "MNIST Handwritten Digit Recognition",
                   "Customer Churn Prediction",
                   "Sentiment Analysis of Movie Reviews",
                   "Spam Email Detection",
                   "Fake News Detection",
                   "Image Classification with CNNs",
                   "Stock Price Prediction"]

# Alternatives recommended when only common projects are found.
suggested_projects = ["Predicting Patient Readmissions in Hospitals",
                      "Optimizing Ad Spend with Machine Learning Models","Developing a Fake News Detection System",
                      "Developing an AI Chatbot for Customer Service Automation","Personalized Health Recommendations Using Wearable Data"]

# Specify rule IDs and error keywords to ignore (LanguageTool grammar check).
ignore_rule_ids = ['WHITESPACE_RULE']
ignore_error_keywords = ['repeated a whitespace']

# Blogs & Articles surfaced as improvement resources.
blog_articles = ["https://www.dataquest.io/blog/how-data-science-resume-cv/",
                 "https://medium.com/data-science-at-microsoft/writing-a-resume-for-a-data-science-role-345b98bdf80b",
                 "https://medium.com/@alicechen.ai/resume-201-how-to-write-an-effective-data-science-resume-441cbe6c0932"
                 ]

# Links to video resources shown alongside the review.
youtube_links = ["https://youtu.be/Tt08KmFfIYQ?si=EdebdWUfbttysrfL",
                 "https://youtu.be/R3abknwWX7k?si=m4EyviXgKDoPgIGr",
                 "https://youtu.be/1-z9ptlBar4?si=lA7WgU4j4MFGjBZV",
                 "https://youtu.be/pjqi_M3SPwY?si=5aRizcfpreKR9xUr",
                 "https://youtu.be/ROfceyeD7f4?si=OTbrL7BUKSW1u2mt"]
src/logging_config.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # src/logging_config.py
2
+ import logging
3
+ from logging.handlers import TimedRotatingFileHandler
4
+ import os
5
+ from datetime import datetime
6
+
7
def setup_logging():
    """Configure the root logger with a console handler and a daily-rotating file handler.

    Log files are written to ``<project root>/logs/<YYYY-MM-DD>.log`` and rotate
    at midnight.  Safe to call more than once: any pre-existing handlers are
    removed before the two handlers are attached, so repeated calls never
    duplicate log output.
    """
    # Create the logs directory if it doesn't exist (exist_ok avoids the
    # check-then-create race of the original os.path.exists / os.makedirs pair).
    log_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'logs')
    os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger()
    logger.setLevel(logging.DEBUG)

    # Console handler for interactive output.
    console_handler = logging.StreamHandler()
    console_handler.setLevel(logging.DEBUG)

    # Timed rotating file handler: one file per day, rotated at midnight.
    today_date = datetime.now().strftime('%Y-%m-%d')
    log_file = os.path.join(log_dir, f'{today_date}.log')
    file_handler = TimedRotatingFileHandler(log_file, when='midnight', interval=1)
    file_handler.setLevel(logging.DEBUG)
    file_handler.suffix = '%Y-%m-%d'  # Date format for rotated log file names

    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    # Original code added both handlers, then cleared ALL handlers, then added
    # them again — the first addHandler pair was dead code.  Clear first, add once.
    logger.handlers.clear()
    logger.addHandler(console_handler)
    logger.addHandler(file_handler)
src/main.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
from flask import Flask, jsonify, request, flash, redirect, url_for, render_template
from submitter import ResumeSubmitter
from reviewer import ResumeReviewer
from resume_parser import ResumeParser
from logging_config import setup_logging
import json

# Templates live one level above src/.
app = Flask(__name__, template_folder=os.path.join(os.path.dirname(__file__), '..', 'templates'))
# Security fix: read the session signing key from the environment instead of
# hardcoding it; the original literal is kept only as a development fallback.
app.secret_key = os.environ.get('SECRET_KEY', 'supersecretkey')
setup_logging()
12
+
13
@app.route('/v1/resumes/', methods=['POST', 'GET'])
def submit_resume():
    """Serve the upload form on GET; accept and store a resume on POST.

    On a successful upload, redirect to the review endpoint with the stored
    file's path; otherwise return the submitter's error text with a 400.
    """
    if request.method != 'POST':
        return ResumeSubmitter().upload_form()

    result = ResumeSubmitter().upload_file()
    if not os.path.exists(result):
        # upload_file returned an error description rather than a saved path
        return jsonify(message=f"failed to submit resume, {result}"), 400

    try:
        # Hand the stored file off to the /v1/reviews/ endpoint
        return redirect(url_for('get_reviews', path=result))
    except Exception as e:
        app.logger.error("Failed to redirect to /v1/reviews/: %s", str(e))
        return jsonify(message="failed to redirect to reviews page"), 500
29
+
30
@app.route("/v1/reviews/<path:path>", methods=['POST', 'GET'])
def get_reviews(path):
    """Parse the resume at *path* and render the review page.

    The parser returns a Flask response whose body is expected to be JSON;
    it is decoded, snapshotted to parsed_resume.json, and fed to the template.
    """
    app.logger.debug("Inside get_reviews")
    parser = ResumeParser()
    reviewer = ResumeReviewer()  # constructed as in the original flow; not used below
    raw_response = parser.parse_text(path)

    # The parser's response body must be valid JSON
    try:
        parsed_resume = json.loads(raw_response.data)
    except json.JSONDecodeError:
        app.logger.error("Failed to decode JSON from the response")
        return "Invalid JSON response from parser", 500

    # Keep a local snapshot of the parsed output
    with open('parsed_resume.json', 'w') as json_file:
        json.dump(parsed_resume, json_file)

    return render_template("review_output.html", parsed_resume=parsed_resume)
51
+
52
@app.route("/v1/users/<int:id>", methods=['GET'])
def get_user(id):
    # Placeholder endpoint: echoes the requested id back; no user store is queried yet.
    return jsonify(message="user retrieved successfully! for given id {}".format(id))
55
+
56
@app.route('/', methods=['GET'])
def greet():
    # Landing page.
    return render_template('home_page.html')
59
+
60
if __name__ == '__main__':
    # Development entry point; run behind a WSGI server in production.
    app.run()
src/resume_parser.py ADDED
@@ -0,0 +1,1274 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# python file to parse different section from resume
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LTTextContainer, LTChar, LTTextLineHorizontal
from collections import defaultdict
from flask import jsonify
import re, fitz, requests, logging, datetime
from .config import data_science_skills, keyword_variations, essential_skills, quality_mapping, Extract_sections, suggested_projects, ignore_rule_ids
from .config import required_sections, linkedin_domain, github_domain, basic_informations, section_headers, common_projects, ignore_error_keywords,blog_articles,youtube_links
from .config import kaggle_domain,hackerrank_domain,leetcode_domain,medium_domain
from spacy.matcher import Matcher
import language_tool_python
from collections import defaultdict  # NOTE(review): duplicate import — defaultdict is already imported above
import random
# Module-level grammar checker: starting LanguageTool is expensive, so the
# instance is created once at import time and shared.
tool = language_tool_python.LanguageTool('en-US')
15
+
16
+
17
+
18
+ class ResumeParser:
19
+
20
+ def extract_contact_number_from_resume(self, text):
21
+ contact_number = None
22
+ suggestion = ""
23
+
24
+ # Use regex pattern to find a potential contact number
25
+ pattern = r"\b(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}\b"
26
+ match = re.search(pattern, text)
27
+ if match:
28
+ contact_number = match.group()
29
+ # Check if the contact number is of the correct length
30
+ digits_only = re.sub(r'\D', '', contact_number)
31
+ if len(digits_only) == 10:
32
+ suggestion = ""
33
+ elif len(digits_only) > 10 and digits_only.startswith('91') and len(digits_only[2:]) == 10:
34
+ suggestion = ""
35
+ else:
36
+ suggestion = "Contact number should have exactly 10 digits."
37
+
38
+ return contact_number, suggestion
39
+
40
+
41
+
42
+ def extract_hyperlinks(self, pdf_path):
43
+ doc = fitz.open(pdf_path)
44
+ links = []
45
+
46
+ for page_num in range(len(doc)):
47
+ page = doc.load_page(page_num)
48
+ link_list = page.get_links()
49
+ for link in link_list:
50
+ uri = link.get('uri', None)
51
+ if uri:
52
+ links.append(uri)
53
+
54
+ return links
55
+
56
    def extract_text_from_pdf(self, pdf_path):
        """Extract the PDF's full plain text via pdfminer's extract_text."""
        return extract_text(pdf_path)
58
+
59
+ def extract_email_from_text(self, text):
60
+ pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
61
+ match = re.search(pattern, text)
62
+ if match:
63
+ return match.group()
64
+ return None
65
+
66
+ def extract_email_from_resume(self, pdf_path):
67
+ text = self.extract_text_from_pdf(pdf_path)
68
+ email = self.extract_email_from_text(text)
69
+ suggestion = ""
70
+
71
+ # If no email found in text, check hyperlinks
72
+ if not email:
73
+ links = self.extract_hyperlinks(pdf_path)
74
+ for link in links:
75
+ if link.startswith('mailto:'):
76
+ email_candidate = link.split('mailto:')[1]
77
+ if self.is_valid_email(email_candidate):
78
+ email = email_candidate
79
+ break
80
+
81
+ # Additional validation for email found in text or links
82
+ if email and not self.is_valid_email(email):
83
+ suggestion += "Your email address doesn't seem to be valid. Please check and correct."
84
+
85
+ return email, suggestion
86
+
87
+
88
+ def is_valid_email(self, email):
89
+ # Length check
90
+ if len(email) > 254:
91
+ return False
92
+
93
+ # Consecutive special characters check
94
+ if re.search(r"[._%+-]{2,}", email):
95
+ return False
96
+
97
+ # Domain part validation
98
+ domain_part = email.split('@')[1]
99
+ if not re.match(r"[A-Za-z0-9.-]+\.[A-Za-z]{2,}", domain_part):
100
+ return False
101
+
102
+ # Standard email format check
103
+ pattern = r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"
104
+ return re.match(pattern, email) is not None
105
+
106
+
107
+ def extract_sections_from_resume(self, text):
108
+ missing_sections = []
109
+ sections_not_capitalized = []
110
+
111
+ for section in required_sections:
112
+ pattern = r"\b{}\b".format(re.escape(section))
113
+
114
+ match_obj = re.search(pattern, text, re.IGNORECASE)
115
+ if not match_obj:
116
+ missing_sections.append(section)
117
+ else:
118
+ if match_obj.group() not in map(str.upper, required_sections):
119
+ sections_not_capitalized.append(section)
120
+
121
+ return missing_sections, sections_not_capitalized
122
+
123
+ def extract_skills_from_resume(self, text):
124
+ if not isinstance(text, str):
125
+ raise ValueError(f"Expected 'text' to be a string, but got {type(text)}")
126
+
127
+ skills = []
128
+ for skill in essential_skills:
129
+ pattern = r"\b{}\b".format(re.escape(skill))
130
+ match = re.search(pattern, text, re.IGNORECASE)
131
+ if match:
132
+ skills.append(skill)
133
+ return skills
134
+
135
+ def extract_keyword_variations_from_resume(self, text):
136
+ found_keywords = []
137
+ for keyword, variations in keyword_variations.items():
138
+ for variation in variations:
139
+ if variation.lower() in text.lower():
140
+ found_keywords.append(variation)
141
+ break
142
+
143
+ return found_keywords
144
+
145
+ def extract_keyword_variations_from_formatted_text(self, formatted_text):
146
+ found_keyword_section = []
147
+ for keyword, variations in keyword_variations.items():
148
+ for variation in variations:
149
+ if variation.lower() in formatted_text.lower():
150
+ found_keyword_section.append(variation)
151
+ break
152
+
153
+ return found_keyword_section
154
+
155
    def extract_linkedIn_urls_from_pdf(self, pdf_path):
        """Return a LinkedIn URL found among the PDF's hyperlink annotations.

        Scans every page's link annotations; each match overwrites the
        previous one, so the LAST matching URI wins.  Returns ``None`` when
        no LinkedIn link exists.
        """
        linkedin_urls = None
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            links = page.get_links()
            for link in links:
                url = link.get('uri', '')
                # linkedin_domain only needs to occur somewhere in the URI
                if re.search(linkedin_domain, url):
                    linkedin_urls = url
        pdf_document.close()
        return linkedin_urls
167
+
168
    def extract_github_urls_from_pdf(self, pdf_path):
        """Return a GitHub *profile* URL from the PDF's hyperlink annotations.

        For each link matching ``github_domain``, the matched prefix is
        stripped with re.sub and the remainder split on '/'; only URLs whose
        remainder is a single segment (a profile page, not a repository) are
        kept.  The last such URL wins.  Returns ``None`` when none is found.

        NOTE(review): github_domain ends with a one-character class (no '+'),
        so re.sub also removes the first character of the username.  The
        segment count used below is still correct, but the intermediate
        ``path`` value is not the full username — confirm before changing
        the pattern in config.
        """
        github_urls = None
        pdf_document = fitz.open(pdf_path)
        for page_num in range(len(pdf_document)):
            page = pdf_document.load_page(page_num)
            links = page.get_links()
            for link in links:
                url = link.get('uri', '')
                if re.search(github_domain, url):
                    path = re.sub(github_domain, '', url)
                    parts = path.split('/')
                    # exactly one remaining segment => user profile, not a repo
                    if len(parts) == 1:
                        github_urls = url
        pdf_document.close()
        return github_urls
183
+
184
+
185
+ def extract_extra_urls_pdf(self,pdf_path, domains):
186
+ extracted_urls = defaultdict(set)
187
+ try:
188
+ # Open the PDF document
189
+ pdf_document = fitz.open(pdf_path)
190
+
191
+ # Iterate through all pages in the PDF
192
+ for page_num in range(len(pdf_document)):
193
+ page = pdf_document.load_page(page_num)
194
+ links = page.get_links()
195
+
196
+ for link in links:
197
+ url = link.get('uri', '')
198
+ if url: # Ensure there's a URL
199
+ for domain in domains:
200
+ if re.search(domain, url, re.IGNORECASE):
201
+ extracted_urls[domain].add(url) # Add URL to the domain's set
202
+ except Exception as e:
203
+ print(f"Error processing PDF: {e}")
204
+ finally:
205
+ pdf_document.close()
206
+
207
+ return {domain: list(urls) for domain, urls in extracted_urls.items()}
208
+
209
+ def is_valid_url(self , github_urls ):
210
+ suggest = ""
211
+ for _ in [github_urls]:
212
+ if not github_urls:
213
+ break
214
+
215
+ try:
216
+ response = requests.head(github_urls)
217
+ if response.status_code != 200:
218
+ suggest = "GitHub URL is not valid, please check and correct. "
219
+ except requests.RequestException:
220
+ suggest = "GitHub URL is not valid, please check and correct. "
221
+
222
+ return suggest
223
+ return suggest
224
+
225
+
226
+ def is_valid_name(self, name):
227
+ if any(char.isdigit() for char in name):
228
+ return False
229
+ if len(name.split()) > 3:
230
+ return False
231
+ common_non_names = {"Email", "Github", "LinkedIn", "Portfolio", "Data Analyst"}
232
+ if name in common_non_names:
233
+ return False
234
+ return True
235
+
236
+ def extract_name(self, resume_text):
237
+
238
+ lines = resume_text.split('\n')
239
+
240
+ # Use regex to find lines that likely contain names
241
+ name_lines = [line for line in lines if re.match(r'^[A-Za-z]*\s[A-Za-z]*', line.strip())]
242
+
243
+ names = []
244
+ for i in range(len(name_lines)):
245
+ if self.is_valid_name(name_lines[i].strip()):
246
+ names.append(name_lines[i].strip())
247
+
248
+ if len(names) >= 1:
249
+ name = names[0]
250
+ suggestion = ""
251
+ # Check if the name parts contain only alphabetic characters
252
+ name_parts = name.split()
253
+ if any(part[0].islower() for part in name_parts):
254
+ suggestion += " name should start with a capital letter. "
255
+ return name, suggestion
256
+
257
+ return None, "No valid name found"
258
+
259
+
260
+ def check_missing_sections(self, resume_data):
261
+ missing_information = []
262
+ for section in basic_informations:
263
+ if not resume_data.get(section):
264
+ missing_information.append(section)
265
+ return missing_information
266
+
267
+ def segregate_sections(self, text):
268
+ header_pattern = re.compile(rf'^\s*({"|".join(re.escape(header) for header in section_headers)}):?\s*$', re.IGNORECASE)
269
+ sections_text = {}
270
+ current_section = None
271
+ lines = text.splitlines()
272
+ for line in lines:
273
+ clean_line = line.strip()
274
+ match = header_pattern.match(clean_line)
275
+ if match:
276
+ current_section = match.group(1).upper()
277
+ sections_text[current_section] = []
278
+ elif current_section:
279
+ sections_text[current_section].append(line.strip())
280
+
281
+ return sections_text
282
+
283
+ def extract_and_format_sections(self, sections_text, Extract_sections):
284
+ formatted_text = ""
285
+ for section in Extract_sections:
286
+ if section in sections_text:
287
+ section_content = " ".join(sections_text[section]).replace('\n', ' ')
288
+ formatted_text += f"{section}:\n{section_content}\n\n"
289
+ return formatted_text
290
+
291
+ def replace_keywords_with_placeholders(self, formatted_text, found_keyword_section):
292
+ placeholder_text = formatted_text
293
+ keyword_placeholders = {}
294
+
295
+ # Use a set to avoid duplicates and keep track of keyword placeholders
296
+ used_keywords = set()
297
+ for i, keyword in enumerate(found_keyword_section):
298
+ if keyword not in used_keywords:
299
+ used_keywords.add(keyword)
300
+ placeholder = f"{{KEYWORD_{i}}}"
301
+ keyword_placeholders[placeholder] = keyword
302
+ # Using word boundary to match whole words
303
+ placeholder_text = re.sub(r'\b' + re.escape(keyword) + r'\b', placeholder, placeholder_text, flags=re.IGNORECASE)
304
+
305
+ return placeholder_text, keyword_placeholders
306
+
307
+ def replace_placeholders_with_keywords(self, grammar_issues, keyword_placeholders):
308
+ updated_issues = []
309
+ for issue in grammar_issues:
310
+ context = issue['context']
311
+ for placeholder, keyword in keyword_placeholders.items():
312
+ context = context.replace(placeholder, keyword)
313
+ # Update the context in the issue dictionary
314
+ issue['context'] = context
315
+ updated_issues.append(issue)
316
+ return updated_issues
317
+
318
+ def grammar_check(self, placeholder_text):
319
+ matches = tool.check(placeholder_text)
320
+ grammar_issues = []
321
+ for match in matches:
322
+ issue = {
323
+ "context": match.context,
324
+ "error": match.message,
325
+ "rule_id": match.ruleId,
326
+ "suggested_correction": match.replacements
327
+ }
328
+ grammar_issues.append(issue)
329
+ return grammar_issues
330
+
331
+ def filter_grammar_issues(self, grammar_issues, ignore_rule_ids=None, ignore_error_keywords=None):
332
+ if ignore_rule_ids is None:
333
+ ignore_rule_ids = []
334
+ if ignore_error_keywords is None:
335
+ ignore_error_keywords = []
336
+
337
+ filtered_issues = []
338
+ for issue in grammar_issues:
339
+ if issue['rule_id'] not in ignore_rule_ids and not any(keyword in issue['error'] for keyword in ignore_error_keywords):
340
+ filtered_issues.append(issue)
341
+
342
+ return filtered_issues
343
+
344
+ def process_resume(self, text, found_keyword_section, Extract_sections):
345
+ sections_text = self.segregate_sections(text)
346
+ formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
347
+ found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
348
+ placeholder_text, keyword_placeholders = self.replace_keywords_with_placeholders(formatted_text, found_keyword_section)
349
+ grammar_issues = self.grammar_check(placeholder_text)
350
+ grammar_issues_text = self.replace_placeholders_with_keywords(grammar_issues, keyword_placeholders)
351
+ filtered_grammar_issues = self.filter_grammar_issues(grammar_issues, ignore_rule_ids, ignore_error_keywords)
352
+ return filtered_grammar_issues
353
+
354
+ def grammar_issue_check(self, text, found_keyword_section, Extract_sections):
355
+ issues = {}
356
+ text1 = " ".join(text.split("\n"))
357
+ for section in Extract_sections:
358
+ grammar_issues = self.process_resume(text, found_keyword_section, [section])
359
+ if not grammar_issues:
360
+ grammar_issues = "no error found"
361
+ issues[section] = grammar_issues
362
+ return issues
363
+
364
+ def normalize_font_name(self,font_name):
365
+ if '-' in font_name:
366
+ font_name = font_name.split('-')[0]
367
+ if '+' in font_name:
368
+ font_name = font_name.split('+')[1]
369
+ return font_name
370
+
371
+
372
    def extract_text_properties(self, pdf_path, predefined_terms):
        """Walk the PDF character-by-character and collect styled text runs.

        A "run" is a maximal phrase of consecutive characters sharing the same
        font size, font name and page; runs are broken on whitespace and on a
        fixed set of bullet/punctuation characters. Runs whose text is a
        substring of any entry in `predefined_terms` are skipped, so names,
        emails and section headers are not flagged by later font checks.

        Args:
            pdf_path: Path to the PDF file (parsed with pdfminer).
            predefined_terms: Strings whose fragments should not be reported.

        Returns:
            list[dict]: one dict per run with keys "text", "font_size",
            "font_name" and "page_num".
        """
        text_properties = []
        # Accumulator state for the run currently being built.
        current_phrase = ""
        current_font_size = None
        current_font_name = None
        current_page_num = None

        # Characters that terminate a phrase in addition to whitespace
        # (bullets plus ASCII punctuation).
        special_characters = set("●▪•!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~")

        def add_current_phrase():
            # Flush the accumulated phrase (if any) unless it is part of a
            # predefined term, then reset the accumulator.
            nonlocal current_phrase
            if current_phrase.strip():
                flag = any(current_phrase in term for term in predefined_terms)
                if not flag:
                    text_properties.append({
                        "text": current_phrase,
                        "font_size": current_font_size,
                        "font_name": current_font_name,
                        "page_num": current_page_num
                    })
            current_phrase = ""

        for page_layout in extract_pages(pdf_path):
            for element in page_layout:
                if isinstance(element, LTTextContainer):
                    for text_line in element:
                        if isinstance(text_line, LTTextLineHorizontal):
                            for character in text_line:
                                if isinstance(character, LTChar):
                                    text = character.get_text()
                                    font_size = round(character.size, 2)
                                    font_name = self.normalize_font_name(character.fontname)
                                    page_num = page_layout.pageid

                                    # Whitespace or punctuation ends the run.
                                    if text.isspace() or text in special_characters:
                                        add_current_phrase()
                                        continue

                                    # A style or page change also ends the run
                                    # before the new style is recorded.
                                    if (font_size != current_font_size or font_name != current_font_name or
                                            page_num != current_page_num):
                                        add_current_phrase()
                                        current_font_size = font_size
                                        current_font_name = font_name
                                        current_page_num = page_num

                                    current_phrase += text

        # Flush whatever is left after the final character.
        add_current_phrase()

        return text_properties
422
+
423
+ def group_similar_fonts(self,text_properties, tolerance=0.5):
424
+ grouped_properties = defaultdict(list)
425
+
426
+ for prop in text_properties:
427
+ rounded_size = round(prop["font_size"] / tolerance) * tolerance
428
+ key = (prop["font_name"], rounded_size)
429
+ grouped_properties[key].append(prop)
430
+
431
+ return grouped_properties
432
+
433
+
434
+
435
+
436
+ def identify_different_fonts_and_sizes(self, grouped_properties):
437
+ most_common_group = max(grouped_properties.values(), key=len)
438
+ most_common_key = None
439
+ for key, group in grouped_properties.items():
440
+ if group == most_common_group:
441
+ most_common_key = key
442
+ break
443
+
444
+ different_texts = []
445
+
446
+ for key, group in grouped_properties.items():
447
+ if group != most_common_group:
448
+ for prop in group:
449
+ reason = []
450
+ if key[1] != most_common_key[1]:
451
+ reason.append(f"size not {most_common_key[1]}")
452
+ if key[0] != most_common_key[0]:
453
+ reason.append(f"font not {most_common_key[0]}")
454
+ different_texts.append({
455
+ "page_num": prop['page_num'],
456
+ "text": prop['text'],
457
+ "found_size": prop['font_size'],
458
+ "found_font_name": prop['font_name'],
459
+ "reason": ", ".join(reason)
460
+ })
461
+
462
+ return different_texts
463
+
464
+ def parse_dates(self, sections_text, section_name):
465
+ # Check if the section is in the text
466
+ suggest = ""
467
+
468
+ # Define the date patterns to match various date formats
469
+ date_pattern = (
470
+ r'\b\d{1,2}/\d{4}\b|' # MM/YYYY
471
+ r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{4}\b|' # Month YYYY
472
+ r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)\s+\d{1,2},?\s*\d{4}\b|' # Month DD, YYYY
473
+ r'\b\d{4}\b|' # YYYY
474
+ r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*/?\d{4}\b|' # Month/YYYY
475
+ r'\b(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\s*-\s*(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|jun(?:e)?|jul(?:y)?|aug(?:ust)?|sep(?:tember)?|oct(?:ober)?|nov(?:ember)?|dec(?:ember)?)[a-z]*\d{4}\b' # Month/YYYY - Month/YYYY
476
+ )
477
+
478
+ all_dates = []
479
+
480
+ # Iterate over the entries in the section_name
481
+ for entry in sections_text[section_name]:
482
+ entry = entry.lower()
483
+ matches = re.findall(date_pattern, entry)
484
+ if matches and len(matches)>1:
485
+ if len(matches) == 2:
486
+ all_dates.append(f"{matches[0]} {matches[1]}")
487
+ else:
488
+ all_dates.extend(matches)
489
+
490
+ return all_dates
491
+
492
+
493
    def convert_to_date(self, date_str):
        """Parse a raw date/range string into (start_date, end_date) objects.

        Supports "MM/YYYY", "MM YYYY", "Month YYYY" and bare "YYYY" tokens,
        extracted by the findall below. One token yields start == end; two
        tokens yield a range.

        Returns:
            (datetime.date, datetime.date) on success, or [] when zero or
            more than two tokens were extracted.

        NOTE(review): the inner extract_date also returns [] when a token
        matches none of its patterns; the subsequent `.year` access would then
        raise AttributeError — confirm upstream tokens always match one of the
        patterns.
        """
        # Mapping of month names and abbreviations to their numeric equivalents
        month_map = {
            'jan': 1, 'january': 1, 'feb': 2, 'february': 2,
            'mar': 3, 'march': 3, 'apr': 4, 'april': 4,
            'may': 5, 'jun': 6, 'june': 6, 'jul': 7,
            'july': 7, 'aug': 8, 'august': 8, 'sep': 9,
            'september': 9, 'oct': 10, 'october': 10,
            'nov': 11, 'november': 11, 'dec': 12, 'december': 12,
            '01': 1, '02': 2, '03': 3, '04': 4,
            '05': 5, '06': 6, '07': 7, '08': 8,
            '09': 9, '10': 10, '11': 11, '12': 12
        }

        # Regex patterns to match different date formats
        pattern_mm_yyyy = re.compile(r'(\d{1,2})/(\d{4})')
        pattern_mm_yyyy_space = re.compile(r'(\d{1,2})\s(\d{4})')
        pattern_month_yyyy = re.compile(r'([a-zA-Z]+)\s?(\d{4})')
        pattern_yyyy = re.compile(r'(\d{4})')

        def extract_date(date_str):
            # Try the formats from most to least specific; a bare year is
            # normalised to January of that year.
            match_mm_yyyy = pattern_mm_yyyy.match(date_str)
            match_mm_yyyy_space = pattern_mm_yyyy_space.match(date_str)
            match_month_yyyy = pattern_month_yyyy.match(date_str)
            match_yyyy = pattern_yyyy.match(date_str)

            if match_mm_yyyy:
                month = int(match_mm_yyyy.group(1))
                year = int(match_mm_yyyy.group(2))
            elif match_mm_yyyy_space:
                month = int(match_mm_yyyy_space.group(1))
                year = int(match_mm_yyyy_space.group(2))
            elif match_month_yyyy:
                # month_map.get may return None for an unknown month name;
                # datetime.date would then raise TypeError.
                month = month_map.get(match_month_yyyy.group(1).lower())
                year = int(match_month_yyyy.group(2))
            elif match_yyyy:
                month = 1
                year = int(match_yyyy.group(1))
            else:
                return []

            return datetime.date(year, month, 1)

        # Tokenise the input into up to two date fragments.
        date_parts = re.findall(r'(\d{4}\s[a-zA-Z]+\s?|\d{4}[a-zA-Z]+|\d{4}\/\d{2}|\d{4}\s\d{2}|[a-zA-Z]+\s?\d{4}|\d{4}\s[a-zA-Z]+)', date_str)
        if len(date_parts) == 1:
            # Standalone year or single date
            start_date = extract_date(date_parts[0])
            end_date = datetime.date(start_date.year, start_date.month, start_date.day)
        elif len(date_parts) == 2:
            # Date range
            start_date = extract_date(date_parts[0])
            end_date = extract_date(date_parts[1])
        else:
            return []

        return start_date, end_date
549
+
550
+
551
+ def date_time(self, date_parts):
552
+ converted_dates = []
553
+ for date_part in date_parts:
554
+ start_date, end_date = self.convert_to_date(date_part)
555
+ converted_dates.append((start_date, end_date))
556
+ return converted_dates
557
+
558
+
559
+ def check_chronological_order(self, converted_dates, section_name ):
560
+ suggestion = ""
561
+ sorted_dates = sorted(converted_dates, key=lambda x: (x[1], x[0]), reverse=True)
562
+ if converted_dates == sorted_dates:
563
+ suggestion = f"{section_name} section is in chronological order."
564
+ else:
565
+ suggestion = f"{section_name} section is not in chronological order."
566
+
567
+ return suggestion
568
+
569
+ def check_common_projects(self, projects_text):
570
+ found_projects = []
571
+ for project in common_projects:
572
+ if project.lower() in projects_text.lower():
573
+ found_projects.append(project)
574
+ return found_projects
575
+
576
+ def recommend_resources():
577
+ # Randomly pick 2 blog articles and 2 YouTube links
578
+ recommended_blogs = random.sample(blog_articles, 2)
579
+ recommended_youtube = random.sample(youtube_links, 2)
580
+
581
+ # Return the recommendations
582
+ return {
583
+ "Recommended Blogs": recommended_blogs,
584
+ "Recommended YouTube Links": recommended_youtube
585
+ }
586
+
587
+ def check_imarticus_certifications(self, certifications_text):
588
+ # Check if "imarticus" is present in the certifications text
589
+ if "imarticus" in certifications_text.lower():
590
+ return {
591
+ "found": True,
592
+ "message": "Imarticus certification found. Please upload it in the academic section."
593
+ }
594
+ return {
595
+ "found": False,
596
+ "message": "No Imarticus certification found in the provided text."
597
+ }
598
+
599
+
600
+ def chronological_order_check(self, sections_text, section_name):
601
+ order_suggestion = ""
602
+ suggestion = ""
603
+ section_name = section_name.upper()
604
+ if section_name in sections_text:
605
+ date = self.parse_dates(sections_text, section_name)
606
+ if date:
607
+ converted_dates = self.date_time(date)
608
+ order_suggestion = self.check_chronological_order(converted_dates, section_name)
609
+ else:
610
+ suggestion = f"No valid dates found in {section_name} section. "
611
+ else:
612
+ suggestion = f"{section_name} is not in section header. "
613
+
614
+ return order_suggestion, suggestion
615
+
616
+
617
+
618
+ # Function to check for spelling mistakes
619
+ def check_spelling(self, headers, section_headers):
620
+ suggestions = []
621
+ for header in headers:
622
+ if header.upper() not in map(str.upper, section_headers):
623
+ suggestions = header
624
+ return suggestions
625
+
626
    def is_present_name(name):
        """
        Checks if a given name has at least 2 words.

        Args:
            name: The name string to check.

        Returns:
            True if it has at least 2 words, false otherwise.

        NOTE(review): defined without `self` and shadowed by the later
        `is_present_name(self, name)` definition in this class, so this
        version is dead code and unreachable via instances.
        """
        parts = name.split()
        return len(parts) >= 2
638
+
639
    def is_sentence_case(name):
        """Return True when every word is capitalised (first letter upper, rest lower).

        NOTE(review): defined without `self` and shadowed by the later
        `is_sentence_case(self, name)` definition in this class, so this
        version is dead code and unreachable via instances.
        """
        parts = name.split()  # Split into individual words
        for part in parts:
            if not part:  # handles empty strings in name
                continue
            if not part[0].isupper() or not part[1:].islower():
                return False  # Check if first letter is uppercase and rest are lowercase
        return True
648
+
649
+ def is_present_name(self,name):
650
+ parts = name.split()
651
+ return len(parts) >= 2
652
+
653
+ def is_sentence_case(self,name):
654
+ parts = name.split()
655
+ for part in parts:
656
+ if not part:
657
+ continue
658
+ if not part[0].isupper() or not part[1:].islower():
659
+ return False
660
+ return True
661
+
662
+ def extract_project_links(self,sections_text):
663
+ project_links = {}
664
+
665
+ if "PROJECTS" in sections_text:
666
+ project_list = sections_text.get("PROJECTS", [])
667
+ url_pattern = r"https?://[^\s]+"
668
+ for project in project_list:
669
+ links = re.findall(url_pattern,project)
670
+ if links:
671
+ project_links[project] = links
672
+ return project_links
673
+
674
+ def count_sentences(self,text):
675
+ sentence_endings = r"(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?|!)\s"
676
+ sentences = re.split(sentence_endings, text)
677
+ sentences = [s.strip() for s in sentences if s.strip()]
678
+ return len(sentences)
679
+
680
+ def calculate_summary_score(self,summary):
681
+ if not summary:
682
+ score+=0
683
+
684
+ num_sentences = self.count_sentences(summary)
685
+ if num_sentences<=4:
686
+ return 3
687
+ elif num_sentences>4:
688
+ return 1
689
+ else:
690
+ return 0
691
+
692
+ def calculate_extra_urls_bonus(self,pdf_path):
693
+ domains = [
694
+ r"hackerrank\.com", # Hackerrank
695
+ r"leetcode\.com", # LeetCode
696
+ r"medium\.com" # Medium
697
+ ]
698
+ extra_urls = self.extract_extra_urls_pdf(pdf_path, domains)
699
+ has_extra_urls = any(urls for urls in extra_urls.values())
700
+ return 5 if has_extra_urls else 0
701
+
702
+ def calculate_relevant_experience_score(self, experience_text):
703
+ """
704
+ Assigns a score based on the presence of relevant experience keywords.
705
+
706
+ Args:
707
+ experience_text (str): The extracted work experience section text.
708
+
709
+ Returns:
710
+ int: A score of 5 if relevant keywords are found, otherwise 0.
711
+ """
712
+ if not experience_text:
713
+ return 0 # ✅ No experience section → Score 0
714
+
715
+ if isinstance(experience_text, list):
716
+ experience_text = " ".join(experience_text) # ✅ Convert list to a single string
717
+
718
+ experience_text = experience_text.strip().lower() # ✅ Ensure it's a string and lowercase
719
+
720
+ # ✅ Check if any keyword from 'data_science_skills' or 'essential_skills' exists
721
+ for skill in data_science_skills + essential_skills:
722
+ if skill.lower() in experience_text:
723
+ return 5 # ✅ Found relevant experience → Full score
724
+
725
+ return 0
726
+
727
+ def calculate_ds_skills_score(self, skills_present):
728
+ if not skills_present: # No skills found at all
729
+ return 0
730
+
731
+ # Use skills from config instead of hardcoded list
732
+ ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
733
+ skills_present_lower = [skill.lower() for skill in skills_present]
734
+
735
+ matching_count = sum(1 for skill in skills_present_lower
736
+ if skill in ds_skills_list_lower)
737
+
738
+ if matching_count == 0: # Skills found but none match DS list
739
+ return 2
740
+ elif 1 <= matching_count <= 5:
741
+ return 3
742
+ elif matching_count > 5:
743
+ return 5
744
+ return 0
745
+
746
+ def calculate_project_link_score(self, projects_with_links):
747
+ """
748
+ Assigns a score based on whether project links are present.
749
+
750
+ Args:
751
+ projects_with_links (int): The number of projects with links.
752
+
753
+ Returns:
754
+ int: 2 if project links are found, otherwise 0.
755
+ """
756
+ return 2 if projects_with_links > 0 else 0
757
+
758
+
759
    def imarticus_review_score(self,name,contact_number,email,linkedin_urls,github_url,missing_sections,sections_not_capitalized,common_projects,section_order_suggestion,sections_text,skills,relevant_experience_score):
        """Sum the overall resume review score across all criteria.

        Criteria: name casing (3 / 1.5), valid Indian mobile number (3),
        valid email (3), LinkedIn link (3), reachable GitHub link (3),
        section completeness (up to 10), absence of template projects (5),
        section-order penalty (-2), project count and project links,
        data-science skills, certifications, and the pre-computed
        relevant_experience_score. The triple-quoted blocks below are earlier
        scoring variants kept as inert string literals.
        """
        score = 0
        # --- Name (up to 3) ---
        if name:
            name_parts = name.split()
            num_parts = len(name_parts)

            if num_parts == 0:
                score += 0  # whitespace-only name earns nothing
            if self.is_sentence_case(name):
                score += 3
            elif self.is_present_name(name):
                score += 1.5

        # --- Contact number (3 for a valid 10-digit Indian mobile) ---
        if contact_number and isinstance(contact_number, str):
            digits_only = re.sub(r'\D', '', contact_number)

            if digits_only.startswith("91") and len(digits_only) > 10:
                digits_only = digits_only[2:]  # Remove the first two characters ('91')

            if len(digits_only) == 10 and digits_only[0] in "6789":  # Check for valid Indian mobile numbers
                score += 3

        # --- Email / LinkedIn / GitHub (3 each) ---
        if email:
            score += 3 if self.is_valid_email(email) else 0

        score += 3 if linkedin_urls else 0

        if github_url:
            # is_valid_url returns a non-empty suggestion only on failure.
            github_suggestion = self.is_valid_url(github_url)
            score += 3 if not github_suggestion else 0
        else:
            score += 0

        # --- Section completeness (up to 10) ---
        if len(missing_sections)==0 and len(sections_not_capitalized)==0:
            score+=10
        elif len(missing_sections)==0 and len(sections_not_capitalized)>0:
            score+=8
        elif len(missing_sections)<=3:
            score+=6
        elif len(missing_sections)>4:
            score+=3

        # --- Originality: no template projects (5) ---
        if common_projects:
            score +=0
        else:
            score +=5

        # --- Section-order penalty ---
        if section_order_suggestion:
            score -= 2
        else:
            score  # no-op: no penalty when ordering is fine

        """
        ds_skills_list_lower = [skill.lower() for skill in data_science_skills]
        skills_present_lower = [skill.lower() for skill in self.extract_skills_from_resume(skills) ]

        matching_skill_count = 0
        for skill in skills_present_lower:
            if ds_skills_list_lower:
                matching_skill_count+=1
        if matching_skill_count==0:
            score+=0

        if matching_skill_count<=5:
            score+=2
        elif matching_skill_count>=10 and matching_skill_count<=15:
            score+5
        else:
            score+=8
        """

        # --- Project count (lines containing "Description") ---
        if "PROJECTS" not in sections_text:
            score+=0
        else:
            project_list = sections_text.get("PROJECTS",[])
            project_count = len([x for x in project_list if "Description" in x])

            if project_count<=2:
                score+=2
            elif project_count>2 and project_count<=4:
                score+=5
            elif project_count>4:
                score+=3
        """
        project_links = self.extract_project_links(sections_text)
        total_projects = len(sections_text.get("PROJECTS", []))
        projects_with_links = len(project_links)

        if total_projects > 0:
            if projects_with_links == 0:
                score+=0
            elif projects_with_links / total_projects >= 0.5:
                score += 1.5
            if projects_with_links == total_projects:
                score += 3
        """
        resume_data = {}
        # Extract projects & links
        project_links = self.extract_project_links(sections_text)
        projects_with_links = len(project_links)

        # ✅ Count only projects with descriptions
        valid_projects = [
            p for p in sections_text.get("PROJECTS", []) if "description" in p.lower()
        ]
        total_projects = len(valid_projects)  # ✅ Count projects properly

        # ✅ Calculate project link score
        project_link_score = self.calculate_project_link_score(projects_with_links)
        resume_data["project_link_score"] = project_link_score

        # ✅ Prevent division by zero
        if total_projects > 0:
            if projects_with_links == 0:
                score += 0
            elif projects_with_links / total_projects >= 0.5:
                score += 1.5
                if projects_with_links == total_projects:
                    score += 3
        else:
            score += 0  # ✅ Ensure no division error if no projects exist

        """
        profile_summary = sections_text.get("PROFILE SUMMARY", "")
        print(profile_summary)

        summary_score = self.calculate_summary_score(profile_summary)
        score += summary_score
        """
        # --- Data-science skills ---
        ds_skills_score = self.calculate_ds_skills_score(skills)
        score += ds_skills_score

        # --- Certifications (up to 7) ---
        certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        num_certifications = len(certifications)

        if num_certifications==0:
            score+=0
        elif 0 < num_certifications <= 2:
            score+=3
        elif 2 < num_certifications <= 4:
            score+=5
        elif num_certifications>4:
            score+=7

        """
        extra_urls_bonus = self.calculate_extra_urls_bonus(pdf_path)
        score += extra_urls_bonus
        """

        score += relevant_experience_score

        score += project_link_score

        return score
914
+
915
+
916
+
917
+
918
    def imarticus_detailed_score(self, name, contact_number, email, linkedin_urls, github_url,
                                 missing_sections=None, sections_not_capitalized=None, common_projects=None,
                                 section_order_suggestion=None, sections_text=None, skills=None,
                                 relevant_experience_score=0):
        """Build a per-criterion breakdown dict of the resume review score.

        Mirrors the criteria of imarticus_review_score but returns each
        component separately instead of a single sum. None arguments are
        normalised to empty containers first.
        """
        # Ensure lists and dictionaries have default values to avoid 'NoneType' errors
        missing_sections = missing_sections or []
        sections_not_capitalized = sections_not_capitalized or []
        common_projects = common_projects or []
        sections_text = sections_text or {}

        score_breakdown = {
            "name_score": 0,
            "contact_number_score": 0,
            "email_score": 0,
            "linkedin_url_score": 0,
            "github_url_score": 0,
            "missing_sections_score": 0,
            "common_projects_score": 0,
            "section_order_score": 0,
            "projects_score": 0,
            "certifications_score": 0,
            "relevant_experience_score": 0,
            "ds_skills_score": 0,
            "extra_urls_bonus": 0,
            "summary_score": 0,
            "project_link_score": 0
        }

        # ✅ Name Score (3 Points)
        if name:
            if self.is_sentence_case(name):
                score_breakdown["name_score"] = 3
            elif self.is_present_name(name):
                score_breakdown["name_score"] = 1.5


        # ✅ Contact Number Score (3 Points)
        if contact_number and isinstance(contact_number, str):
            digits_only = re.sub(r'\D', '', contact_number)
            if digits_only.startswith("91") and len(digits_only) > 10:
                digits_only = digits_only[2:]
            if len(digits_only) == 10 and digits_only[0] in "6789":
                score_breakdown["contact_number_score"] = 3

        # ✅ Email Score (3 Points)
        score_breakdown["email_score"] = 3 if email and self.is_valid_email(email) else 0

        # ✅ LinkedIn URL Score (3 Points)
        score_breakdown["linkedin_url_score"] = 3 if linkedin_urls else 0

        # ✅ GitHub URL Score (3 Points)
        # NOTE(review): is_valid_url returns a non-empty suggestion string only
        # when the URL is INVALID, so this condition awards 3 points for an
        # invalid URL and 0 for a valid one — inverted relative to
        # imarticus_review_score. Confirm and fix.
        if github_url and self.is_valid_url(github_url):
            score_breakdown["github_url_score"] = 3

        # ✅ Missing Sections Score (10 Points)
        if not missing_sections and not sections_not_capitalized:
            score_breakdown["missing_sections_score"] = 10
        elif not missing_sections and sections_not_capitalized:
            score_breakdown["missing_sections_score"] = 8
        elif len(missing_sections) <= 3:
            score_breakdown["missing_sections_score"] = 6
        else:
            score_breakdown["missing_sections_score"] = 3

        # ✅ Common Projects Score (5 Points)
        score_breakdown["common_projects_score"] = 0 if common_projects else 5

        # ✅ Section Order Score (2 Points)
        score_breakdown["section_order_score"] = -2 if section_order_suggestion else 0

        # ✅ Projects Score (5 Points)
        if "PROJECTS" in sections_text:
            project_list = sections_text.get("PROJECTS", [])
            project_count = len([x for x in project_list if "Description" in x])
            if project_count <= 2:
                score_breakdown["projects_score"] = 2
            elif 2 < project_count <= 4:
                score_breakdown["projects_score"] = 5
            else:
                score_breakdown["projects_score"] = 3

        # ✅ Certifications Score (7 Points)
        certifications = sections_text.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
        num_certifications = len(certifications)
        if num_certifications == 0:
            score_breakdown["certifications_score"] = 0
        elif 0 < num_certifications <= 2:
            score_breakdown["certifications_score"] = 3
        elif 2 < num_certifications <= 4:
            score_breakdown["certifications_score"] = 5
        else:
            score_breakdown["certifications_score"] = 7

        # ✅ Relevant Experience Score (5 Points)
        score_breakdown["relevant_experience_score"] = relevant_experience_score if relevant_experience_score is not None else 0

        # ✅ Data Science Skills Score (5 Points)
        score_breakdown["ds_skills_score"] = self.calculate_ds_skills_score(skills)

        # ✅ Extra URLs Bonus (5 Points)
        # NOTE(review): calculate_extra_urls_bonus expects a pdf_path, but a
        # dict (sections_text) is passed here — fitz.open would fail inside
        # the helper. Confirm the intended argument.
        score_breakdown["extra_urls_bonus"] = self.calculate_extra_urls_bonus(sections_text)

        # ✅ Summary Score (5 Points)
        profile_summary = sections_text.get("PROFILE SUMMARY", "")
        score_breakdown["summary_score"] = self.calculate_summary_score(profile_summary)

        # ✅ Project Link Score (2 Points)
        project_links = self.extract_project_links(sections_text)
        projects_with_links = len(project_links)
        score_breakdown["project_link_score"] = self.calculate_project_link_score(projects_with_links)

        return score_breakdown
1031
+
1032
+ def calculate_name_score(self,name):
1033
+ if not name:
1034
+ return 0
1035
+
1036
+ name_parts = name.split()
1037
+ num_parts = len(name_parts)
1038
+
1039
+ if num_parts == 0:
1040
+ return 0
1041
+ elif self.is_sentence_case(name):
1042
+ return 3
1043
+ elif self.is_present_name(name):
1044
+ return 1.5
1045
+ else:
1046
+ return 0
1047
+
1048
+
1049
+ def calculate_contact(self,contact_number):
1050
+ if contact_number and isinstance(contact_number, str):
1051
+ digits_only = re.sub(r'\D', '', contact_number)
1052
+
1053
+ if digits_only.startswith("91") and len(digits_only) > 10:
1054
+ digits_only = digits_only[2:] # Remove the first two characters ('91')
1055
+
1056
+ if len(digits_only) == 10 and digits_only[0] in "6789": # Check for valid Indian mobile numbers
1057
+ return 3
1058
+ else:
1059
+ return 0
1060
+
1061
+ def calculate_email(self,email):
1062
+ if email:
1063
+ if self.is_valid_email(email):
1064
+ return 3
1065
+ else:
1066
+ return 0
1067
+
1068
+ def calculate_github_url_score(self,github_url):
1069
+ if github_url:
1070
+ github_suggestion = self.is_valid_url(github_url)
1071
+ return 3 if not github_suggestion else 0
1072
+ return 0
1073
+
1074
+ def parse_text(self, path):
1075
+ logger = logging.getLogger(__name__)
1076
+ logging.getLogger("pdfminer").setLevel(logging.WARNING)
1077
+ resume_data = {}
1078
+ logger.debug('parsing text')
1079
+ text = self.extract_text_from_pdf(path)
1080
+ text1 = " ".join(text.split("\n"))
1081
+ skills_found = self.extract_skills_from_resume(text)
1082
+ found_keywords = self.extract_keyword_variations_from_resume(text)
1083
+ sections_text = self.segregate_sections(text)
1084
+ formatted_text = self.extract_and_format_sections(sections_text, Extract_sections)
1085
+ found_keyword_section = self.extract_keyword_variations_from_formatted_text(formatted_text)
1086
+
1087
+ parsed_sections = self.segregate_sections(text)
1088
+ projects = parsed_sections.get("PROJECTS", [])
1089
+ certifications = parsed_sections.get("CERTIFICATIONS & ACADEMIC ENDEAVOURS", [])
1090
+ projects_text = "\n".join(projects)
1091
+ certifications_text = "\n".join(certifications)
1092
+ found_imarticus_certification = self.check_imarticus_certifications(certifications_text)
1093
+ found_projects = self.check_common_projects(projects_text)
1094
+
1095
+ name, name_suggestion = self.extract_name(text)
1096
+ contact_number, contact_suggestion = self.extract_contact_number_from_resume(text)
1097
+ email, email_suggestion = self.extract_email_from_resume(path)
1098
+ github_urls = self.extract_github_urls_from_pdf(path)
1099
+ github_urls_suggestions = self.is_valid_url(github_urls)
1100
+ linkedin_urls = self.extract_linkedIn_urls_from_pdf(path)
1101
+ section_by_grammer_issues = self.grammar_issue_check(text, found_keyword_section, Extract_sections)
1102
+
1103
+
1104
+ domains = [
1105
+ r"hackerrank\.com", # Hackerrank
1106
+ r"leetcode\.com", # LeetCode
1107
+ r"medium\.com" # Medium
1108
+ ]
1109
+ extra_urls = self.extract_extra_urls_pdf(path, domains)
1110
+
1111
+ education_order_suggestion, education_suggestion = self.chronological_order_check(sections_text, "ACADEMIC PROFILE")
1112
+ experience_order_suggestion, experience_suggestion = self.chronological_order_check(sections_text, "WORK EXPERIENCE")
1113
+
1114
+ headers = list(sections_text.keys())
1115
+ spelling_suggestions = self.check_spelling(headers, section_headers)
1116
+
1117
+ predefined_terms = [name, email]
1118
+ predefined_terms.extend(required_sections)
1119
+ text_properties = self.extract_text_properties(path, predefined_terms)
1120
+ grouped_properties = self.group_similar_fonts(text_properties)
1121
+ different_texts = self.identify_different_fonts_and_sizes(grouped_properties)
1122
+
1123
+ font_suggestions = []
1124
+ for item in different_texts:
1125
+ font_suggestion = f"Formatting issue at Page: {item['page_num']}, Text: {item['text']}, Reason: {item['reason']}, Found font size: {item['found_size']}, Found font name: {item['found_font_name']}"
1126
+ font_suggestions.append(font_suggestion)
1127
+
1128
+ missing_sections, sections_not_capitalized = self.extract_sections_from_resume(text)
1129
+
1130
+ linkedin_urls_suggestion = str()
1131
+ common_project = str()
1132
+ if not name:
1133
+ name_suggestion = "Please add name to the resume."
1134
+ if not contact_number:
1135
+ contact_suggestion = "Please add the contact number to the resume."
1136
+ if not email:
1137
+ email_suggestion = "Please add the email address to the resume."
1138
+ if not github_urls:
1139
+ github_urls_suggestions = "Add the github_urls to the resume."
1140
+ if not linkedin_urls:
1141
+ linkedin_urls_suggestion = "Add the linkedin_urls to the resume."
1142
+ if found_projects:
1143
+ common_project = "Common projects found in Projects section: "
1144
+ for project in found_projects:
1145
+ common_project += project
1146
+
1147
+ # Replace the existing project length suggestion code with:
1148
+ project_list = sections_text.get("PROJECTS", [])
1149
+ projects_with_description = [
1150
+ p for p in project_list
1151
+ if "description" in p.lower()
1152
+ ]
1153
+ project_count = len(projects_with_description)
1154
+
1155
+ if project_count == 0:
1156
+ project_length_suggestion = "No projects found. Consider at least 2 projects."
1157
+ elif project_count == 1:
1158
+ project_length_suggestion = "Only 1 project found. Consider adding 1 more project."
1159
+ else:
1160
+ project_length_suggestion = f"{project_count} projects found."
1161
+
1162
+ # Store in resume data (keeps your existing URL extraction)
1163
+ resume_data["project_length_suggestion"] = project_length_suggestion
1164
+
1165
+ experience_text = sections_text.get("WORK EXPERIENCE", "") # ✅ Extract work experience section
1166
+ relevant_experience_score = self.calculate_relevant_experience_score(experience_text) # ✅ Calculate score
1167
+
1168
+ # ✅ Store in the final resume data output
1169
+ resume_data["relevant_experience_score"] = relevant_experience_score
1170
+
1171
+ section_grammar_check_issues = self.grammar_check(sections_text.keys())
1172
+
1173
+ recommended_blogs = random.sample(blog_articles, 2)
1174
+ recommended_youtube = random.sample(youtube_links, 2)
1175
+
1176
+ name_score = self.calculate_name_score(name)
1177
+
1178
+ contact_score = self.calculate_contact(contact_number)
1179
+
1180
+ email_score = self.calculate_email(email)
1181
+
1182
+ github_url_score = self.calculate_github_url_score(github_urls)
1183
+
1184
+ # Calculate imarticus_score
1185
+ imarticus_score = self.imarticus_review_score(
1186
+ name,
1187
+ contact_number,
1188
+ email,
1189
+ linkedin_urls,
1190
+ github_urls,
1191
+ missing_sections,
1192
+ sections_not_capitalized,
1193
+ common_projects=found_projects, # Ensure to pass found projects
1194
+ section_order_suggestion=experience_order_suggestion,
1195
+ sections_text=sections_text,
1196
+ skills=skills_found,
1197
+ relevant_experience_score=relevant_experience_score,
1198
+ #pdf_path=path
1199
+ #relevant_keywords_found=bool(found_keywords), # Convert to boolean
1200
+ #experience_orderly_arranged=experience_order_suggestion, # Pass orderly arrangement check
1201
+ #experience_section_present="WORK EXPERIENCE" in sections_text # Check if experience section is present
1202
+ )
1203
+
1204
+
1205
+
1206
+ # Populate resume data dictionary
1207
+ resume_data = {
1208
+ "name": name,
1209
+ "contact_number": contact_number,
1210
+ "email": email,
1211
+ "linkedin_urls": linkedin_urls,
1212
+ "experience_order_suggestion": experience_order_suggestion,
1213
+ "education_order_suggestion": education_order_suggestion,
1214
+ "grammer_issues_by_section": section_by_grammer_issues,
1215
+ "github_urls": github_urls,
1216
+ "skills": skills_found,
1217
+ "spelling_suggestions": spelling_suggestions,
1218
+ "found_keywords": found_keywords,
1219
+ "text": text,
1220
+ "font_suggestions": font_suggestions,
1221
+ "name_suggestion": name_suggestion,
1222
+ "contact_suggestion": contact_suggestion,
1223
+ "email_suggestion": email_suggestion,
1224
+ "github_urls_suggestions": github_urls_suggestions,
1225
+ "linkedin_urls_suggestion": "Add the LinkedIn URLs to the resume." if not linkedin_urls else "",
1226
+ "missing_sections": missing_sections,
1227
+ "common_projects": "Common projects found in Projects section: " + ", ".join(found_projects) if found_projects else "",
1228
+ "project_length_suggestion": project_length_suggestion,
1229
+ "section_grammar_check_issues": section_grammar_check_issues,
1230
+ "imarticus_score": imarticus_score, # Add the score to resume data
1231
+ "extra_urls": extra_urls,
1232
+ "certifications": {
1233
+ "found": found_imarticus_certification["found"],
1234
+ "message": found_imarticus_certification["message"],
1235
+ "text": certifications_text # Store extracted certification text
1236
+ },
1237
+ "recommended_blogs": recommended_blogs,
1238
+ "recommended_youtube_links": recommended_youtube,
1239
+ "name_score":name_score,
1240
+ "contact_score":contact_score,
1241
+ "email_score":email_score,
1242
+ "github_urls_score":github_url_score
1243
+
1244
+ }
1245
+
1246
+ # Additional checks and data additions
1247
+ if "WORK EXPERIENCE" in sections_text.keys() and "WORK EXPERIENCE" != list(sections_text.keys())[2]:
1248
+ section_order_suggestion = f"WORK EXPERIENCE should come before {list(sections_text.keys())[2]}"
1249
+ resume_data["section_order_suggestion"] = section_order_suggestion
1250
+
1251
+ missing_important_sections = self.check_missing_sections(resume_data)
1252
+ resume_data["basic_information_section"] = missing_important_sections or "Basic information is Found"
1253
+
1254
+ missing_skills = list(set(essential_skills) - set(skills_found))
1255
+ resume_data["missing_skills"] = missing_skills
1256
+
1257
+ found_keywords_count = len(resume_data["found_keywords"])
1258
+ num_keywords = len(keyword_variations)
1259
+ quality_mapping = {"Low": 0.2, "Medium": 0.5, "High": 0.8} # Assuming some quality mapping
1260
+ for quality, threshold in quality_mapping.items():
1261
+ if found_keywords_count < num_keywords * threshold:
1262
+ resume_data["quality"] = quality
1263
+ break
1264
+
1265
+ found_certification = "Imarticus certification found in Certifications section." if found_imarticus_certification else "No Imarticus certification found in Certifications section."
1266
+ resume_data["found_certification"] = found_certification
1267
+
1268
+ # Experience relevance check
1269
+ Extract_exp_sections = ['WORK EXPERIENCE']
1270
+ experience_text = self.extract_and_format_sections(sections_text, Extract_exp_sections)
1271
+ if experience_text:
1272
+ resume_data["work_experience_check"] = "Experience is relevant to Data science." if any(variation.lower() in experience_text.lower() for keyword, variations in keyword_variations.items() for variation in variations) else "Experience is not relevant to Data science."
1273
+
1274
+ return jsonify(resume_data)
src/reviewer.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import language_tool_python, json, re, logging
2
+ from flask import jsonify
3
+ tool = language_tool_python.LanguageTool('en-US')
4
+ # from pdfminer.high_level import extract_text
5
+
6
+
7
class ResumeReviewer:
    """Review helpers for parsed resumes (currently grammar checking)."""

    def __init__(self):
        pass

    def review(self):
        pass

    def grammar_check(self, parsed_resume):
        """Run a grammar check over the parsed resume's text.

        Args:
            parsed_resume: Mapping that holds the resume body under "text".

        Returns:
            flask.Response: JSON with the original text, the list of detected
            grammar issues, and the auto-corrected text.
        """
        original = parsed_resume["text"]
        matches = tool.check(original)
        corrected = tool.correct(original)
        # One entry per LanguageTool match, flattened into plain dicts.
        issues = [
            {
                "error": m.message,
                "suggested_correction": m.replacements,
                "context": m.context,
                "rule_id": m.ruleId,
            }
            for m in matches
        ]
        return jsonify({
            "original_text": original,
            "grammar_issues": issues,
            "corrected_text": corrected,
        })
src/submitter.py ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from .config import UPLOAD_FOLDER, ALLOWED_EXTENSIONS
3
+ from flask import render_template, redirect, flash, request, url_for
4
+ from werkzeug.utils import secure_filename
5
+
6
class ResumeSubmitter:
    """Handles resume upload: form rendering, validation and file saving."""

    def __init__(self):
        # exist_ok avoids the race/crash when the folder is created between
        # an existence check and makedirs (or already exists).
        os.makedirs(UPLOAD_FOLDER, exist_ok=True)

    def allowed_file(self, filename):
        """Return True if *filename* has an extension in ALLOWED_EXTENSIONS."""
        return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

    def upload_form(self):
        """Render the upload form page."""
        return render_template("upload_resume.html")

    def upload_file(self):
        """Validate and persist the uploaded resume from the current request.

        Returns:
            str: The saved file path on success, or an error message string
            when the request is malformed or the file type is not allowed.
        """
        if 'file' not in request.files:
            return 'No file part'

        file = request.files['file']
        if file.filename == '':
            return 'No selected file'

        if file and self.allowed_file(file.filename):
            # secure_filename strips path components / unsafe characters.
            filename = secure_filename(file.filename)
            file.save(os.path.join(UPLOAD_FOLDER, filename))
            flash('File successfully uploaded')
            # Callers receive the on-disk path of the stored resume.
            return os.path.join(UPLOAD_FOLDER, filename)
        else:
            return "Allowed file types are PDF as of now"