Spaces:
Running
Running
| from flashtext import KeywordProcessor | |
| import json | |
| import nltk | |
| from nltk.tokenize import word_tokenize,LineTokenizer | |
| from utils import get_average_words_per_line, get_average_line_len | |
| import wordninja | |
| nltk.download('punkt') | |
class ResumeSegmenter():
    """Split a resume, given as a list of text lines, into header-delimited sections.

    Section headers are detected with a flashtext ``KeywordProcessor`` loaded
    from the ``"section_headers"`` entry of a JSON file. Detected header line
    indices are stored on the instance (``resume_indices`` and
    ``resume_segments``) and then converted into ``(start, end)`` line ranges.
    """

    # Categories tracked for every resume, in the order they are reported.
    # NOTE(review): presumably these match the category names used as keys of
    # "section_headers" in sections.json - confirm against that file.
    SEGMENT_NAMES = (
        'objective',
        'work_and_employment',
        'education_and_training',
        'skills',
        'accomplishments',
        'misc',
    )

    def __init__(self, sections_path=r"./sections.json"):
        """Load the header keywords and start with empty per-resume state.

        Args:
            sections_path: Path of the JSON file whose ``"section_headers"``
                entry is passed to ``KeywordProcessor.add_keywords_from_dict``.
                Defaults to the path that was previously hard-coded.
        """
        self._reset_state()
        # The ``with`` block closes the file; no explicit close() is needed.
        with open(sections_path, encoding="utf-8") as f:
            data = json.load(f)
        self.section_headers = data["section_headers"]
        self.keyword_processor = KeywordProcessor()
        self.keyword_processor.add_keywords_from_dict(keyword_dict=self.section_headers)

    def _reset_state(self):
        """Clear the header indices collected for the previous resume."""
        # category -> indices of the lines recognised as a header of that category
        self.resume_segments = {name: [] for name in self.SEGMENT_NAMES}
        # indices of every recognised header line, in document order
        self.resume_indices = []

    def find_segment_indices(self, text_list):
        """Record which lines of ``text_list`` look like section headers.

        The per-resume state is reset first, so one instance can process
        several resumes without indices leaking from one to the next.

        Side effects:
            * fills ``self.resume_indices`` and ``self.resume_segments``;
            * a line whose re-tokenized form contains a header keyword is
              replaced in ``text_list`` by that space-joined tokenization.

        Args:
            text_list: Mutable list of resume lines (strings).
        """
        self._reset_state()
        average_words_per_line = get_average_words_per_line(text_list)
        average_sentence_length = get_average_line_len(text_list)

        for i, line in enumerate(text_list):
            # Empty lines cannot be headers; skipping them also avoids an
            # IndexError on line[0] / line[-1] below.
            if not line:
                continue
            # Lines starting in lower case or ending with a period are not
            # treated as headers.
            if line[0].islower() or line[-1] == '.':
                continue

            normalized = ' '.join(word_tokenize(line))
            kys = self.keyword_processor.extract_keywords(normalized)
            if kys:
                # The tokenized form matched: keep it as the canonical line.
                text_list[i] = line = normalized
            else:
                kys = self.keyword_processor.extract_keywords(line)
            if not kys:
                continue

            # A line that is both wordy and long relative to the document
            # averages is not accepted as a header even if it has a keyword.
            if (len(word_tokenize(line)) > average_words_per_line * 0.75
                    and len(line) > average_sentence_length):
                continue

            self.resume_indices.append(i)
            # Only the first extracted keyword decides the category.
            self.resume_segments[kys[0]].append(i)

    def slice_segments(self, lines):
        """Turn the recorded header indices into ``(start, end)`` line ranges.

        A category starts at its first header and ends at the header that
        follows its last header. If its last header is the final header of the
        resume, it runs to the end of ``lines``. Ranges of different
        categories may overlap when a category's headers are interleaved with
        others.

        Args:
            lines: The resume lines; only ``len(lines)`` is used.

        Returns:
            ``None`` when no header was recorded, otherwise a dict mapping each
            category that has at least one header to its ``(start, end)``
            tuple, plus ``"basics_info"`` covering the lines before the first
            header.
        """
        if not self.resume_indices:
            return None

        final_header = self.resume_indices[-1]
        sections = {}
        for section, points in self.resume_segments.items():
            if not points:
                continue
            start_point, last_point = points[0], points[-1]
            if last_point == final_header:
                # Nothing follows the final header, so take everything up to
                # the end of the document. Checking the category's *last*
                # header (not its first) keeps the trailing content when the
                # category also appeared earlier in the resume.
                end_point = len(lines)
            else:
                next_position = self.resume_indices.index(last_point) + 1
                end_point = self.resume_indices[next_position]
            sections[section] = (start_point, end_point)

        sections["basics_info"] = (0, self.resume_indices[0])
        return sections

    def get_interval_intersection(self, sections, interval):
        """Find the first section whose range overlaps ``interval``.

        Args:
            sections: Iterable of items whose element ``[1]`` is a
                ``(start, end)`` pair, e.g. ``(name, (start, end))`` tuples.
            interval: ``(start, end)`` pair to compare against.

        Returns:
            ``([start, end], section)`` for the first overlapping section,
            where ``[start, end]`` is the overlap, or ``None`` when no section
            overlaps. Ranges that merely touch do not count as overlapping.
        """
        for section in sections:
            s = section[1]
            if s[0] >= interval[1] or interval[0] >= s[1]:
                # Disjoint from this section: keep looking at the others
                # instead of giving up after the first element.
                continue
            start = max(s[0], interval[0])
            end = min(s[1], interval[1])
            return [start, end], section
        return None

    def segment(self, resume_lines):
        """Detect headers in ``resume_lines`` and return the section ranges.

        Args:
            resume_lines: Mutable list of resume lines; it may be modified in
                place by ``find_segment_indices``.

        Returns:
            The dict produced by ``slice_segments``, or ``None`` when no
            section header was found.
        """
        self.find_segment_indices(resume_lines)
        # slice_segments returns None exactly when no header was found.
        # TODO: resolve overlapping section ranges (idea noted by the original
        # author: a zero-shot classifier plus interval subtraction);
        # get_interval_intersection can be used to detect the overlaps.
        return self.slice_segments(resume_lines)

    def get_parsed_sections(self, resume_lines):
        """Segment a resume and return the text of every section.

        Args:
            resume_lines: Mutable list of resume lines.

        Returns:
            ``(text_segments, sections)`` where ``sections`` maps a section
            name to its ``(start, end)`` range and ``text_segments`` maps the
            same name to ``resume_lines[start:end]``. Returns ``(None, None)``
            when no section header was found.
        """
        sections = self.segment(resume_lines)
        if sections is None:
            return None, None
        text_segments = {
            header_title: resume_lines[bounds[0]:bounds[1]]
            for header_title, bounds in sections.items()
        }
        return text_segments, sections