from itertools import cycle
import re

import torch
from transformers import RobertaTokenizerFast, AutoModelForTokenClassification

# Fine-tuned RoBERTa token-classification model for parsing CVs.
# Note: RobertaTokenizerFast only accepts pre-split input (is_split_into_words=True,
# used below) when add_prefix_space=True is set here or in the saved tokenizer config.
tokenizer = RobertaTokenizerFast.from_pretrained("mrfirdauss/robert-base-finetuned-cv")
model = AutoModelForTokenClassification.from_pretrained("mrfirdauss/robert-base-finetuned-cv")

# Mapping from model output ids to BIO entity labels.
id2label = {
    0: 'O',
    1: 'B-NAME', 2: 'I-NAME',
    3: 'B-NATION', 4: 'I-NATION',
    5: 'B-EMAIL', 6: 'I-EMAIL',
    7: 'B-URL', 8: 'I-URL',
    9: 'B-CAMPUS', 10: 'I-CAMPUS',
    11: 'B-MAJOR', 12: 'I-MAJOR',
    13: 'B-COMPANY', 14: 'I-COMPANY',
    15: 'B-DESIGNATION', 16: 'I-DESIGNATION',
    17: 'B-GPA', 18: 'I-GPA',
    19: 'B-PHONE NUMBER', 20: 'I-PHONE NUMBER',
    21: 'B-ACHIEVEMENT', 22: 'I-ACHIEVEMENT',
    23: 'B-EXPERIENCES DESC', 24: 'I-EXPERIENCES DESC',
    25: 'B-SKILLS', 26: 'I-SKILLS',
    27: 'B-PROJECTS', 28: 'I-PROJECTS',
}


def merge_subwords(tokens, labels):
    """Merge RoBERTa byte-pair sub-word pieces back into whole words, keeping
    the label predicted for the first piece of each word."""
    merged_tokens = []
    merged_labels = []
    current_token = ""
    current_label = ""
    for token, label in zip(tokens, labels):
        if token.startswith("Ġ"):
            # 'Ġ' marks the start of a new word: flush the word accumulated so far.
            if current_token:
                merged_tokens.append(current_token)
                merged_labels.append(current_label)
            current_token = token[1:]  # drop the leading 'Ġ'
            current_label = label
        else:
            # Continuation piece: append it to the current word.
            current_token += token
    # Flush the last accumulated word.
    if current_token:
        merged_tokens.append(current_token)
        merged_labels.append(current_label)
    return merged_tokens, merged_labels


def chunked_inference(text, tokenizer, model, max_length=512):
    # Pre-split the text into words and punctuation, then tokenize the whole
    # document without truncation so it can be chunked manually.
    words = re.findall(r'\w+|[^\w\s]', text, re.UNICODE)
    tokens = tokenizer.tokenize(words, is_split_into_words=True)

    # Build chunks of token ids that fit the model's maximum input size;
    # -2 leaves room for the <s> and </s> special tokens.
    input_ids_chunks = []
    for i in range(0, len(tokens), max_length - 2):
        chunk = tokens[i:i + max_length - 2]
        # Encode the chunk and add special tokens via the tokenizer.
        chunk_ids = tokenizer.convert_tokens_to_ids(chunk)
        chunk_ids = tokenizer.build_inputs_with_special_tokens(chunk_ids)
        input_ids_chunks.append(chunk_ids)

    # Turn each chunk into a batch-of-one tensor.
    input_ids_chunks = [torch.tensor(chunk_ids).unsqueeze(0) for chunk_ids in input_ids_chunks]

    predictions = []
    for input_ids in input_ids_chunks:
        attention_mask = torch.ones_like(input_ids)  # no padding, so all ones
        with torch.no_grad():
            output = model(input_ids, attention_mask=attention_mask)
        logits = output[0] if isinstance(output, tuple) else output.logits
        predictions_chunk = torch.argmax(logits, dim=-1).squeeze(0)
        # Drop the predictions for the special tokens at both ends.
        predictions.append(predictions_chunk[1:-1])

    # Flatten the per-chunk predictions and map ids to label strings.
    predictions = torch.cat(predictions, dim=0)
    predicted_labels = [id2label[pred.item()] for pred in predictions]
    return merge_subwords(tokens, predicted_labels)


def process_tokens(tokens, tag_prefix):
    """Collect entities of one type from (token, tag) pairs, joining multi-token
    entities with spaces (GPA and URL pieces are joined without spaces)."""
    entities = []
    current_entity = {}
    for token, tag in tokens:
        if tag.startswith('B-') and tag.endswith(tag_prefix):
            # A new entity starts: store the previous one first.
            if current_entity:
                entities.append(current_entity)
                current_entity = {}
            current_entity['text'] = token
            current_entity['type'] = tag
        elif tag.startswith('I-') and tag.endswith(tag_prefix) and current_entity:
            if tag_prefix in ('GPA', 'URL'):
                # GPA and URL continuations are concatenated directly (e.g. "3" + "." + "8").
                current_entity['text'] += token
            else:
                current_entity['text'] += ' ' + token
    # Store the last entity, if any.
    if current_entity:
        entities.append(current_entity)
    return entities


def predict(text):
    tokens, predictions = chunked_inference(text, tokenizer, model)
    data = list(zip(tokens, predictions))
    profile = {
        "name": "",
        "links": [],
        "skills": [],
        "experiences": [],
        "educations": []
    }
    profile['name'] = ' '.join([t for t, p in data if p.endswith('NAME')])
    for skill in process_tokens(data, 'SKILLS'):
        profile['skills'].append(skill['text'])
    for link in process_tokens(data, 'URL'):
        profile['links'].append(link['text'])

    # Pair designations, companies and experience descriptions; the shorter
    # lists are cycled so every item of the longest list is used.
    exp = process_tokens(data, 'EXPERIENCES DESC')
    designations = process_tokens(data, 'DESIGNATION')
    companies = process_tokens(data, 'COMPANY')
    if len(exp) >= len(designations) and len(exp) >= len(companies):
        workzip = zip(cycle(designations), cycle(companies), exp)
    elif len(designations) >= len(companies):
        workzip = zip(designations, cycle(companies), cycle(exp))
    else:
        workzip = zip(cycle(designations), companies, cycle(exp))
    for designation, company, experience_desc in workzip:
        profile['experiences'].append({
            "start": None,
            "end": None,
            "designation": designation['text'],
            "company": company['text'],
            "experience_description": experience_desc['text']
        })

    # Education entries are paired positionally across the three entity lists.
    for major, gpa, campus in zip(process_tokens(data, 'MAJOR'),
                                  process_tokens(data, 'GPA'),
                                  process_tokens(data, 'CAMPUS')):
        profile['educations'].append({
            "start": None,
            "end": None,
            "major": major['text'],
            "campus": campus['text'],
            "GPA": gpa['text']
        })
    return profile
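
# Minimal usage sketch: the resume text below is illustrative only, and running
# it requires the model checkpoint above to download successfully.
if __name__ == "__main__":
    import json

    sample_cv = (
        "John Doe johndoe@example.com https://github.com/johndoe "
        "Software Engineer at Acme Corp, built data pipelines in Python. "
        "B.Sc. Computer Science, Example University, GPA 3.8. "
        "Skills: Python, PyTorch, SQL."
    )
    print(json.dumps(predict(sample_cv), indent=2))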