| | import os |
| | import torch |
| | from transformers import LayoutLMv3ForTokenClassification, LayoutLMv3FeatureExtractor, LayoutLMv3Tokenizer |
| | from PIL import Image |
| | import pytesseract |
| | from pdf2image import convert_from_path |
| | import re |
| |
|
| | |
| | |
| |
|
| | |
# Point pytesseract at the tesseract binary (Linux default install path).
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Load the base LayoutLMv3 checkpoint with a 5-class token-classification head.
# NOTE(review): on "microsoft/layoutlmv3-base" the classification head is
# randomly initialized (only the backbone is pretrained), so predictions are
# meaningless without fine-tuning — confirm a fine-tuned checkpoint is intended.
model_name = "microsoft/layoutlmv3-base"
model = LayoutLMv3ForTokenClassification.from_pretrained(model_name, num_labels=5)
feature_extractor = LayoutLMv3FeatureExtractor.from_pretrained(model_name)
tokenizer = LayoutLMv3Tokenizer.from_pretrained(model_name)

# Label vocabulary: id 0 is the "outside" (no-entity) tag; the rest are the
# resume entity classes extracted by process_resume().
id2label = {0: "O", 1: "COMPANY", 2: "EDUCATION", 3: "POSITION", 4: "DATE"}
label2id = {v: k for k, v in id2label.items()}
| |
|
def preprocess_document(file_path):
    """OCR a resume file into words and pixel-space bounding boxes.

    Args:
        file_path: Path to a PDF (only the first page is used) or an
            image file readable by PIL.

    Returns:
        Tuple ``(image, words, boxes)``: the RGB PIL image, the list of
        OCR'd words (blank tokens removed), and a parallel list of
        ``[x0, y0, x1, y1]`` pixel boxes.
    """
    if file_path.lower().endswith('.pdf'):
        # Only the first page is processed; multi-page resumes would need
        # a loop here.
        image = convert_from_path(file_path)[0]
    else:
        image = Image.open(file_path)
    # BUG FIX: normalize to 3-channel RGB. RGBA PNGs or grayscale scans
    # otherwise break downstream feature extraction.
    image = image.convert("RGB")

    ocr_result = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    words = []
    boxes = []
    # BUG FIX: tesseract's image_to_data emits empty-string rows for
    # structural elements (pages, blocks, lines). The original kept them,
    # polluting the token stream with blank "words" whose boxes span whole
    # regions. Keep only real text tokens, and keep words/boxes parallel.
    for i, text in enumerate(ocr_result['text']):
        if not text.strip():
            continue
        x = ocr_result['left'][i]
        y = ocr_result['top'][i]
        w = ocr_result['width'][i]
        h = ocr_result['height'][i]
        words.append(text)
        boxes.append([x, y, x + w, y + h])

    return image, words, boxes
| |
|
def process_resume(file_path):
    """Run LayoutLMv3 token classification over a resume and group the
    predicted labels into contiguous entity strings.

    Args:
        file_path: Path to a PDF or image resume.

    Returns:
        dict mapping each entity label ("COMPANY", "EDUCATION",
        "POSITION", "DATE") to a list of extracted text spans.
    """
    image, words, boxes = preprocess_document(file_path)

    # Drop empty OCR tokens defensively (the tokenizer rejects empty words);
    # keep words and boxes parallel.
    pairs = [(w, b) for w, b in zip(words, boxes) if w.strip()]
    parsed_info = {"COMPANY": [], "EDUCATION": [], "POSITION": [], "DATE": []}
    if not pairs:
        return parsed_info
    words, boxes = map(list, zip(*pairs))

    # BUG FIX: LayoutLMv3 requires boxes normalized to a 0-1000 scale;
    # the original passed raw pixel coordinates. Clamp to [0, 1000] in
    # case OCR boxes slightly overrun the image edge.
    width, height = image.size
    norm_boxes = [
        [
            min(max(int(1000 * x0 / width), 0), 1000),
            min(max(int(1000 * y0 / height), 0), 1000),
            min(max(int(1000 * x1 / width), 0), 1000),
            min(max(int(1000 * y1 / height), 0), 1000),
        ]
        for x0, y0, x1, y1 in boxes
    ]

    # BUG FIX: the feature extractor only produces pixel_values — words and
    # boxes must go through the tokenizer. The original passed words/boxes
    # to the feature extractor, which raises a TypeError.
    encoding = tokenizer(text=words, boxes=norm_boxes,
                         truncation=True, return_tensors="pt")
    pixel_values = feature_extractor(image.convert("RGB"),
                                     return_tensors="pt")["pixel_values"]

    # Inference only — no gradients needed.
    with torch.no_grad():
        outputs = model(input_ids=encoding["input_ids"],
                        attention_mask=encoding["attention_mask"],
                        bbox=encoding["bbox"],
                        pixel_values=pixel_values)

    token_predictions = outputs.logits.argmax(-1).squeeze(0).tolist()

    # BUG FIX: predictions are per sub-word token, not per word, so zipping
    # them directly against `words` misaligns labels. Map each word to the
    # prediction of its first sub-token via word_ids(); special tokens map
    # to None and are skipped. Unseen words (truncated) default to "O" (0).
    word_ids = encoding.word_ids(0)
    predictions = [0] * len(words)
    seen = set()
    for idx, wid in enumerate(word_ids):
        if wid is not None and wid not in seen:
            seen.add(wid)
            predictions[wid] = token_predictions[idx]

    # Group consecutive same-label words into entity spans.
    current_entity = None
    current_text = ""
    for word, label_id in zip(words, predictions):
        if label_id != 0:
            label = id2label[label_id]
            if label != current_entity:
                if current_entity:
                    parsed_info[current_entity].append(current_text.strip())
                current_entity = label
                current_text = word + " "
            else:
                current_text += word + " "
        else:
            if current_entity:
                parsed_info[current_entity].append(current_text.strip())
            current_entity = None
            current_text = ""

    # BUG FIX: flush a trailing entity — the original dropped an entity
    # that ran to the end of the document.
    if current_entity:
        parsed_info[current_entity].append(current_text.strip())

    return parsed_info
| |
|
def main():
    """CLI entry point: prompt for a resume path and print the fields
    extracted by process_resume()."""
    resume_path = input("Enter the path to your resume file (PDF or image): ")
    if not os.path.exists(resume_path):
        print("File not found. Please check the path and try again.")
        return

    parsed_info = process_resume(resume_path)

    # Table-driven report: one (heading, entity-key) pair per output line.
    sections = (
        ("Companies worked for:", "COMPANY"),
        ("Education:", "EDUCATION"),
        ("Positions held:", "POSITION"),
        ("Relevant dates:", "DATE"),
    )
    print("\nExtracted Information:")
    for heading, key in sections:
        print(heading, ", ".join(parsed_info[key]))
| |
|
| | if __name__ == "__main__": |
| | main() |