# Source: Hugging Face Space upload by ShreehariS754 (commit c416fc6, verified).
import os
import gradio as gr
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import PyPDF2
from typing import Optional, Dict, Callable
import logging
import tempfile
from docx import Document
import subprocess
from odf import text, teletype
from odf.opendocument import load
import mammoth
import textract
from huggingface_hub import InferenceClient
import json
import re
# Set up module-level logging (INFO and above go to the root handler).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize the Hugging Face Inference Client.
# NOTE(review): assumes HF_TOKEN is set in the environment; if it is missing,
# api_key is None and authenticated API calls will fail at request time.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))
class ResumeExtractor:
    """Extract plain text from resume files in several formats.

    Dispatches on file extension to a format-specific extraction method.
    Each format has layered fallbacks (e.g. pdfplumber -> PyPDF2 -> OCR for
    PDFs); extraction failures are logged and an empty string is returned.
    """

    def __init__(self, upload_dir: str = "./uploaded_files"):
        """Initialize the ResumeExtractor and ensure ``upload_dir`` exists."""
        self.upload_dir = upload_dir
        self._ensure_upload_dir()
        # Map canonical file-type keys to their bound extraction methods.
        self.supported_formats: Dict[str, Callable[[str], str]] = {
            'pdf': self.extract_text_from_pdf,
            'image': self.extract_text_from_image,
            'docx': self.extract_text_from_docx,
            'doc': self.extract_text_from_doc,
            'odt': self.extract_text_from_odt
        }

    def _ensure_upload_dir(self) -> None:
        """Create upload directory if it doesn't exist."""
        # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
        os.makedirs(self.upload_dir, exist_ok=True)

    @staticmethod
    def check_file_type(file_path: str) -> str:
        """Return the canonical file-type key for ``file_path``.

        Raises:
            ValueError: if the extension is not one of the supported formats.
        """
        ext = os.path.splitext(file_path)[-1].lower()
        format_mapping = {
            '.pdf': 'pdf',
            '.jpg': 'image',
            '.jpeg': 'image',
            '.png': 'image',
            '.docx': 'docx',
            '.doc': 'doc',
            '.odt': 'odt'
        }
        if ext in format_mapping:
            return format_mapping[ext]
        raise ValueError(f"Unsupported file type: {ext}")

    def extract_text(self, file_path: str, file_type: str) -> str:
        """Extract text using the appropriate method for ``file_type``.

        Raises:
            ValueError: if ``file_type`` is not a supported format key.
        """
        if file_type not in self.supported_formats:
            raise ValueError(f"Unsupported format: {file_type}")
        return self.supported_formats[file_type](file_path)

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, trying progressively heavier methods.

        Order: pdfplumber, then PyPDF2, then OCR on rendered pages.
        Returns an empty string if every method fails or yields no text.
        """
        methods = [
            (self._extract_with_pdfplumber, "pdfplumber"),
            (self._extract_with_pypdf2, "PyPDF2"),
            (self._extract_with_ocr, "OCR")
        ]
        for extract_method, method_name in methods:
            try:
                text = extract_method(file_path)
                if text.strip():
                    logger.info(f"Successfully extracted text using {method_name}")
                    return text
                logger.info(f"No text found using {method_name}, trying next method...")
            except Exception as e:
                logger.error(f"Error with {method_name}: {str(e)}")
        return ""

    @staticmethod
    def _extract_with_pdfplumber(file_path: str) -> str:
        """Extract text using pdfplumber (best for digitally-authored PDFs)."""
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for image-only pages; coalesce to ''.
            return ' '.join(page.extract_text() or '' for page in pdf.pages)

    @staticmethod
    def _extract_with_pypdf2(file_path: str) -> str:
        """Extract text using PyPDF2 as a fallback parser."""
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            return ' '.join(page.extract_text() or '' for page in reader.pages)

    @staticmethod
    def _extract_with_ocr(file_path: str) -> str:
        """Extract text by rendering pages to images and running Tesseract OCR."""
        images = convert_from_path(file_path)
        return ' '.join(pytesseract.image_to_string(image) for image in images)

    def extract_text_from_image(self, file_path: str) -> str:
        """Extract text from an image file using pytesseract OCR."""
        try:
            with Image.open(file_path) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            logger.error(f"Error extracting text from image: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file (mammoth -> python-docx -> textract)."""
        try:
            # Try mammoth first for better formatting preservation.
            with open(file_path, "rb") as docx_file:
                result = mammoth.extract_raw_text(docx_file)
                text = result.value
                if text.strip():
                    return text
            # Fallback to python-docx if mammoth yields nothing.
            doc = Document(file_path)
            return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {str(e)}")
            # Final fallback to textract.
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""

    def extract_text_from_doc(self, file_path: str) -> str:
        """Extract text from a legacy DOC file (textract -> antiword)."""
        try:
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from DOC with textract: {str(e)}")
            try:
                # Fallback to the external 'antiword' binary if installed.
                return subprocess.check_output(['antiword', file_path]).decode('utf-8')
            except Exception as e2:
                logger.error(f"Antiword fallback failed: {str(e2)}")
                return ""

    def extract_text_from_odt(self, file_path: str) -> str:
        """Extract text from an ODT file (odfpy -> textract)."""
        try:
            textdoc = load(file_path)
            allparas = textdoc.getElementsByType(text.P)
            return '\n'.join([teletype.extractText(para) for para in allparas])
        except Exception as e:
            logger.error(f"Error extracting text from ODT: {str(e)}")
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""
def extract_text_from_resume(file):
    """Extract text from an uploaded resume and derive basic stats plus structured JSON.

    Returns a 4-tuple: (extracted text or message, word count, char count, JSON string).
    Any failure is reported in the first slot with zeroed counts.
    """
    extractor = ResumeExtractor()
    try:
        detected_type = extractor.check_file_type(file.name)
        raw_text = extractor.extract_text(file.name, detected_type)
        if not raw_text.strip():
            return "No text could be extracted from the file.", 0, 0, "{}"
        # Structure the raw text via the Hugging Face API.
        structured = generate_json_from_text(raw_text)
        return raw_text, len(raw_text.split()), len(raw_text), structured
    except Exception as e:
        return f"An error occurred: {str(e)}", 0, 0, "{}"
def clean_json_string(json_str):
    """Best-effort repair of a near-JSON string produced by an LLM.

    Heuristics applied (in order): trim whitespace, ensure enclosing braces,
    swap single quotes for double quotes, quote bare object keys, and strip
    trailing commas. These are crude text transforms — values containing
    apostrophes or colons can still be mangled — but they recover the common
    failure modes of model output.
    """
    # Remove any leading or trailing whitespace.
    json_str = json_str.strip()
    # Ensure the string starts with { and ends with }.
    if not json_str.startswith('{'):
        json_str = '{' + json_str
    if not json_str.endswith('}'):
        json_str = json_str + '}'
    # Replace any single quotes with double quotes.
    json_str = json_str.replace("'", '"')
    # Add quotes to bare keys (already-quoted keys don't match: a '"' sits
    # between the word and the colon).
    json_str = re.sub(r'(\w+):', r'"\1":', json_str)
    # Remove trailing commas before a closing brace OR bracket. The original
    # only handled '}', so arrays like [1, 2,] still failed json.loads.
    json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
    return json_str
def generate_json_from_text(text):
    """Ask the LLM to organize resume text into a JSON object.

    Returns a pretty-printed JSON string. If the model output cannot be
    parsed even after cleaning, returns a warning object carrying the raw
    cleaned text; on any other failure, returns an error object.
    """
    prompt = f"""
Given the following resume text, create a JSON object that organizes the information into relevant categories. Include fields for personal information, objective, education, experience, skills, and any other relevant sections. If information for a field is not provided, use "NOT PROVIDED" as the value.
Resume text:
{text}
Generate the JSON response:
"""
    try:
        response = client.text_generation(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            prompt=prompt,
            max_new_tokens=1000,
            temperature=0.1
        )
        # Slice out the outermost {...} span of the model's reply.
        start = response.find('{')
        end = response.rfind('}') + 1
        candidate = clean_json_string(response[start:end])
        try:
            # Round-trip through json to validate and pretty-print.
            return json.dumps(json.loads(candidate), indent=2)
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON after cleaning (lack of infos): {str(e)}")
            return json.dumps({"Warning": "Not all data fetchable", "raw_text": candidate}, indent=2)
    except Exception as e:
        logger.error(f"Error generating JSON: {str(e)}")
        return json.dumps({"error": str(e)}, indent=2)
# Custom CSS injected into the Gradio Blocks layout: centers the app,
# styles the upload button, and constrains the scrollable output areas.
custom_css = """
#component-0 { max-width: 800px; margin: auto; }
.gradio-container { font-family: 'Arial', sans-serif; }
.uploadbuttonwrap { background-color: #f0f0f0; border-radius: 10px; padding: 20px; }
.uploadbuttonwrap label { background-color: #4CAF50; color: white; padding: 10px 15px; border-radius: 5px; cursor: pointer; }
.uploadbuttonwrap label:hover { background-color: #45a049; }
.output-markdown { background-color: #f9f9f9; border: 1px solid #ddd; border-radius: 5px; padding: 15px; }
.output-html { max-height: 400px; overflow-y: auto; }
"""
# Build the Gradio UI: a file upload, an action button, and three output
# areas (raw text, counts, structured JSON), wired to extract_text_from_resume.
with gr.Blocks(css=custom_css) as demo:
    gr.Markdown(
        """
# 📄 Resume Text Extractor and Analyzer
Upload your resume (PDF, DOC, DOCX, ODT, JPG, or PNG) to extract the text content and generate structured data.
"""
    )
    with gr.Row():
        resume_file = gr.File(label="Upload Resume")
    with gr.Row():
        analyze_btn = gr.Button("Extract and Analyze", variant="primary")
    with gr.Row():
        with gr.Column(scale=2):
            extracted_text_box = gr.Textbox(label="Extracted Text", lines=10)
        with gr.Column(scale=1):
            words_out = gr.Number(label="Word Count")
            chars_out = gr.Number(label="Character Count")
    with gr.Row():
        structured_out = gr.JSON(label="Structured Resume Data")
    # Route the button click through the extraction pipeline.
    analyze_btn.click(
        fn=extract_text_from_resume,
        inputs=[resume_file],
        outputs=[extracted_text_box, words_out, chars_out, structured_out]
    )
    gr.Markdown(
        """
### How it works
1. Upload your resume file
2. Click "Extract and Analyze"
3. View the extracted text and structured data
This tool uses advanced NLP techniques to parse your resume and provide insights.
"""
    )
demo.launch(share=True)