Spaces:
Build error
Build error
File size: 11,268 Bytes
c416fc6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 |
import os
import gradio as gr
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import PyPDF2
from typing import Optional, Dict, Callable
import logging
import tempfile
from docx import Document
import subprocess
from odf import text, teletype
from odf.opendocument import load
import mammoth
import textract
from huggingface_hub import InferenceClient
import json
import re
# Set up module-level logging (INFO so extraction-method fallbacks are visible).
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize the Hugging Face Inference Client used by generate_json_from_text.
# NOTE(review): if HF_TOKEN is unset, api_key is None and authenticated API
# calls will fail at request time — confirm the deployment sets this variable.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))
class ResumeExtractor:
    """Extract plain text from resumes in PDF, image, DOCX, DOC and ODT formats.

    Each format has a primary extraction library plus one or more fallbacks
    (ending in OCR or textract), so extraction is best-effort: failures are
    logged and an empty string is returned rather than raising.
    """

    def __init__(self, upload_dir: str = "./uploaded_files"):
        """Initialize the ResumeExtractor with upload directory.

        Args:
            upload_dir: Directory for uploaded files; created if missing.
        """
        self.upload_dir = upload_dir
        self._ensure_upload_dir()
        # Dispatch table mapping a file-type tag (see check_file_type)
        # to the bound extraction method for that format.
        self.supported_formats: Dict[str, Callable[[str], str]] = {
            'pdf': self.extract_text_from_pdf,
            'image': self.extract_text_from_image,
            'docx': self.extract_text_from_docx,
            'doc': self.extract_text_from_doc,
            'odt': self.extract_text_from_odt
        }

    def _ensure_upload_dir(self) -> None:
        """Create upload directory if it doesn't exist."""
        # exist_ok=True avoids the exists()/makedirs() race condition.
        os.makedirs(self.upload_dir, exist_ok=True)

    @staticmethod
    def check_file_type(file_path: str) -> str:
        """Map a file path's extension to a supported file-type tag.

        Args:
            file_path: Path or filename whose extension is inspected
                (case-insensitive).

        Returns:
            One of 'pdf', 'image', 'docx', 'doc' or 'odt'.

        Raises:
            ValueError: If the extension is not supported.
        """
        ext = os.path.splitext(file_path)[-1].lower()
        format_mapping = {
            '.pdf': 'pdf',
            '.jpg': 'image',
            '.jpeg': 'image',
            '.png': 'image',
            '.docx': 'docx',
            '.doc': 'doc',
            '.odt': 'odt'
        }
        try:
            return format_mapping[ext]
        except KeyError:
            raise ValueError(f"Unsupported file type: {ext}") from None

    def extract_text(self, file_path: str, file_type: str) -> str:
        """Extract text using the appropriate method based on file type.

        Args:
            file_path: Path to the file to extract from.
            file_type: Tag returned by check_file_type.

        Raises:
            ValueError: If ``file_type`` has no registered extractor.
        """
        try:
            extractor = self.supported_formats[file_type]
        except KeyError:
            raise ValueError(f"Unsupported format: {file_type}") from None
        return extractor(file_path)

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, trying pdfplumber, PyPDF2, then OCR.

        Returns the first non-blank result; empty string if every method
        fails or yields nothing.
        """
        methods = [
            (self._extract_with_pdfplumber, "pdfplumber"),
            (self._extract_with_pypdf2, "PyPDF2"),
            (self._extract_with_ocr, "OCR")
        ]
        for extract_method, method_name in methods:
            try:
                # Local renamed from `text` to avoid shadowing the
                # module-level `odf.text` import.
                extracted = extract_method(file_path)
                if extracted.strip():
                    logger.info(f"Successfully extracted text using {method_name}")
                    return extracted
                logger.info(f"No text found using {method_name}, trying next method...")
            except Exception as e:
                logger.error(f"Error with {method_name}: {str(e)}")
        return ""

    @staticmethod
    def _extract_with_pdfplumber(file_path: str) -> str:
        """Extract text using pdfplumber (best for digitally-born PDFs)."""
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for image-only pages.
            return ' '.join(page.extract_text() or '' for page in pdf.pages)

    @staticmethod
    def _extract_with_pypdf2(file_path: str) -> str:
        """Extract text using PyPDF2 as a second opinion on the same PDF."""
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            return ' '.join(page.extract_text() or '' for page in reader.pages)

    @staticmethod
    def _extract_with_ocr(file_path: str) -> str:
        """Extract text via OCR: rasterize each page, then run tesseract."""
        images = convert_from_path(file_path)
        return ' '.join(pytesseract.image_to_string(image) for image in images)

    def extract_text_from_image(self, file_path: str) -> str:
        """Extract text from an image file using pytesseract OCR."""
        try:
            with Image.open(file_path) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            logger.error(f"Error extracting text from image: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file.

        Order of attempts: mammoth (better formatting preservation),
        python-docx, then textract as a last resort.
        """
        try:
            # Try using mammoth first for better formatting preservation
            with open(file_path, "rb") as docx_file:
                result = mammoth.extract_raw_text(docx_file)
                if result.value.strip():
                    return result.value
            # Fallback to python-docx if mammoth yields nothing
            doc = Document(file_path)
            return '\n'.join(paragraph.text for paragraph in doc.paragraphs)
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {str(e)}")
            # Final fallback to textract
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""

    def extract_text_from_doc(self, file_path: str) -> str:
        """Extract text from a legacy DOC file (textract, then antiword)."""
        try:
            # Try textract first
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from DOC with textract: {str(e)}")
            try:
                # Fallback to the antiword CLI if available on PATH
                return subprocess.check_output(['antiword', file_path]).decode('utf-8')
            except Exception as e2:
                logger.error(f"Antiword fallback failed: {str(e2)}")
                return ""

    def extract_text_from_odt(self, file_path: str) -> str:
        """Extract text from an ODT file (odfpy paragraphs, then textract)."""
        try:
            textdoc = load(file_path)
            allparas = textdoc.getElementsByType(text.P)
            return '\n'.join(teletype.extractText(para) for para in allparas)
        except Exception as e:
            logger.error(f"Error extracting text from ODT: {str(e)}")
            # Fallback to textract
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""
def extract_text_from_resume(file):
    """Gradio handler: extract text from an uploaded resume and structure it.

    Args:
        file: Gradio file object (exposes a ``.name`` path), or None when
            the button is clicked with no file selected.

    Returns:
        Tuple of (extracted text or error message, word count,
        character count, JSON string of structured resume data).
    """
    # Guard: Gradio passes None when no file has been uploaded.
    if file is None:
        return "No text could be extracted from the file.", 0, 0, "{}"
    extractor = ResumeExtractor()
    try:
        file_type = extractor.check_file_type(file.name)
        extracted_text = extractor.extract_text(file.name, file_type)
        if extracted_text.strip():
            word_count = len(extracted_text.split())
            char_count = len(extracted_text)
            # Structure the raw text via the LLM-backed JSON generator.
            json_data = generate_json_from_text(extracted_text)
            return extracted_text, word_count, char_count, json_data
        return "No text could be extracted from the file.", 0, 0, "{}"
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"An error occurred: {str(e)}", 0, 0, "{}"
def clean_json_string(json_str):
    """Best-effort repair of a near-JSON string emitted by an LLM.

    Fixes applied in order: trim whitespace, ensure surrounding braces,
    convert single quotes to double quotes, quote bare object keys, and
    strip trailing commas.  The result is NOT guaranteed to be valid
    JSON — callers must still handle json.JSONDecodeError.
    """
    # Remove any leading or trailing whitespace
    json_str = json_str.strip()
    # Ensure the string starts with { and ends with }
    if not json_str.startswith('{'):
        json_str = '{' + json_str
    if not json_str.endswith('}'):
        json_str = json_str + '}'
    # Replace single quotes with double quotes (note: this also rewrites
    # apostrophes inside values — accepted trade-off for this heuristic).
    json_str = json_str.replace("'", '"')
    # Quote bare keys only where a key can start (after '{' or ','), so
    # colons inside values such as "https://..." are left untouched.
    # (The old pattern r'(\w+):' corrupted URLs to ""https"://...".)
    json_str = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', json_str)
    # Remove trailing commas before a closing brace OR bracket.
    json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
    return json_str
def generate_json_from_text(text):
    """Ask the LLM to structure raw resume text; return pretty-printed JSON.

    On parse failure returns a warning object carrying the cleaned raw
    text; on API failure returns an error object.  Always returns a
    JSON-formatted string.
    """
    prompt = f"""
Given the following resume text, create a JSON object that organizes the information into relevant categories. Include fields for personal information, objective, education, experience, skills, and any other relevant sections. If information for a field is not provided, use "NOT PROVIDED" as the value.
Resume text:
{text}
Generate the JSON response:
"""
    try:
        raw_output = client.text_generation(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            prompt=prompt,
            max_new_tokens=1000,
            temperature=0.1
        )
        # Keep only the outermost {...} span of the model's output.
        brace_open = raw_output.find('{')
        brace_close = raw_output.rfind('}') + 1
        candidate = clean_json_string(raw_output[brace_open:brace_close])
        # Round-trip through the parser to validate and pretty-print.
        try:
            return json.dumps(json.loads(candidate), indent=2)
        except json.JSONDecodeError as parse_err:
            logger.error(f"Error parsing JSON after cleaning (lack of infos): {str(parse_err)}")
            return json.dumps({"Warning": "Not all data fetchable", "raw_text": candidate}, indent=2)
    except Exception as api_err:
        logger.error(f"Error generating JSON: {str(api_err)}")
        return json.dumps({"error": str(api_err)}, indent=2)
# Custom CSS for better aesthetics
custom_css = """
#component-0 { max-width: 800px; margin: auto; }
.gradio-container { font-family: 'Arial', sans-serif; }
.uploadbuttonwrap { background-color: #f0f0f0; border-radius: 10px; padding: 20px; }
.uploadbuttonwrap label { background-color: #4CAF50; color: white; padding: 10px 15px; border-radius: 5px; cursor: pointer; }
.uploadbuttonwrap label:hover { background-color: #45a049; }
.output-markdown { background-color: #f9f9f9; border: 1px solid #ddd; border-radius: 5px; padding: 15px; }
.output-html { max-height: 400px; overflow-y: auto; }
"""
# Define the Gradio interface with improved aesthetics
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(
        """
# 📄 Resume Text Extractor and Analyzer
Upload your resume (PDF, DOC, DOCX, ODT, JPG, or PNG) to extract the text content and generate structured data.
"""
    )
    # Row 1: file picker; Row 2: the action button.
    with gr.Row():
        file_input = gr.File(label="Upload Resume")
    with gr.Row():
        extract_button = gr.Button("Extract and Analyze", variant="primary")
    # Row 3: extracted text (wide) next to the word/char counters (narrow).
    with gr.Row():
        with gr.Column(scale=2):
            text_output = gr.Textbox(label="Extracted Text", lines=10)
        with gr.Column(scale=1):
            word_count = gr.Number(label="Word Count")
            char_count = gr.Number(label="Character Count")
    # Row 4: LLM-structured resume data as a JSON viewer.
    with gr.Row():
        json_output = gr.JSON(label="Structured Resume Data")
    # Wire the button to the end-to-end extraction + analysis pipeline.
    extract_button.click(
        fn=extract_text_from_resume,
        inputs=[file_input],
        outputs=[text_output, word_count, char_count, json_output]
    )
    gr.Markdown(
        """
### How it works
1. Upload your resume file
2. Click "Extract and Analyze"
3. View the extracted text and structured data
This tool uses advanced NLP techniques to parse your resume and provide insights.
"""
    )
# share=True exposes a public Gradio tunnel URL in addition to localhost.
iface.launch(share=True)
|