Spaces:
Build error
Build error
| import os | |
| import gradio as gr | |
| import pdfplumber | |
| from pdf2image import convert_from_path | |
| from PIL import Image | |
| import pytesseract | |
| import PyPDF2 | |
| from typing import Optional, Dict, Callable | |
| import logging | |
| import tempfile | |
| from docx import Document | |
| import subprocess | |
| from odf import text, teletype | |
| from odf.opendocument import load | |
| import mammoth | |
| import textract | |
| from huggingface_hub import InferenceClient | |
| import json | |
| import re | |
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# Initialize the Hugging Face Inference Client
# NOTE(review): reads the HF_TOKEN environment variable; if it is unset,
# api_key is None and authenticated inference calls will fail at request time.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))
class ResumeExtractor:
    """Extract plain text from resume files in several formats.

    Supported formats: PDF, images (JPG/JPEG/PNG), DOCX, DOC, and ODT.
    Each extractor tries a primary library first and falls back to
    alternatives; on total failure it returns "" rather than raising.
    """

    def __init__(self, upload_dir: str = "./uploaded_files"):
        """Initialize the ResumeExtractor with upload directory."""
        self.upload_dir = upload_dir
        self._ensure_upload_dir()
        # Dispatch table: canonical file-type key -> bound extraction method.
        self.supported_formats: Dict[str, Callable[[str], str]] = {
            'pdf': self.extract_text_from_pdf,
            'image': self.extract_text_from_image,
            'docx': self.extract_text_from_docx,
            'doc': self.extract_text_from_doc,
            'odt': self.extract_text_from_odt
        }

    def _ensure_upload_dir(self) -> None:
        """Create upload directory if it doesn't exist."""
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists() guard.
        os.makedirs(self.upload_dir, exist_ok=True)

    def check_file_type(self, file_path: str) -> str:
        """Return the canonical file-type key for *file_path*'s extension.

        Raises:
            ValueError: if the extension is not one of the supported formats.
        """
        # BUG FIX: the original definition was missing `self`, so calling
        # extractor.check_file_type(path) raised TypeError.
        ext = os.path.splitext(file_path)[-1].lower()
        format_mapping = {
            '.pdf': 'pdf',
            '.jpg': 'image',
            '.jpeg': 'image',
            '.png': 'image',
            '.docx': 'docx',
            '.doc': 'doc',
            '.odt': 'odt'
        }
        if ext in format_mapping:
            return format_mapping[ext]
        raise ValueError(f"Unsupported file type: {ext}")

    def extract_text(self, file_path: str, file_type: str) -> str:
        """Extract text using appropriate method based on file type."""
        if file_type not in self.supported_formats:
            raise ValueError(f"Unsupported format: {file_type}")
        return self.supported_formats[file_type](file_path)

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from PDF, trying pdfplumber, PyPDF2, then OCR.

        Returns the first non-blank result; "" if every method fails.
        """
        methods = [
            (self._extract_with_pdfplumber, "pdfplumber"),
            (self._extract_with_pypdf2, "PyPDF2"),
            (self._extract_with_ocr, "OCR")
        ]
        for extract_method, method_name in methods:
            try:
                text = extract_method(file_path)
                if text.strip():
                    logger.info(f"Successfully extracted text using {method_name}")
                    return text
                logger.info(f"No text found using {method_name}, trying next method...")
            except Exception as e:
                # Log and fall through to the next extraction strategy.
                logger.error(f"Error with {method_name}: {str(e)}")
        return ""

    def _extract_with_pdfplumber(self, file_path: str) -> str:
        """Extract text using pdfplumber."""
        # BUG FIX: original was missing `self`, so the bound-method call in
        # extract_text_from_pdf raised TypeError (two args for one parameter).
        with pdfplumber.open(file_path) as pdf:
            return ' '.join(page.extract_text() or '' for page in pdf.pages)

    def _extract_with_pypdf2(self, file_path: str) -> str:
        """Extract text using PyPDF2."""
        # BUG FIX: original was missing `self` (same TypeError as above).
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            return ' '.join(page.extract_text() or '' for page in reader.pages)

    def _extract_with_ocr(self, file_path: str) -> str:
        """Extract text by rendering PDF pages to images and running OCR."""
        # BUG FIX: original was missing `self` (same TypeError as above).
        images = convert_from_path(file_path)
        return ' '.join(pytesseract.image_to_string(image) for image in images)

    def extract_text_from_image(self, file_path: str) -> str:
        """Extract text from image using pytesseract; "" on failure."""
        try:
            with Image.open(file_path) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            logger.error(f"Error extracting text from image: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from DOCX via mammoth, python-docx, then textract."""
        try:
            # Try using mammoth first for better formatting preservation
            with open(file_path, "rb") as docx_file:
                result = mammoth.extract_raw_text(docx_file)
                text = result.value
                if text.strip():
                    return text
            # Fallback to python-docx if mammoth produced nothing
            doc = Document(file_path)
            return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {str(e)}")
            # Final fallback to textract
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""

    def extract_text_from_doc(self, file_path: str) -> str:
        """Extract text from legacy DOC via textract, then antiword."""
        try:
            # Try textract first
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from DOC with textract: {str(e)}")
            try:
                # Fallback to the antiword CLI if available on PATH
                return subprocess.check_output(['antiword', file_path]).decode('utf-8')
            except Exception as e2:
                logger.error(f"Antiword fallback failed: {str(e2)}")
                return ""

    def extract_text_from_odt(self, file_path: str) -> str:
        """Extract text from ODT via odfpy, falling back to textract."""
        try:
            textdoc = load(file_path)
            # `text` here is the odf.text module imported at file top.
            allparas = textdoc.getElementsByType(text.P)
            return '\n'.join([teletype.extractText(para) for para in allparas])
        except Exception as e:
            logger.error(f"Error extracting text from ODT: {str(e)}")
            # Fallback to textract
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""
def extract_text_from_resume(file):
    """Gradio click handler: extract and analyze an uploaded resume.

    Args:
        file: the gradio File payload (an object with a .name path), or
            None when the button is clicked with no upload.

    Returns:
        Tuple of (extracted_text, word_count, char_count, json_data) where
        json_data is a JSON string for the gr.JSON output.
    """
    # BUG FIX: clicking "Extract and Analyze" with no file uploaded passes
    # None, and file.name raised AttributeError into the generic handler.
    if file is None:
        return "Please upload a file first.", 0, 0, "{}"
    extractor = ResumeExtractor()
    try:
        file_type = extractor.check_file_type(file.name)
        extracted_text = extractor.extract_text(file.name, file_type)
        if not extracted_text.strip():
            return "No text could be extracted from the file.", 0, 0, "{}"
        word_count = len(extracted_text.split())
        char_count = len(extracted_text)
        # Generate structured JSON using the Hugging Face API
        json_data = generate_json_from_text(extracted_text)
        return extracted_text, word_count, char_count, json_data
    except Exception as e:
        # Surface the failure in the text output rather than crashing the UI.
        return f"An error occurred: {str(e)}", 0, 0, "{}"
def clean_json_string(json_str):
    """Best-effort repair of a JSON-ish string produced by an LLM.

    Heuristics applied: wrap in braces if missing, normalize single quotes
    to double quotes, add quotes around bare object keys, and strip
    trailing commas. The result is still only *likely* valid JSON; the
    caller must handle json.JSONDecodeError.
    """
    # Remove any leading or trailing whitespace
    json_str = json_str.strip()
    # Ensure the string starts with { and ends with }
    if not json_str.startswith('{'):
        json_str = '{' + json_str
    if not json_str.endswith('}'):
        json_str = json_str + '}'
    # Replace any single quotes with double quotes (heuristic; may corrupt
    # apostrophes inside values, but matches the model's usual output)
    json_str = json_str.replace("'", '"')
    # BUG FIX: quote bare keys only where a key can appear (right after
    # '{' or ','). The original pattern r'(\w+):' also matched inside
    # string values, mangling e.g. "12:30" into "12":30" and
    # "http://..." into ""http"://...".
    json_str = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', json_str)
    # Remove trailing commas before closing braces AND brackets
    # (the original only handled '}').
    json_str = re.sub(r',\s*([}\]])', r'\1', json_str)
    return json_str
def generate_json_from_text(text):
    """Ask the hosted LLM to structure resume text into JSON.

    Args:
        text: raw resume text extracted from the uploaded file.

    Returns:
        A pretty-printed JSON string. Never raises: parse failures return a
        {"Warning": ...} object and API failures return an {"error": ...}
        object, so the gr.JSON output always receives valid JSON.
    """
    prompt = f"""
Given the following resume text, create a JSON object that organizes the information into relevant categories. Include fields for personal information, objective, education, experience, skills, and any other relevant sections. If information for a field is not provided, use "NOT PROVIDED" as the value.
Resume text:
{text}
Generate the JSON response:
"""
    try:
        response = client.text_generation(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            prompt=prompt,
            max_new_tokens=1000,
            temperature=0.1
        )
        # Extract the JSON part from the model's free-form reply
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        # BUG FIX: if the reply contains no '{', find() returns -1 and the
        # original slice produced garbage; report the failure explicitly.
        if json_start == -1 or json_end <= json_start:
            return json.dumps({"error": "No JSON object found in model response",
                               "raw_text": response}, indent=2)
        json_str = response[json_start:json_end]
        # Clean and fix the JSON string
        cleaned_json_str = clean_json_string(json_str)
        # Parse and re-serialize for consistent formatting
        try:
            parsed_json = json.loads(cleaned_json_str)
            formatted_json = json.dumps(parsed_json, indent=2)
            return formatted_json
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON after cleaning (lack of infos): {str(e)}")
            return json.dumps({"Warning": "Not all data fetchable", "raw_text": cleaned_json_str}, indent=2)
    except Exception as e:
        logger.error(f"Error generating JSON: {str(e)}")
        return json.dumps({"error": str(e)}, indent=2)
# Custom CSS for better aesthetics
custom_css = """
#component-0 { max-width: 800px; margin: auto; }
.gradio-container { font-family: 'Arial', sans-serif; }
.uploadbuttonwrap { background-color: #f0f0f0; border-radius: 10px; padding: 20px; }
.uploadbuttonwrap label { background-color: #4CAF50; color: white; padding: 10px 15px; border-radius: 5px; cursor: pointer; }
.uploadbuttonwrap label:hover { background-color: #45a049; }
.output-markdown { background-color: #f9f9f9; border: 1px solid #ddd; border-radius: 5px; padding: 15px; }
.output-html { max-height: 400px; overflow-y: auto; }
"""
# Define the Gradio interface with improved aesthetics
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(
        """
# 📄 Resume Text Extractor and Analyzer
Upload your resume (PDF, DOC, DOCX, ODT, JPG, or PNG) to extract the text content and generate structured data.
"""
    )
    # Input row: the uploaded resume file.
    with gr.Row():
        file_input = gr.File(label="Upload Resume")
    with gr.Row():
        extract_button = gr.Button("Extract and Analyze", variant="primary")
    # Output row: extracted text alongside its word/character counts.
    with gr.Row():
        with gr.Column(scale=2):
            text_output = gr.Textbox(label="Extracted Text", lines=10)
        with gr.Column(scale=1):
            word_count = gr.Number(label="Word Count")
            char_count = gr.Number(label="Character Count")
    with gr.Row():
        json_output = gr.JSON(label="Structured Resume Data")
    # Wire the button to the handler; output order must match the
    # 4-tuple returned by extract_text_from_resume.
    extract_button.click(
        fn=extract_text_from_resume,
        inputs=[file_input],
        outputs=[text_output, word_count, char_count, json_output]
    )
    gr.Markdown(
        """
### How it works
1. Upload your resume file
2. Click "Extract and Analyze"
3. View the extracted text and structured data
This tool uses advanced NLP techniques to parse your resume and provide insights.
"""
    )
# NOTE(review): share=True opens a public Gradio tunnel; on Hugging Face
# Spaces it is unnecessary and ignored — confirm the intended deployment.
iface.launch(share=True)