ShreehariS754 commited on
Commit
c416fc6
·
verified ·
1 Parent(s): 12a6157

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +6 -4
  2. app.py +302 -0
  3. packages.txt +4 -0
  4. requirements.txt +10 -0
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- title: Extract Resume Data
3
- emoji: 🐨
4
- colorFrom: yellow
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.1.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Extract_Resume_Data
3
+ emoji: 👁
4
+ colorFrom: blue
5
+ colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.1.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ short_description: Extracts text from resumes of various file types, makes json
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import pdfplumber
4
+ from pdf2image import convert_from_path
5
+ from PIL import Image
6
+ import pytesseract
7
+ import PyPDF2
8
+ from typing import Optional, Dict, Callable
9
+ import logging
10
+ import tempfile
11
+ from docx import Document
12
+ import subprocess
13
+ from odf import text, teletype
14
+ from odf.opendocument import load
15
+ import mammoth
16
+ import textract
17
+ from huggingface_hub import InferenceClient
18
+ import json
19
+ import re
20
+
21
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the Hugging Face Inference Client
# NOTE(review): reads the HF_TOKEN environment variable; if the Space
# secret is unset, api_key is None and API calls will run unauthenticated
# (and likely be rate-limited or rejected) — confirm the secret is set.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))
27
class ResumeExtractor:
    """Extract plain text from resume files in a variety of formats.

    Supported inputs: PDF (text layer with OCR fallback), images
    (JPG/JPEG/PNG), DOCX, DOC and ODT. Each format dispatches to a
    dedicated extraction method via ``self.supported_formats``.
    """

    def __init__(self, upload_dir: str = "./uploaded_files"):
        """Initialize the extractor.

        Args:
            upload_dir: Directory for uploaded files; created if missing.
        """
        self.upload_dir = upload_dir
        self._ensure_upload_dir()
        # Dispatch table: logical file type -> extraction method.
        self.supported_formats: Dict[str, Callable[[str], str]] = {
            'pdf': self.extract_text_from_pdf,
            'image': self.extract_text_from_image,
            'docx': self.extract_text_from_docx,
            'doc': self.extract_text_from_doc,
            'odt': self.extract_text_from_odt
        }

    def _ensure_upload_dir(self) -> None:
        """Create the upload directory if it doesn't exist."""
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.upload_dir, exist_ok=True)

    @staticmethod
    def check_file_type(file_path: str) -> str:
        """Map a file extension to a logical file type.

        Args:
            file_path: Path or filename whose extension is inspected
                (case-insensitive).

        Returns:
            One of 'pdf', 'image', 'docx', 'doc' or 'odt'.

        Raises:
            ValueError: If the extension is not supported.
        """
        ext = os.path.splitext(file_path)[-1].lower()
        format_mapping = {
            '.pdf': 'pdf',
            '.jpg': 'image',
            '.jpeg': 'image',
            '.png': 'image',
            '.docx': 'docx',
            '.doc': 'doc',
            '.odt': 'odt'
        }
        if ext in format_mapping:
            return format_mapping[ext]
        raise ValueError(f"Unsupported file type: {ext}")

    def extract_text(self, file_path: str, file_type: str) -> str:
        """Extract text using the method registered for ``file_type``.

        Args:
            file_path: Path of the file to read.
            file_type: Logical type as returned by ``check_file_type``.

        Returns:
            Extracted text (may be empty if extraction fails).

        Raises:
            ValueError: If ``file_type`` has no registered extractor.
        """
        if file_type not in self.supported_formats:
            raise ValueError(f"Unsupported format: {file_type}")

        return self.supported_formats[file_type](file_path)

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, trying progressively heavier methods.

        Order: pdfplumber, then PyPDF2, then full-page OCR. Returns the
        first non-empty result, or "" if every method fails.
        """
        methods = [
            (self._extract_with_pdfplumber, "pdfplumber"),
            (self._extract_with_pypdf2, "PyPDF2"),
            (self._extract_with_ocr, "OCR")
        ]

        for extract_method, method_name in methods:
            try:
                text = extract_method(file_path)
                if text.strip():
                    logger.info(f"Successfully extracted text using {method_name}")
                    return text
                logger.info(f"No text found using {method_name}, trying next method...")
            except Exception as e:
                # A failure in one backend should not abort the fallback chain.
                logger.error(f"Error with {method_name}: {str(e)}")

        return ""

    @staticmethod
    def _extract_with_pdfplumber(file_path: str) -> str:
        """Extract text using pdfplumber (best for digitally-born PDFs)."""
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for image-only pages.
            return ' '.join(page.extract_text() or '' for page in pdf.pages)

    @staticmethod
    def _extract_with_pypdf2(file_path: str) -> str:
        """Extract text using PyPDF2 as a secondary text-layer reader."""
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            return ' '.join(page.extract_text() or '' for page in reader.pages)

    @staticmethod
    def _extract_with_ocr(file_path: str) -> str:
        """Extract text by rasterizing each page and running Tesseract OCR."""
        images = convert_from_path(file_path)
        return ' '.join(pytesseract.image_to_string(image) for image in images)

    def extract_text_from_image(self, file_path: str) -> str:
        """Extract text from an image file using pytesseract OCR."""
        try:
            with Image.open(file_path) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            logger.error(f"Error extracting text from image: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file.

        Tries mammoth first (better raw-text fidelity), then python-docx,
        then textract as the last resort. Returns "" if all fail.
        """
        try:
            # Try using mammoth first for better formatting preservation.
            with open(file_path, "rb") as docx_file:
                result = mammoth.extract_raw_text(docx_file)
                text = result.value

            if text.strip():
                return text

            # Fallback to python-docx if mammoth found no text.
            doc = Document(file_path)
            return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {str(e)}")
            # Final fallback to textract.
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""

    def extract_text_from_doc(self, file_path: str) -> str:
        """Extract text from a legacy DOC file (textract, then antiword)."""
        try:
            # Try textract first.
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from DOC with textract: {str(e)}")
            try:
                # Fallback to the antiword binary if available on PATH.
                return subprocess.check_output(['antiword', file_path]).decode('utf-8')
            except Exception as e2:
                logger.error(f"Antiword fallback failed: {str(e2)}")
                return ""

    def extract_text_from_odt(self, file_path: str) -> str:
        """Extract text from an ODT file (odfpy, then textract fallback)."""
        try:
            textdoc = load(file_path)
            allparas = textdoc.getElementsByType(text.P)
            return '\n'.join([teletype.extractText(para) for para in allparas])
        except Exception as e:
            logger.error(f"Error extracting text from ODT: {str(e)}")
            # Fallback to textract.
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""
168
+
169
def extract_text_from_resume(file):
    """Gradio callback: extract text from an uploaded resume and build JSON.

    Args:
        file: Gradio file object exposing a ``.name`` filesystem path,
            or None when no file has been uploaded.

    Returns:
        Tuple of (extracted text or user-facing error message,
        word count, character count, JSON string of structured data).
    """
    # Guard: the button can be clicked with no file selected.
    if file is None:
        return "Please upload a file first.", 0, 0, "{}"

    extractor = ResumeExtractor()

    try:
        file_type = extractor.check_file_type(file.name)
        extracted_text = extractor.extract_text(file.name, file_type)

        if extracted_text.strip():
            word_count = len(extracted_text.split())
            char_count = len(extracted_text)

            # Turn the raw text into structured JSON via the LLM API.
            json_data = generate_json_from_text(extracted_text)

            return extracted_text, word_count, char_count, json_data
        else:
            return "No text could be extracted from the file.", 0, 0, "{}"
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"An error occurred: {str(e)}", 0, 0, "{}"
188
+
189
def clean_json_string(json_str):
    """Best-effort repair of near-JSON text produced by an LLM.

    Fixes missing outer braces, Python-style single-quoted strings,
    unquoted object keys, and trailing commas. Heuristic only — the
    result is not guaranteed to parse.

    Args:
        json_str: Raw (possibly malformed) JSON text.

    Returns:
        The repaired JSON string.
    """
    # Remove any leading or trailing whitespace.
    json_str = json_str.strip()

    # Ensure the string starts with { and ends with }.
    if not json_str.startswith('{'):
        json_str = '{' + json_str
    if not json_str.endswith('}'):
        json_str = json_str + '}'

    # LLMs often emit Python-style single-quoted strings.
    json_str = json_str.replace("'", '"')

    # Quote bare keys only where they follow '{' or ',' — anchoring
    # prevents corrupting colons inside values (times "10:30", URLs).
    json_str = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', json_str)
    # Remove trailing commas before a closing brace or bracket.
    json_str = re.sub(r',\s*([}\]])', r'\1', json_str)

    return json_str
207
+
208
def generate_json_from_text(text):
    """Structure raw resume text into JSON via the HF Inference API.

    Args:
        text: Resume text extracted from the uploaded file.

    Returns:
        A pretty-printed JSON string. When parsing fails, a JSON object
        carrying the cleaned raw output; when the API call fails, a JSON
        object with an "error" field.
    """
    prompt = f"""
    Given the following resume text, create a JSON object that organizes the information into relevant categories. Include fields for personal information, objective, education, experience, skills, and any other relevant sections. If information for a field is not provided, use "NOT PROVIDED" as the value.

    Resume text:
    {text}

    Generate the JSON response:
    """

    try:
        response = client.text_generation(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            prompt=prompt,
            max_new_tokens=1000,
            temperature=0.1
        )

        # Extract the JSON part from the model's free-form response.
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        if json_start == -1 or json_end <= json_start:
            # Model produced no JSON object at all — report it explicitly
            # instead of silently cleaning an empty slice into "{}".
            logger.error("No JSON object found in model response")
            return json.dumps({"error": "No JSON object found in model response"}, indent=2)
        json_str = response[json_start:json_end]

        # Clean and fix the JSON string.
        cleaned_json_str = clean_json_string(json_str)

        # Parse and re-serialize so the UI always gets consistent formatting.
        try:
            parsed_json = json.loads(cleaned_json_str)
            formatted_json = json.dumps(parsed_json, indent=2)
            return formatted_json
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON after cleaning (lack of infos): {str(e)}")
            return json.dumps({"Warning": "Not all data fetchable", "raw_text": cleaned_json_str}, indent=2)

    except Exception as e:
        logger.error(f"Error generating JSON: {str(e)}")
        return json.dumps({"error": str(e)}, indent=2)
246
+
247
# Custom CSS for better aesthetics
custom_css = """
#component-0 { max-width: 800px; margin: auto; }
.gradio-container { font-family: 'Arial', sans-serif; }
.uploadbuttonwrap { background-color: #f0f0f0; border-radius: 10px; padding: 20px; }
.uploadbuttonwrap label { background-color: #4CAF50; color: white; padding: 10px 15px; border-radius: 5px; cursor: pointer; }
.uploadbuttonwrap label:hover { background-color: #45a049; }
.output-markdown { background-color: #f9f9f9; border: 1px solid #ddd; border-radius: 5px; padding: 15px; }
.output-html { max-height: 400px; overflow-y: auto; }
"""

# Gradio interface: file upload -> extracted text, counts, structured JSON.
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(
        """
        # 📄 Resume Text Extractor and Analyzer

        Upload your resume (PDF, DOC, DOCX, ODT, JPG, or PNG) to extract the text content and generate structured data.
        """
    )

    with gr.Row():
        file_input = gr.File(label="Upload Resume")

    with gr.Row():
        extract_button = gr.Button("Extract and Analyze", variant="primary")

    with gr.Row():
        with gr.Column(scale=2):
            text_output = gr.Textbox(label="Extracted Text", lines=10)
        with gr.Column(scale=1):
            word_count = gr.Number(label="Word Count")
            char_count = gr.Number(label="Character Count")

    with gr.Row():
        json_output = gr.JSON(label="Structured Resume Data")

    # Wire the button to the extraction callback.
    extract_button.click(
        fn=extract_text_from_resume,
        inputs=[file_input],
        outputs=[text_output, word_count, char_count, json_output]
    )

    gr.Markdown(
        """
        ### How it works
        1. Upload your resume file
        2. Click "Extract and Analyze"
        3. View the extracted text and structured data

        This tool uses advanced NLP techniques to parse your resume and provide insights.
        """
    )


# Launch only when executed as a script, so importing this module
# (e.g. for testing) has no side effects.
if __name__ == "__main__":
    iface.launch(share=True)
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ tesseract-ocr
2
+ poppler-utils
3
+ swig
4
+ libpulse-dev
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ huggingface_hub
3
+ pdfplumber
4
+ pdf2image
5
+ pytesseract
6
+ PyPDF2
7
+ python-docx
8
+ mammoth
9
+ textract
10
+ odfpy