File size: 11,268 Bytes
c416fc6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
import os
import gradio as gr
import pdfplumber
from pdf2image import convert_from_path
from PIL import Image
import pytesseract
import PyPDF2
from typing import Optional, Dict, Callable
import logging
import tempfile
from docx import Document
import subprocess
from odf import text, teletype
from odf.opendocument import load
import mammoth
import textract
from huggingface_hub import InferenceClient
import json
import re

# Set up logging
# Module-level logger so extraction fallbacks can report which method
# succeeded/failed without printing to stdout.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the Hugging Face Inference Client
# NOTE(review): if HF_TOKEN is unset, api_key is None and the client runs
# unauthenticated (rate-limited) — confirm the deployment always sets it.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))
class ResumeExtractor:
    """Extract plain text from resume files in several formats.

    Supported inputs: PDF (pdfplumber -> PyPDF2 -> OCR fallback chain),
    images (pytesseract OCR), DOCX (mammoth -> python-docx -> textract),
    DOC (textract -> antiword), and ODT (odfpy -> textract).

    All extraction methods return an empty string on failure rather than
    raising, so callers can treat "" as "nothing extractable".
    """

    def __init__(self, upload_dir: str = "./uploaded_files"):
        """Initialize the ResumeExtractor and ensure ``upload_dir`` exists.

        Args:
            upload_dir: Directory used for uploaded files; created if missing.
        """
        self.upload_dir = upload_dir
        self._ensure_upload_dir()
        # Dispatch table: file-type tag -> bound extraction method.
        self.supported_formats: Dict[str, Callable[[str], str]] = {
            'pdf': self.extract_text_from_pdf,
            'image': self.extract_text_from_image,
            'docx': self.extract_text_from_docx,
            'doc': self.extract_text_from_doc,
            'odt': self.extract_text_from_odt
        }

    def _ensure_upload_dir(self) -> None:
        """Create upload directory if it doesn't exist."""
        # exist_ok=True avoids the check-then-create race of the previous
        # os.path.exists() + os.makedirs() sequence.
        os.makedirs(self.upload_dir, exist_ok=True)

    @staticmethod
    def check_file_type(file_path: str) -> str:
        """Map a file extension to a file-type tag.

        Args:
            file_path: Path or filename whose extension is inspected
                (case-insensitive).

        Returns:
            One of 'pdf', 'image', 'docx', 'doc', 'odt'.

        Raises:
            ValueError: If the extension is not supported.
        """
        ext = os.path.splitext(file_path)[-1].lower()
        format_mapping = {
            '.pdf': 'pdf',
            '.jpg': 'image',
            '.jpeg': 'image',
            '.png': 'image',
            '.docx': 'docx',
            '.doc': 'doc',
            '.odt': 'odt'
        }
        if ext in format_mapping:
            return format_mapping[ext]
        raise ValueError(f"Unsupported file type: {ext}")

    def extract_text(self, file_path: str, file_type: str) -> str:
        """Extract text using the appropriate method for ``file_type``.

        Args:
            file_path: Path to the file to read.
            file_type: Tag as returned by :meth:`check_file_type`.

        Returns:
            Extracted text ("" when nothing could be extracted).

        Raises:
            ValueError: If ``file_type`` is not in the dispatch table.
        """
        if file_type not in self.supported_formats:
            raise ValueError(f"Unsupported format: {file_type}")

        return self.supported_formats[file_type](file_path)

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, trying each backend until one yields text.

        Order matters: pdfplumber usually gives the best layout-aware text,
        PyPDF2 is a cheap second pass, and OCR is the expensive last resort
        for scanned documents.
        """
        methods = [
            (self._extract_with_pdfplumber, "pdfplumber"),
            (self._extract_with_pypdf2, "PyPDF2"),
            (self._extract_with_ocr, "OCR")
        ]

        for extract_method, method_name in methods:
            try:
                text = extract_method(file_path)
                if text.strip():
                    logger.info(f"Successfully extracted text using {method_name}")
                    return text
                logger.info(f"No text found using {method_name}, trying next method...")
            except Exception as e:
                logger.error(f"Error with {method_name}: {str(e)}")

        return ""

    @staticmethod
    def _extract_with_pdfplumber(file_path: str) -> str:
        """Extract text using pdfplumber (layout-aware text extraction)."""
        with pdfplumber.open(file_path) as pdf:
            # extract_text() can return None for image-only pages.
            return ' '.join(page.extract_text() or '' for page in pdf.pages)

    @staticmethod
    def _extract_with_pypdf2(file_path: str) -> str:
        """Extract text using PyPDF2."""
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            return ' '.join(page.extract_text() or '' for page in reader.pages)

    @staticmethod
    def _extract_with_ocr(file_path: str) -> str:
        """Rasterize each PDF page and OCR it with pytesseract."""
        images = convert_from_path(file_path)
        return ' '.join(pytesseract.image_to_string(image) for image in images)

    def extract_text_from_image(self, file_path: str) -> str:
        """Extract text from an image file via OCR; "" on failure."""
        try:
            with Image.open(file_path) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            logger.error(f"Error extracting text from image: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file.

        Tries mammoth first (better raw-text fidelity), then python-docx,
        then textract as a last resort; returns "" if all fail.
        """
        try:
            # Try using mammoth first for better formatting preservation
            with open(file_path, "rb") as docx_file:
                result = mammoth.extract_raw_text(docx_file)
                text = result.value

            if text.strip():
                return text

            # Fallback to python-docx if mammoth fails
            doc = Document(file_path)
            return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {str(e)}")
            # Final fallback to textract
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""

    def extract_text_from_doc(self, file_path: str) -> str:
        """Extract text from a legacy DOC file.

        Tries textract, then the external ``antiword`` binary if installed;
        returns "" if both fail.
        """
        try:
            # Try textract first
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from DOC with textract: {str(e)}")
            try:
                # Fallback to antiword if available
                return subprocess.check_output(['antiword', file_path]).decode('utf-8')
            except Exception as e2:
                logger.error(f"Antiword fallback failed: {str(e2)}")
                return ""

    def extract_text_from_odt(self, file_path: str) -> str:
        """Extract text from an ODT file via odfpy, falling back to textract."""
        try:
            textdoc = load(file_path)
            allparas = textdoc.getElementsByType(text.P)
            return '\n'.join([teletype.extractText(para) for para in allparas])
        except Exception as e:
            logger.error(f"Error extracting text from ODT: {str(e)}")
            # Fallback to textract
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""

def extract_text_from_resume(file):
    """Gradio click handler: extract and analyze an uploaded resume.

    Args:
        file: Gradio file object; only its ``.name`` (temp path) is used.

    Returns:
        A 4-tuple ``(text, word_count, char_count, json_string)``. On any
        failure the text slot carries a human-readable message and the
        JSON slot is ``"{}"``.
    """
    extractor = ResumeExtractor()

    try:
        kind = extractor.check_file_type(file.name)
        content = extractor.extract_text(file.name, kind)

        if not content.strip():
            return "No text could be extracted from the file.", 0, 0, "{}"

        # Structure the raw text via the Hugging Face API.
        structured = generate_json_from_text(content)
        words = len(content.split())
        chars = len(content)
        return content, words, chars, structured
    except Exception as e:
        return f"An error occurred: {str(e)}", 0, 0, "{}"

def clean_json_string(json_str):
    """Best-effort repair of an almost-JSON string emitted by an LLM.

    Heuristics applied, in order:
      * trim surrounding whitespace,
      * wrap in braces if missing,
      * convert single quotes to double quotes (NOTE: this also mangles
        apostrophes inside values, e.g. "Bachelor's" — known trade-off),
      * add double quotes around bare object keys (already-quoted keys
        are untouched because '"' is not a word character; NOTE: this
        can also fire on colons inside unquoted values such as URLs),
      * strip trailing commas before closing braces AND brackets
        (the original only handled '}', so '[1, 2,]' stayed invalid).

    Args:
        json_str: Raw model output expected to be roughly a JSON object.

    Returns:
        The repaired string. It may still be invalid JSON, so callers
        must parse it inside a try/except.
    """
    # Remove any leading or trailing whitespace
    json_str = json_str.strip()

    # Ensure the string starts with { and ends with }
    if not json_str.startswith('{'):
        json_str = '{' + json_str
    if not json_str.endswith('}'):
        json_str = json_str + '}'

    # Replace any single quotes with double quotes
    json_str = json_str.replace("'", '"')

    # Add quotes to bare keys
    json_str = re.sub(r'(\w+):', r'"\1":', json_str)
    # Remove trailing commas before both } and ]
    json_str = re.sub(r',\s*([}\]])', r'\1', json_str)

    return json_str

def generate_json_from_text(text):
    """Ask the LLM to structure resume ``text`` into a JSON object.

    Args:
        text: Raw extracted resume text.

    Returns:
        A pretty-printed JSON string. On any failure (API error, response
        with no JSON, unparseable JSON) a JSON object describing the
        problem is returned instead of raising.
    """
    prompt = f"""
    Given the following resume text, create a JSON object that organizes the information into relevant categories. Include fields for personal information, objective, education, experience, skills, and any other relevant sections. If information for a field is not provided, use "NOT PROVIDED" as the value.

    Resume text:
    {text}

    Generate the JSON response:
    """

    try:
        response = client.text_generation(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            prompt=prompt,
            max_new_tokens=1000,
            # Low temperature keeps the output close to the requested schema.
            temperature=0.1
        )

        # Extract the JSON part from the response. Guard against a
        # completion with no braces at all: find() would return -1 and
        # the slice below would silently produce garbage.
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        if json_start == -1 or json_end == 0:
            logger.error("Model response contained no JSON object")
            return json.dumps({"Warning": "Not all data fetchable", "raw_text": response}, indent=2)
        json_str = response[json_start:json_end]

        # Clean and fix the JSON string
        cleaned_json_str = clean_json_string(json_str)

        # Parse and format the JSON
        try:
            parsed_json = json.loads(cleaned_json_str)
            formatted_json = json.dumps(parsed_json, indent=2)
            return formatted_json
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON after cleaning (lack of infos): {str(e)}")
            return json.dumps({"Warning": "Not all data fetchable", "raw_text": cleaned_json_str}, indent=2)

    except Exception as e:
        logger.error(f"Error generating JSON: {str(e)}")
        return json.dumps({"error": str(e)}, indent=2)

# Custom CSS for better aesthetics
custom_css = """
    #component-0 { max-width: 800px; margin: auto; }
    .gradio-container { font-family: 'Arial', sans-serif; }
    .uploadbuttonwrap { background-color: #f0f0f0; border-radius: 10px; padding: 20px; }
    .uploadbuttonwrap label { background-color: #4CAF50; color: white; padding: 10px 15px; border-radius: 5px; cursor: pointer; }
    .uploadbuttonwrap label:hover { background-color: #45a049; }
    .output-markdown { background-color: #f9f9f9; border: 1px solid #ddd; border-radius: 5px; padding: 15px; }
    .output-html { max-height: 400px; overflow-y: auto; }
"""

# Define the Gradio interface with improved aesthetics
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(
        """
        # 📄 Resume Text Extractor and Analyzer
        
        Upload your resume (PDF, DOC, DOCX, ODT, JPG, or PNG) to extract the text content and generate structured data.
        """
    )
    
    # Input row: single file uploader.
    with gr.Row():
        file_input = gr.File(label="Upload Resume")
    
    with gr.Row():
        extract_button = gr.Button("Extract and Analyze", variant="primary")
    
    # Output row: extracted text (wide) next to the two count widgets.
    with gr.Row():
        with gr.Column(scale=2):
            text_output = gr.Textbox(label="Extracted Text", lines=10)
        with gr.Column(scale=1):
            word_count = gr.Number(label="Word Count")
            char_count = gr.Number(label="Character Count")
    
    with gr.Row():
        json_output = gr.JSON(label="Structured Resume Data")
    
    # Output order must match the 4-tuple returned by extract_text_from_resume:
    # (text, word_count, char_count, json_data).
    extract_button.click(
        fn=extract_text_from_resume,
        inputs=[file_input],
        outputs=[text_output, word_count, char_count, json_output]
    )
    
    gr.Markdown(
        """
        ### How it works
        1. Upload your resume file
        2. Click "Extract and Analyze"
        3. View the extracted text and structured data
        
        This tool uses advanced NLP techniques to parse your resume and provide insights.
        """
    )


# NOTE(review): share=True creates a public tunnel URL; confirm that is
# intended for this deployment (it is redundant on Hugging Face Spaces).
iface.launch(share=True)