ShreehariS754 commited on
Commit
c416fc6
·
verified ·
1 Parent(s): 12a6157

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +6 -4
  2. app.py +302 -0
  3. packages.txt +4 -0
  4. requirements.txt +10 -0
README.md CHANGED
@@ -1,12 +1,14 @@
1
  ---
2
- title: Extract Resume Data
3
- emoji: 🐨
4
- colorFrom: yellow
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 5.1.0
8
  app_file: app.py
9
  pinned: false
 
 
10
  ---
11
 
12
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Extract_Resume_Data
3
+ emoji: 👁
4
+ colorFrom: blue
5
+ colorTo: yellow
6
  sdk: gradio
7
  sdk_version: 5.1.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
+ short_description: Extracts text from resumes of various file types, makes json
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import gradio as gr
3
+ import pdfplumber
4
+ from pdf2image import convert_from_path
5
+ from PIL import Image
6
+ import pytesseract
7
+ import PyPDF2
8
+ from typing import Optional, Dict, Callable
9
+ import logging
10
+ import tempfile
11
+ from docx import Document
12
+ import subprocess
13
+ from odf import text, teletype
14
+ from odf.opendocument import load
15
+ import mammoth
16
+ import textract
17
+ from huggingface_hub import InferenceClient
18
+ import json
19
+ import re
20
+
21
# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Initialize the Hugging Face Inference Client
# NOTE(review): reads the HF_TOKEN environment variable; if the Space
# secret is unset, api_key is None and API calls will run unauthenticated
# (and likely be rate-limited or rejected) — confirm the secret is set.
client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))
27
class ResumeExtractor:
    """Extract plain text from resume files in a variety of formats.

    Supported inputs: PDF (text layer with OCR fallback), images
    (JPG/JPEG/PNG), DOCX, DOC and ODT. Each format dispatches to a
    dedicated extraction method via ``self.supported_formats``.
    """

    def __init__(self, upload_dir: str = "./uploaded_files"):
        """Initialize the extractor.

        Args:
            upload_dir: Directory for uploaded files; created if missing.
        """
        self.upload_dir = upload_dir
        self._ensure_upload_dir()
        # Dispatch table: logical file type -> extraction method.
        self.supported_formats: Dict[str, Callable[[str], str]] = {
            'pdf': self.extract_text_from_pdf,
            'image': self.extract_text_from_image,
            'docx': self.extract_text_from_docx,
            'doc': self.extract_text_from_doc,
            'odt': self.extract_text_from_odt
        }

    def _ensure_upload_dir(self) -> None:
        """Create the upload directory if it doesn't exist."""
        # exist_ok avoids the check-then-create race of os.path.exists().
        os.makedirs(self.upload_dir, exist_ok=True)

    @staticmethod
    def check_file_type(file_path: str) -> str:
        """Map a file extension to a logical file type.

        Args:
            file_path: Path or filename whose extension is inspected
                (case-insensitive).

        Returns:
            One of 'pdf', 'image', 'docx', 'doc' or 'odt'.

        Raises:
            ValueError: If the extension is not supported.
        """
        ext = os.path.splitext(file_path)[-1].lower()
        format_mapping = {
            '.pdf': 'pdf',
            '.jpg': 'image',
            '.jpeg': 'image',
            '.png': 'image',
            '.docx': 'docx',
            '.doc': 'doc',
            '.odt': 'odt'
        }
        if ext in format_mapping:
            return format_mapping[ext]
        raise ValueError(f"Unsupported file type: {ext}")

    def extract_text(self, file_path: str, file_type: str) -> str:
        """Extract text using the method registered for ``file_type``.

        Args:
            file_path: Path of the file to read.
            file_type: Logical type as returned by ``check_file_type``.

        Returns:
            Extracted text (may be empty if extraction fails).

        Raises:
            ValueError: If ``file_type`` has no registered extractor.
        """
        if file_type not in self.supported_formats:
            raise ValueError(f"Unsupported format: {file_type}")

        return self.supported_formats[file_type](file_path)

    def extract_text_from_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, trying progressively heavier methods.

        Order: pdfplumber, then PyPDF2, then full-page OCR. Returns the
        first non-empty result, or "" if every method fails.
        """
        methods = [
            (self._extract_with_pdfplumber, "pdfplumber"),
            (self._extract_with_pypdf2, "PyPDF2"),
            (self._extract_with_ocr, "OCR")
        ]

        for extract_method, method_name in methods:
            try:
                text = extract_method(file_path)
                if text.strip():
                    logger.info(f"Successfully extracted text using {method_name}")
                    return text
                logger.info(f"No text found using {method_name}, trying next method...")
            except Exception as e:
                # A failure in one backend should not abort the fallback chain.
                logger.error(f"Error with {method_name}: {str(e)}")

        return ""

    @staticmethod
    def _extract_with_pdfplumber(file_path: str) -> str:
        """Extract text using pdfplumber (best for digitally-born PDFs)."""
        with pdfplumber.open(file_path) as pdf:
            # extract_text() may return None for image-only pages.
            return ' '.join(page.extract_text() or '' for page in pdf.pages)

    @staticmethod
    def _extract_with_pypdf2(file_path: str) -> str:
        """Extract text using PyPDF2 as a secondary text-layer reader."""
        with open(file_path, 'rb') as pdf_file:
            reader = PyPDF2.PdfReader(pdf_file)
            return ' '.join(page.extract_text() or '' for page in reader.pages)

    @staticmethod
    def _extract_with_ocr(file_path: str) -> str:
        """Extract text by rasterizing each page and running Tesseract OCR."""
        images = convert_from_path(file_path)
        return ' '.join(pytesseract.image_to_string(image) for image in images)

    def extract_text_from_image(self, file_path: str) -> str:
        """Extract text from an image file using pytesseract OCR."""
        try:
            with Image.open(file_path) as image:
                return pytesseract.image_to_string(image)
        except Exception as e:
            logger.error(f"Error extracting text from image: {str(e)}")
            return ""

    def extract_text_from_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file.

        Tries mammoth first (better raw-text fidelity), then python-docx,
        then textract as the last resort. Returns "" if all fail.
        """
        try:
            # Try using mammoth first for better formatting preservation.
            with open(file_path, "rb") as docx_file:
                result = mammoth.extract_raw_text(docx_file)
                text = result.value

            if text.strip():
                return text

            # Fallback to python-docx if mammoth found no text.
            doc = Document(file_path)
            return '\n'.join([paragraph.text for paragraph in doc.paragraphs])
        except Exception as e:
            logger.error(f"Error extracting text from DOCX: {str(e)}")
            # Final fallback to textract.
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""

    def extract_text_from_doc(self, file_path: str) -> str:
        """Extract text from a legacy DOC file (textract, then antiword)."""
        try:
            # Try textract first.
            return textract.process(file_path).decode('utf-8')
        except Exception as e:
            logger.error(f"Error extracting text from DOC with textract: {str(e)}")
            try:
                # Fallback to the antiword binary if available on PATH.
                return subprocess.check_output(['antiword', file_path]).decode('utf-8')
            except Exception as e2:
                logger.error(f"Antiword fallback failed: {str(e2)}")
                return ""

    def extract_text_from_odt(self, file_path: str) -> str:
        """Extract text from an ODT file (odfpy, then textract fallback)."""
        try:
            textdoc = load(file_path)
            allparas = textdoc.getElementsByType(text.P)
            return '\n'.join([teletype.extractText(para) for para in allparas])
        except Exception as e:
            logger.error(f"Error extracting text from ODT: {str(e)}")
            # Fallback to textract.
            try:
                return textract.process(file_path).decode('utf-8')
            except Exception as e2:
                logger.error(f"Textract fallback failed: {str(e2)}")
                return ""
168
+
169
def extract_text_from_resume(file):
    """Gradio callback: extract text from an uploaded resume and build JSON.

    Args:
        file: Gradio file object exposing a ``.name`` filesystem path,
            or None when no file has been uploaded.

    Returns:
        Tuple of (extracted text or user-facing error message,
        word count, character count, JSON string of structured data).
    """
    # Guard: the button can be clicked with no file selected.
    if file is None:
        return "Please upload a file first.", 0, 0, "{}"

    extractor = ResumeExtractor()

    try:
        file_type = extractor.check_file_type(file.name)
        extracted_text = extractor.extract_text(file.name, file_type)

        if extracted_text.strip():
            word_count = len(extracted_text.split())
            char_count = len(extracted_text)

            # Turn the raw text into structured JSON via the LLM API.
            json_data = generate_json_from_text(extracted_text)

            return extracted_text, word_count, char_count, json_data
        else:
            return "No text could be extracted from the file.", 0, 0, "{}"
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return f"An error occurred: {str(e)}", 0, 0, "{}"
188
+
189
def clean_json_string(json_str):
    """Best-effort repair of near-JSON text produced by an LLM.

    Fixes missing outer braces, Python-style single-quoted strings,
    unquoted object keys, and trailing commas. Heuristic only — the
    result is not guaranteed to parse.

    Args:
        json_str: Raw (possibly malformed) JSON text.

    Returns:
        The repaired JSON string.
    """
    # Remove any leading or trailing whitespace.
    json_str = json_str.strip()

    # Ensure the string starts with { and ends with }.
    if not json_str.startswith('{'):
        json_str = '{' + json_str
    if not json_str.endswith('}'):
        json_str = json_str + '}'

    # LLMs often emit Python-style single-quoted strings.
    json_str = json_str.replace("'", '"')

    # Quote bare keys only where they follow '{' or ',' — anchoring
    # prevents corrupting colons inside values (times "10:30", URLs).
    json_str = re.sub(r'([{,]\s*)(\w+)\s*:', r'\1"\2":', json_str)
    # Remove trailing commas before a closing brace or bracket.
    json_str = re.sub(r',\s*([}\]])', r'\1', json_str)

    return json_str
207
+
208
def generate_json_from_text(text):
    """Structure raw resume text into JSON via the HF Inference API.

    Args:
        text: Resume text extracted from the uploaded file.

    Returns:
        A pretty-printed JSON string. When parsing fails, a JSON object
        carrying the cleaned raw output; when the API call fails, a JSON
        object with an "error" field.
    """
    prompt = f"""
    Given the following resume text, create a JSON object that organizes the information into relevant categories. Include fields for personal information, objective, education, experience, skills, and any other relevant sections. If information for a field is not provided, use "NOT PROVIDED" as the value.

    Resume text:
    {text}

    Generate the JSON response:
    """

    try:
        response = client.text_generation(
            model="mistralai/Mixtral-8x7B-Instruct-v0.1",
            prompt=prompt,
            max_new_tokens=1000,
            temperature=0.1
        )

        # Extract the JSON part from the model's free-form response.
        json_start = response.find('{')
        json_end = response.rfind('}') + 1
        if json_start == -1 or json_end <= json_start:
            # Model produced no JSON object at all — report it explicitly
            # instead of silently cleaning an empty slice into "{}".
            logger.error("No JSON object found in model response")
            return json.dumps({"error": "No JSON object found in model response"}, indent=2)
        json_str = response[json_start:json_end]

        # Clean and fix the JSON string.
        cleaned_json_str = clean_json_string(json_str)

        # Parse and re-serialize so the UI always gets consistent formatting.
        try:
            parsed_json = json.loads(cleaned_json_str)
            formatted_json = json.dumps(parsed_json, indent=2)
            return formatted_json
        except json.JSONDecodeError as e:
            logger.error(f"Error parsing JSON after cleaning (lack of infos): {str(e)}")
            return json.dumps({"Warning": "Not all data fetchable", "raw_text": cleaned_json_str}, indent=2)

    except Exception as e:
        logger.error(f"Error generating JSON: {str(e)}")
        return json.dumps({"error": str(e)}, indent=2)
246
+
247
# Custom CSS for better aesthetics
custom_css = """
#component-0 { max-width: 800px; margin: auto; }
.gradio-container { font-family: 'Arial', sans-serif; }
.uploadbuttonwrap { background-color: #f0f0f0; border-radius: 10px; padding: 20px; }
.uploadbuttonwrap label { background-color: #4CAF50; color: white; padding: 10px 15px; border-radius: 5px; cursor: pointer; }
.uploadbuttonwrap label:hover { background-color: #45a049; }
.output-markdown { background-color: #f9f9f9; border: 1px solid #ddd; border-radius: 5px; padding: 15px; }
.output-html { max-height: 400px; overflow-y: auto; }
"""

# Gradio interface: file upload -> extracted text, counts, structured JSON.
with gr.Blocks(css=custom_css) as iface:
    gr.Markdown(
        """
        # 📄 Resume Text Extractor and Analyzer

        Upload your resume (PDF, DOC, DOCX, ODT, JPG, or PNG) to extract the text content and generate structured data.
        """
    )

    with gr.Row():
        file_input = gr.File(label="Upload Resume")

    with gr.Row():
        extract_button = gr.Button("Extract and Analyze", variant="primary")

    with gr.Row():
        with gr.Column(scale=2):
            text_output = gr.Textbox(label="Extracted Text", lines=10)
        with gr.Column(scale=1):
            word_count = gr.Number(label="Word Count")
            char_count = gr.Number(label="Character Count")

    with gr.Row():
        json_output = gr.JSON(label="Structured Resume Data")

    # Wire the button to the extraction callback.
    extract_button.click(
        fn=extract_text_from_resume,
        inputs=[file_input],
        outputs=[text_output, word_count, char_count, json_output]
    )

    gr.Markdown(
        """
        ### How it works
        1. Upload your resume file
        2. Click "Extract and Analyze"
        3. View the extracted text and structured data

        This tool uses advanced NLP techniques to parse your resume and provide insights.
        """
    )


# Launch only when executed as a script, so importing this module
# (e.g. for testing) has no side effects.
if __name__ == "__main__":
    iface.launch(share=True)
packages.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ tesseract-ocr
2
+ poppler-utils
3
+ swig
4
+ libpulse-dev
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ huggingface_hub
3
+ pdfplumber
4
+ pdf2image
5
+ pytesseract
6
+ PyPDF2
7
+ python-docx
8
+ mammoth
9
+ textract
10
+ odfpy