Spaces:
Build error
Build error
Update extract_resume_content.py
Browse files- extract_resume_content.py +5 -15
extract_resume_content.py
CHANGED
|
@@ -1,14 +1,11 @@
|
|
| 1 |
-
import
|
| 2 |
from docx import Document
|
|
|
|
| 3 |
from bs4 import BeautifulSoup
|
| 4 |
-
import os
|
| 5 |
-
import re
|
| 6 |
|
| 7 |
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        A single string made of each page's extracted text separated by a
        newline. Pages whose ``extract_text()`` result is falsy (``None`` or
        an empty string) are left out.
    """
    with pdfplumber.open(file) as pdf:
        # Call extract_text() only once per page and reuse the result for
        # both the emptiness check and the joined output.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(
            chunk for chunk in page_texts if chunk
        )
|
| 12 |
|
| 13 |
def extract_text_from_docx(file):
|
| 14 |
doc = Document(file)
|
|
@@ -34,20 +31,13 @@ def extract_resume_text(file, file_type):
|
|
| 34 |
return ""
|
| 35 |
|
| 36 |
def extract_fields_from_text(text):
    """Pull a best-effort name, email and phone number out of resume text.

    Args:
        text: Plain text of the resume.

    Returns:
        A dict with the keys ``"name"``, ``"email"`` and ``"phone"``. Any
        field that cannot be located is reported as ``"Not found"``.
    """
    # Extract email
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    # The pattern accepts '.', so a sentence-ending period right after the
    # address gets captured too; drop it.
    email = email_match.group().rstrip(".") if email_match else "Not found"

    # Extract phone number
    phone_match = re.search(r'(\+?\d{1,3})?[\s\-]?\(?\d{2,4}\)?[\s\-]?\d{3,4}[\s\-]?\d{3,4}', text)
    # The optional leading separator in the pattern means the match usually
    # begins on the space/hyphen in front of the digits; trim that off.
    phone = phone_match.group().strip().lstrip("-") if phone_match else "Not found"

    # Try to find a name (optional heuristic - first line of text)
    stripped = text.strip()
    # Guard against empty input and remove a stray '\r' from CRLF files.
    first_line = stripped.split("\n", 1)[0].strip() if stripped else ""
    name = first_line if first_line and len(first_line.split()) <= 4 else "Not found"

    return {
        "name": name,
        "email": email,
        "phone": phone,
    }
|
|
|
|
| 1 |
+
import re
|
| 2 |
from docx import Document
|
| 3 |
+
import pdfplumber
|
| 4 |
from bs4 import BeautifulSoup
|
|
|
|
|
|
|
| 5 |
|
| 6 |
def extract_text_from_pdf(file):
    """Return the text of every page of a PDF, joined with newlines.

    Args:
        file: Passed unchanged to ``pdfplumber.open``.

    Returns:
        A single string made of each page's extracted text separated by a
        newline. Pages whose ``extract_text()`` result is falsy (``None`` or
        an empty string) are left out.
    """
    with pdfplumber.open(file) as pdf:
        # Call extract_text() only once per page and reuse the result for
        # both the emptiness check and the joined output.
        page_texts = (page.extract_text() for page in pdf.pages)
        return "\n".join(chunk for chunk in page_texts if chunk)
|
|
|
|
|
|
|
| 9 |
|
| 10 |
def extract_text_from_docx(file):
|
| 11 |
doc = Document(file)
|
|
|
|
| 31 |
return ""
|
| 32 |
|
| 33 |
def extract_fields_from_text(text):
    """Pull a best-effort name, email and phone number out of resume text.

    Args:
        text: Plain text of the resume.

    Returns:
        A dict with the keys ``"name"``, ``"email"`` and ``"phone"``. Any
        field that cannot be located is reported as ``"Not found"``.
    """
    email_match = re.search(r'[\w\.-]+@[\w\.-]+', text)
    phone_match = re.search(r'(\+?\d{1,3})?[\s\-]?\(?\d{2,4}\)?[\s\-]?\d{3,4}[\s\-]?\d{3,4}', text)

    # Name heuristic: the first line, if it is short enough to be a name.
    # Guard against empty input ("".split("\n") is [""], not []) and remove
    # a stray '\r' left behind by CRLF line endings.
    stripped = text.strip()
    first_line = stripped.split("\n", 1)[0].strip() if stripped else ""
    name = first_line if first_line and len(first_line.split()) <= 4 else "Not found"

    return {
        "name": name,
        # The email pattern accepts '.', so a sentence-ending period right
        # after the address gets captured too; drop it.
        "email": email_match.group().rstrip(".") if email_match else "Not found",
        # The optional leading separator in the phone pattern means the match
        # usually begins on the space/hyphen before the digits; trim it off.
        "phone": phone_match.group().strip().lstrip("-") if phone_match else "Not found",
    }
|