Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -12,7 +12,6 @@ download("en_core_web_sm")
|
|
| 12 |
# Load the spaCy model
|
| 13 |
nlp = spacy.load("en_core_web_sm")
|
| 14 |
|
| 15 |
-
|
| 16 |
# Set of English words
|
| 17 |
nltk.download('words', quiet=True)
|
| 18 |
english_words = set(words.words())
|
|
@@ -40,37 +39,32 @@ def extract_text_from_docx(file):
|
|
| 40 |
return "\n".join([para.text for para in doc.paragraphs])
|
| 41 |
|
| 42 |
def extract_companies(text):
|
| 43 |
-
# Process the text with the spaCy model
|
| 44 |
doc = nlp(text)
|
| 45 |
companies = []
|
| 46 |
|
| 47 |
-
# Define a regex pattern for common company name suffixes
|
| 48 |
company_pattern = re.compile(
|
| 49 |
r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)
|
| 50 |
|
| 51 |
-
# Iterate over the identified entities in the text
|
| 52 |
for ent in doc.ents:
|
| 53 |
-
if ent.label_ == "ORG":
|
| 54 |
-
|
| 55 |
-
if company_pattern.search(ent.text):
|
| 56 |
-
companies.append(ent.text)
|
| 57 |
|
| 58 |
-
|
|
|
|
| 59 |
|
| 60 |
def extract_colleges(text):
|
| 61 |
doc = nlp(text)
|
| 62 |
colleges = []
|
| 63 |
|
| 64 |
-
# Extended list of education-related keywords
|
| 65 |
edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]
|
| 66 |
|
| 67 |
for sent in doc.sents:
|
| 68 |
-
# Extract entities labeled as ORG and check if they contain education-related keywords
|
| 69 |
edu_ents = [ent for ent in sent.ents if ent.label_ == "ORG" and any(keyword in ent.text.lower() for keyword in edu_keywords)]
|
| 70 |
for edu in edu_ents:
|
| 71 |
colleges.append(edu.text)
|
| 72 |
-
|
| 73 |
-
|
|
|
|
| 74 |
|
| 75 |
def extract_years_of_experience(text):
|
| 76 |
years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
|
|
@@ -131,28 +125,27 @@ def parse_resume(file):
|
|
| 131 |
summary = extract_summary(doc)
|
| 132 |
linkedin = extract_linkedin(text)
|
| 133 |
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
"Colleges Attended": colleges,
|
| 137 |
-
"Years of Experience": years_of_experience,
|
| 138 |
-
"Phone Number": phone,
|
| 139 |
-
"Email ID": email,
|
| 140 |
-
"Summary": summary,
|
| 141 |
-
"LinkedIn ID": linkedin
|
| 142 |
-
}
|
| 143 |
-
|
| 144 |
-
return result
|
| 145 |
except Exception as e:
|
| 146 |
import traceback
|
| 147 |
-
return
|
| 148 |
|
| 149 |
-
# Create Gradio interface
|
| 150 |
iface = gr.Interface(
|
| 151 |
fn=parse_resume,
|
| 152 |
inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
|
| 153 |
-
outputs=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
title="Advanced Resume Parser",
|
| 155 |
description="Upload a resume in PDF or DOCX format to extract key information."
|
| 156 |
)
|
| 157 |
|
| 158 |
-
iface.launch(share=True)
|
|
|
|
| 12 |
# Load the spaCy model
|
| 13 |
nlp = spacy.load("en_core_web_sm")
|
| 14 |
|
|
|
|
| 15 |
# Set of English words
|
| 16 |
nltk.download('words', quiet=True)
|
| 17 |
english_words = set(words.words())
|
|
|
|
| 39 |
return "\n".join([para.text for para in doc.paragraphs])
|
| 40 |
|
| 41 |
def extract_companies(text):
    """Return company-like organisation names found in *text*, one per line.

    The text is run through the module-level ``nlp`` pipeline. An entity is
    kept when it is labelled ``ORG`` and its text contains one of the
    corporate keywords below (Inc, Corp, LLC, ...) as a whole word, matched
    case-insensitively. Names are kept in document order and repeated
    mentions are not collapsed.

    Returns a newline-joined string, or an empty string when nothing matches.
    """
    corporate_keyword = re.compile(
        r'\b(?:Inc|Corp|LLC|Ltd|Co|Company|Group|Services|Technologies|Pvt|Solutions|Consulting)\b', re.IGNORECASE)

    matches = [
        entity.text
        for entity in nlp(text).ents
        if entity.label_ == "ORG" and corporate_keyword.search(entity.text)
    ]
    return "\n".join(matches)
|
| 54 |
|
| 55 |
def extract_colleges(text):
    """Return education-related organisation names found in *text*, one per line.

    The text is run through the module-level ``nlp`` pipeline and scanned
    sentence by sentence. An entity is kept when it is labelled ``ORG`` and
    its lower-cased text contains any of the education keywords below as a
    substring. Names are kept in document order and repeated mentions are
    not collapsed.

    Returns a newline-joined string, or an empty string when nothing matches.
    """
    edu_keywords = ["university", "college", "institute", "school", "academy", "polytechnic", "faculty", "department", "center", "centre", "campus", "educational", "institute of technology"]

    def mentions_education(name):
        # Substring match on the lower-cased name, so "University" and
        # "UNIVERSITY" both hit the "university" keyword.
        lowered = name.lower()
        return any(keyword in lowered for keyword in edu_keywords)

    found = []
    for sentence in nlp(text).sents:
        found.extend(
            entity.text
            for entity in sentence.ents
            if entity.label_ == "ORG" and mentions_education(entity.text)
        )

    return "\n".join(found)
|
| 68 |
|
| 69 |
def extract_years_of_experience(text):
|
| 70 |
years = re.findall(r'(\d+)\s+year[s]*', text, re.IGNORECASE)
|
|
|
|
| 125 |
summary = extract_summary(doc)
|
| 126 |
linkedin = extract_linkedin(text)
|
| 127 |
|
| 128 |
+
return companies, colleges, years_of_experience, phone, email, summary, linkedin
|
| 129 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 130 |
except Exception as e:
|
| 131 |
import traceback
|
| 132 |
+
return f"An error occurred while parsing the resume: {str(e)}\n\nTraceback:\n{traceback.format_exc()}"
|
| 133 |
|
| 134 |
+
# Gradio UI: one text box per extracted field. The boxes are listed in the
# same order as the values in parse_resume's success-path return tuple
# (companies, colleges, years of experience, phone, email, summary, LinkedIn).
output_fields = [
    gr.Textbox(label="Companies Worked For", lines=10),
    gr.Textbox(label="Colleges Attended", lines=10),
    gr.Textbox(label="Years of Experience"),
    gr.Textbox(label="Phone Number"),
    gr.Textbox(label="Email ID"),
    gr.Textbox(label="Summary", lines=3),
    gr.Textbox(label="LinkedIn ID"),
]

iface = gr.Interface(
    fn=parse_resume,
    inputs=gr.File(label="Upload Resume (PDF or DOCX)"),
    outputs=output_fields,
    title="Advanced Resume Parser",
    description="Upload a resume in PDF or DOCX format to extract key information.",
)

# Start the app; share=True asks Gradio for a public share link.
iface.launch(share=True)
|