import re
import urllib.request
import spacy
from .utils import TextCleaner
# Load the small English spaCy pipeline once at import time so every
# DataExtractor instance shares the same (expensive-to-load) model.
nlp = spacy.load("en_core_web_sm")
# Recognized resume section headings. DataExtractor.extract_experience
# compares individual spaCy token texts against this list to detect
# section boundaries (so in practice only single-word headings like
# "Experience" or "Skills" can match a single token).
RESUME_SECTIONS = [
    "Contact Information",
    "Objective",
    "Summary",
    "Education",
    "Experience",
    "Skills",
    "Projects",
    "Certifications",
    "Licenses",
    "Awards",
    "Honors",
    "Publications",
    "References",
    "Technical Skills",
    "Computer Skills",
    "Programming Languages",
    "Software Skills",
    "Soft Skills",
    "Language Skills",
    "Professional Skills",
    "Transferable Skills",
    "Work Experience",
    "Professional Experience",
    "Employment History",
    "Internship Experience",
    "Volunteer Experience",
    "Leadership Experience",
    "Research Experience",
    "Teaching Experience",
]
class DataExtractor:
    """Extract structured data (links, emails, phone numbers, names,
    entities, and resume sections) from raw text.

    The raw text is cleaned and parsed with spaCy once at construction,
    so all NLP-based extractors share a single ``Doc``. The regex-based
    extractors run against the original raw text so punctuation and
    formatting are preserved.
    """

    def __init__(self, raw_text: str):
        """Initialize the DataExtractor object.

        Args:
            raw_text (str): The raw input text.
        """
        self.text = raw_text
        self.clean_text = TextCleaner.clean_text(self.text)
        self.doc = nlp(self.clean_text)

    def extract_links(self):
        """Find HTTP(S)/www links in the raw text.

        Returns:
            list: All links found in the text.
        """
        link_pattern = r"\b(?:https?://|www\.)\S+\b"
        return re.findall(link_pattern, self.text)

    def extract_links_extended(self):
        """Download the page at the URL held in ``self.text`` and extract
        href links of known kinds (HTTP, HTTPS, FTP, mailto, LinkedIn,
        GitHub, Twitter).

        NOTE(review): unlike the other extractors, this method treats
        ``self.text`` as a URL to fetch, not as document text.

        Returns:
            list: The extracted links; empty if fetching or parsing fails.
        """
        links = []
        try:
            response = urllib.request.urlopen(self.text)
            html_content = response.read().decode("utf-8")
            pattern = r'href=[\'"]?([^\'" >]+)'
            raw_links = re.findall(pattern, html_content)
            for link in raw_links:
                if link.startswith(
                    (
                        "http://",
                        "https://",
                        "ftp://",
                        "mailto:",
                        "www.linkedin.com",
                        "github.com/",
                        "twitter.com",
                    )
                ):
                    links.append(link)
        except Exception as e:
            # Deliberate best-effort behavior: report the failure and
            # return whatever was collected so far.
            print(f"Error extracting links: {str(e)}")
        return links

    def extract_names(self):
        """Extract person names using spaCy named-entity recognition.

        Returns:
            list: PERSON entity strings found in the cleaned text.
        """
        return [ent.text for ent in self.doc.ents if ent.label_ == "PERSON"]

    def extract_emails(self):
        """Extract email addresses from the raw text.

        Returns:
            list: All email addresses found.
        """
        email_pattern = r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b"
        return re.findall(email_pattern, self.text)

    def extract_phone_numbers(self):
        """Extract phone numbers from the raw text.

        Fixes two bugs in the previous implementation: the pattern was
        anchored with ``^...$`` (so it only matched when the *entire* text
        was a single phone number), and its capturing group made
        ``re.findall`` return only the country-code fragment instead of
        the full number.

        Returns:
            list: Full phone-number strings found anywhere in the text.
        """
        phone_number_pattern = (
            r"(?:\+\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}"
        )
        return [m.group(0) for m in re.finditer(phone_number_pattern, self.text)]

    def extract_experience(self):
        """Extract the "Experience" section of the document.

        Walks the spaCy tokens in order: collection starts when an
        "Experience" heading token is seen and stops at the next
        recognized section heading.

        Returns:
            str: The experience section joined into a single string.
        """
        experience_section = []
        in_experience_section = False
        for token in self.doc:
            if token.text in RESUME_SECTIONS:
                # Bug fix: the original test
                # `token.text == "Experience" or "EXPERIENCE" or "experience"`
                # was always truthy (non-empty string literals), so *every*
                # section heading started collection.
                in_experience_section = token.text.lower() == "experience"
            if in_experience_section:
                experience_section.append(token.text)
        return " ".join(experience_section)

    def extract_position_year(self):
        """Extract (position, start year, end year) triples such as
        "Software Engineer, 2019 - 2021" or "..., 2019 - present".

        Returns:
            list: Tuples of (two-word position, start year,
                end year or "present").
        """
        position_year_search_pattern = (
            r"(\b\w+\b\s+\b\w+\b),\s+(\d{4})\s*-\s*(\d{4}|\bpresent\b)"
        )
        return re.findall(position_year_search_pattern, self.text)

    def extract_particular_words(self):
        """Extract nouns and proper nouns from the cleaned text.

        Returns:
            list: Token strings whose part-of-speech is NOUN or PROPN.
        """
        pos_tags = ("NOUN", "PROPN")
        return [token.text for token in self.doc if token.pos_ in pos_tags]

    def extract_entities(self):
        """Extract unique GPE (geopolitical) and ORG (organization)
        entities from the cleaned text.

        Returns:
            list: De-duplicated entity strings (order not guaranteed).
        """
        entity_labels = ("GPE", "ORG")
        return list(
            {ent.text for ent in self.doc.ents if ent.label_ in entity_labels}
        )