Spaces:
Sleeping
Sleeping
| from langchain_community.document_loaders import PyMuPDFLoader | |
| from nltk.corpus import stopwords | |
| from nltk.stem import PorterStemmer | |
| from nltk.stem import WordNetLemmatizer | |
| import re | |
| import string | |
def load_pdf(file_path):
    """Read the PDF at ``file_path`` with ``PyMuPDFLoader``.

    Returns whatever ``PyMuPDFLoader(file_path).load()`` produces, unchanged.
    ``get_full_resume_text`` iterates that result and reads ``page_content``
    from each item.
    """
    return PyMuPDFLoader(file_path).load()
def clean_text(text: str) -> str:
    """Normalize raw resume text into lowercase, lemmatized keywords.

    Steps, in order:

    1. Delete ASCII punctuation plus a set of typographic glyphs (bullets,
       ellipsis, curly quotes, en/em dashes). Characters are deleted, not
       replaced by a space, so ``"front-end"`` becomes ``"frontend"``.
    2. Delete every run of digits.
    3. Lowercase and split on whitespace (which also collapses repeated
       whitespace and newlines).
    4. Drop NLTK English stopwords.
    5. Lemmatize each remaining token with ``WordNetLemmatizer`` and join
       the tokens with single spaces.

    Args:
        text: Raw text to clean.

    Returns:
        The cleaned, space-separated token string (``""`` when nothing
        survives the filtering).

    Note:
        Requires the NLTK ``stopwords`` and ``wordnet`` corpora to be
        installed; NLTK reports a missing corpus with ``LookupError``.
    """
    # NOTE(review): the previous literal here was mojibake ("βββ’β¦": UTF-8
    # bytes decoded with a Greek code page), so the Greek letter beta was being
    # stripped instead of the intended glyphs. The exact original set cannot
    # be recovered with certainty; this covers the plausible candidates.
    # Written as escapes so the set survives any future re-encoding.
    special_characters = (
        "\u2022\u25e6\u25cb\u25cf"  # bullets: • ◦ ○ ●
        "\u2026"                    # horizontal ellipsis
        "\u201c\u201d\u2018\u2019"  # curly double / single quotes
        "\u2013\u2014"              # en dash, em dash
    )

    # One C-level pass removes both the typographic glyphs and ASCII
    # punctuation (string.punctuation alone is ASCII-only).
    removal_table = str.maketrans("", "", string.punctuation + special_characters)
    text = text.translate(removal_table)

    # Remove numbers.
    text = re.sub(r"\d+", "", text)

    stop_words = set(stopwords.words("english"))
    lemmatizer = WordNetLemmatizer()

    # str.split() with no argument already discards extra whitespace, so a
    # separate normalization pass is unnecessary.
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in text.lower().split()
        if word not in stop_words
    )
def get_full_resume_text(file_path):
    """Return the cleaned text of the entire PDF at ``file_path``.

    Every item returned by ``load_pdf`` contributes its ``page_content``
    followed by a blank-line separator; the concatenated text is then passed
    through ``clean_text`` and the result is returned.
    """
    raw_text = "".join(
        page.page_content + "\n\n" for page in load_pdf(file_path)
    )
    return clean_text(raw_text)
def process_pdf(file):
    """Extract cleaned resume text from an uploaded file object.

    ``file`` must expose a ``name`` attribute, which is handed to
    ``get_full_resume_text`` as the PDF path.
    NOTE(review): presumably a UI upload / temp-file wrapper; confirm
    against the caller.
    """
    pdf_path = file.name
    return get_full_resume_text(pdf_path)