Spaces:
Sleeping
Sleeping
| import pypdf | |
| import docx | |
| import re | |
| from io import BytesIO | |
class ResumeParser:
    """Extract plain text from PDF/DOCX resumes and detect known skills."""

    # Skills reported by parse(); the output list follows this order.
    SKILL_KEYWORDS = (
        'python', 'java', 'javascript', 'html', 'css', 'sql', 'react', 'angular', 'vue',
        'node', 'express', 'django', 'flask', 'spring', 'docker', 'kubernetes', 'aws',
        'azure', 'git', 'jenkins', 'jira',
    )

    # A keyword only counts when it is not glued to other letters, so "java"
    # is not reported for "javascript" and "git" is not reported for
    # "digital".  Digits are allowed next to a keyword ("html5", "python3"),
    # and an optional "js" suffix keeps spellings such as "nodejs"/"reactjs".
    _SKILL_PATTERN = re.compile(
        r"(?<![a-z])("
        + "|".join(re.escape(keyword) for keyword in SKILL_KEYWORDS)
        + r")(?:js)?(?![a-z])"
    )

    @staticmethod
    def _read_bytes(source):
        """Return the raw content of *source* (a file-like object or bytes)."""
        if not hasattr(source, "read"):
            # Already raw bytes.
            return source
        content = source.read()
        if hasattr(source, "seek"):
            # Rewind so the caller can read the same upload again.
            source.seek(0)
        return content

    def extract_text_from_pdf(self, pdf_file) -> str:
        """Return the text of every page of a PDF, one page per line group.

        Args:
            pdf_file: A binary file-like object or the PDF content as bytes.

        Returns:
            The extracted text with surrounding whitespace stripped, or ""
            when the document cannot be read (the error is printed).
        """
        try:
            reader = pypdf.PdfReader(BytesIO(self._read_bytes(pdf_file)))
            # Pages without extractable text contribute an empty line.
            return "\n".join(
                page.extract_text() or "" for page in reader.pages
            ).strip()
        except Exception as e:
            # Best-effort: a broken upload must not crash the caller.
            print(f"Error extracting text from PDF: {e}")
            return ""

    def extract_text_from_docx(self, docx_file) -> str:
        """Return the paragraph text of a DOCX document.

        Args:
            docx_file: A binary file-like object or the DOCX content as bytes.

        Returns:
            The paragraphs joined by newlines with surrounding whitespace
            stripped, or "" when the document cannot be read (the error is
            printed).
        """
        try:
            document = docx.Document(BytesIO(self._read_bytes(docx_file)))
            return "\n".join(
                paragraph.text for paragraph in document.paragraphs
            ).strip()
        except Exception as e:
            # Best-effort: a broken upload must not crash the caller.
            print(f"Error extracting text from DOCX: {e}")
            return ""

    def extract_text(self, file) -> str:
        """Extract text from an uploaded file, dispatching on its extension.

        Args:
            file: A seekable file-like object whose ``name`` attribute holds
                the file name.

        Returns:
            The extracted text, or "" for unsupported or unnamed files.
        """
        # Reset file pointer to beginning
        file.seek(0)
        # Compare case-insensitively so "Resume.PDF" is handled like ".pdf".
        filename = (getattr(file, "name", "") or "").lower()
        if filename.endswith('.pdf'):
            return self.extract_text_from_pdf(file)
        if filename.endswith('.docx'):
            return self.extract_text_from_docx(file)
        return ""

    def parse(self, file) -> dict:
        """Parse a resume file into skills and raw text.

        Args:
            file: The uploaded file, passed to ``extract_text``.

        Returns:
            A dict with the keys ``"skills"`` (matched keywords in
            ``SKILL_KEYWORDS`` order), ``"experience"`` and ``"education"``
            (always empty lists), and ``"raw_text"`` (the extracted text).
        """
        text = self.extract_text(file)
        # Simple keyword-based parsing
        found = set(self._SKILL_PATTERN.findall(text.lower()))
        skills = [skill for skill in self.SKILL_KEYWORDS if skill in found]
        return {
            "skills": skills,
            "experience": [],
            "education": [],
            "raw_text": text
        }