Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| from docx import Document | |
| from pathlib import Path | |
| import io | |
def extract_text_from_pdf(file_content):
    """Extract the text of every page of a PDF document.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        The text of all pages separated by newlines, with leading and
        trailing whitespace removed.

    Raises:
        Exception: If the document cannot be opened or a page cannot be
            read. The message is prefixed with "Error reading PDF:" and
            the underlying error is attached as ``__cause__``.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # `or ""` guards against a page yielding None instead of a string
        # (presumably possible for pages without a text layer in some
        # PyPDF2 releases - confirm against the pinned version).
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    except Exception as e:
        # Chain the original error so the real traceback is not lost.
        raise Exception(f"Error reading PDF: {str(e)}") from e
    # Joining with "\n" and stripping gives the same result as appending
    # "\n" after each page and stripping, without repeated concatenation.
    return "\n".join(page_texts).strip()
def extract_text_from_docx(file_content):
    """Extract paragraph and table text from a DOCX document.

    All paragraph texts are emitted first (one per line), followed by the
    text of every table: each cell is followed by a space and each row is
    terminated by a newline. Tables therefore appear after the paragraphs
    rather than at their position in the document.

    Args:
        file_content: Raw bytes of the DOCX file.

    Returns:
        The collected text with leading and trailing whitespace removed.

    Raises:
        Exception: If the document cannot be opened or read. The message
            is prefixed with "Error reading DOCX:" and the underlying
            error is attached as ``__cause__``.
    """
    try:
        doc = Document(io.BytesIO(file_content))
        pieces = [para.text + "\n" for para in doc.paragraphs]
        for table in doc.tables:
            for row in table.rows:
                pieces.extend(cell.text + " " for cell in row.cells)
                pieces.append("\n")
    except Exception as e:
        # Chain the original error so the real traceback is not lost.
        raise Exception(f"Error reading DOCX: {str(e)}") from e
    # A single join replaces the repeated string concatenation.
    return "".join(pieces).strip()
def extract_text_from_txt(file_content):
    """Decode a UTF-8 text file and return its stripped contents.

    Args:
        file_content: Raw bytes of the text file.

    Returns:
        The decoded text with leading and trailing whitespace removed.
        A leading UTF-8 byte-order mark, if present, is dropped.

    Raises:
        Exception: If the bytes are not valid UTF-8. The message is
            prefixed with "Error reading TXT:" and the underlying error
            is attached as ``__cause__``.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 identically but also consumes a
        # leading BOM, which would otherwise survive as "\ufeff" at the
        # start of the text (str.strip() does not remove it).
        return file_content.decode("utf-8-sig").strip()
    except Exception as e:
        # Chain the original error so the real traceback is not lost.
        raise Exception(f"Error reading TXT: {str(e)}") from e
def parse_resume(file_content, file_extension):
    """Parse a resume by dispatching on its file extension.

    Args:
        file_content: Binary file content.
        file_extension: File extension such as ".pdf", ".docx", ".doc" or
            ".txt". Matching is case-insensitive; surrounding whitespace
            is ignored and a missing leading dot is added.

    Returns:
        Extracted text from the resume.

    Raises:
        ValueError: If the extension is not supported, or if a
            ".doc"/".docx" upload is actually an OLE2 binary container,
            which the DOCX extractor cannot read.
        Exception: Propagated from the format-specific extractor when the
            file cannot be read.
    """
    # Signature of an OLE2 compound file, the container used by legacy
    # binary Word documents (and by password-protected Office files).
    ole2_signature = b"\xd0\xcf\x11\xe0\xa1\xb1\x1a\xe1"

    extension = file_extension.strip().lower()
    if extension and not extension.startswith("."):
        extension = "." + extension

    if extension == ".pdf":
        return extract_text_from_pdf(file_content)
    if extension in (".docx", ".doc"):
        # ".doc" is kept because such files are sometimes really DOCX
        # packages; true OLE2 binaries are rejected up front with a clear
        # message instead of an obscure failure inside the DOCX extractor.
        if file_content[: len(ole2_signature)] == ole2_signature:
            raise ValueError(
                f"Unsupported file format: {extension} content is a legacy "
                "or encrypted Word binary, not a .docx package"
            )
        return extract_text_from_docx(file_content)
    if extension == ".txt":
        return extract_text_from_txt(file_content)
    raise ValueError(f"Unsupported file format: {extension}")
def extract_from_uploaded_file(uploaded_file):
    """Extract text from an uploaded file object.

    Args:
        uploaded_file: File-like object exposing ``name`` and ``read()``
            (described by the original code as a Streamlit uploaded file
            object).

    Returns:
        Extracted text.

    Raises:
        ValueError: If the file's extension is not supported.
        Exception: Propagated from ``parse_resume`` when the file cannot
            be read.
    """
    file_extension = Path(uploaded_file.name).suffix.lower()

    # Rewind first: if the stream was already consumed (for example by an
    # earlier preview), read() would return empty bytes and parsing would
    # fail or silently yield no text.
    is_seekable = getattr(uploaded_file, "seekable", None)
    if callable(is_seekable) and is_seekable():
        uploaded_file.seek(0)

    file_content = uploaded_file.read()
    return parse_resume(file_content, file_extension)