Spaces:
Sleeping
Sleeping
| import PyPDF2 | |
| import docx | |
| import io | |
def parse_pdf(file_stream):
    """
    Extracts text from a PDF file stream.

    Args:
        file_stream: A file-like object (e.g., from st.file_uploader).

    Returns:
        str: The text of all pages concatenated in page order. A page whose
            extraction result is falsy contributes an empty string.

    Raises:
        ValueError: If opening the stream or extracting text fails for any
            reason. The underlying exception is chained as ``__cause__``.
    """
    try:
        reader = PyPDF2.PdfReader(file_stream)
        # "or ''" guards join() against a falsy (e.g. None) extraction result.
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        print(f"Error reading PDF: {e}")
        # Chain the original error so the real cause survives in tracebacks.
        raise ValueError("Could not parse the PDF file. It might be corrupted or image-based.") from e
def parse_docx(file_stream):
    """
    Extracts text from a DOCX file stream.

    Args:
        file_stream: A file-like object.

    Returns:
        str: The text of every paragraph in ``doc.paragraphs``, each followed
            by a newline. An empty string if there are no paragraphs.

    Raises:
        ValueError: If opening the document or reading its paragraphs fails
            for any reason. The underlying exception is chained as
            ``__cause__``.
    """
    try:
        doc = docx.Document(file_stream)
        # Every paragraph, including the last, is terminated by "\n".
        return "".join(f"{para.text}\n" for para in doc.paragraphs)
    except Exception as e:
        print(f"Error reading DOCX: {e}")
        # Chain the original error so the real cause survives in tracebacks.
        raise ValueError("Could not parse the DOCX file.") from e
def parse_resume(uploaded_file):
    """
    Parses an uploaded resume file (PDF or DOCX) and returns its text content.

    The parser is chosen from the file name's extension (case-insensitive),
    not from the file's contents.

    Args:
        uploaded_file: The file object from Streamlit's file_uploader. It must
            expose a ``name`` string and a ``getvalue()`` method returning the
            file's bytes.

    Returns:
        str: The text content of the resume.

    Raises:
        ValueError: If no file was given, the file type is not supported
            (including a name with no extension), or parsing fails.
    """
    if uploaded_file is None:
        raise ValueError("No file uploaded.")

    # Only text after the last dot is an extension. Without a dot there is no
    # extension at all, so a file literally named "pdf" is not treated as a PDF.
    _, dot, suffix = uploaded_file.name.rpartition('.')
    file_extension = suffix.lower() if dot else ""

    # We use BytesIO to handle the file in memory
    file_stream = io.BytesIO(uploaded_file.getvalue())

    if file_extension == 'pdf':
        return parse_pdf(file_stream)
    elif file_extension == 'docx':
        return parse_docx(file_stream)
    else:
        raise ValueError(f"Unsupported file type: '{file_extension}'. Please upload a PDF or DOCX file.")