"""Extract plain text from uploaded resume files (PDF or DOCX)."""

import io
import logging
from typing import BinaryIO

import docx
import PyPDF2

logger = logging.getLogger(__name__)


def parse_pdf(file_stream: BinaryIO) -> str:
    """Extract text from a PDF file stream.

    Args:
        file_stream: A file-like object (e.g., from st.file_uploader).

    Returns:
        The text of every page concatenated in page order. A page for which
        ``extract_text()`` returns nothing contributes an empty string.

    Raises:
        ValueError: If reading the PDF fails for any reason. The underlying
            exception is chained as ``__cause__``.
    """
    try:
        reader = PyPDF2.PdfReader(file_stream)
        # ``extract_text()`` may return a falsy value; normalise it to "".
        return "".join(page.extract_text() or "" for page in reader.pages)
    except Exception as e:
        # Boundary handler: any reader failure is reported to callers as a
        # single ValueError, with the original error kept as the cause.
        logger.exception("Error reading PDF: %s", e)
        raise ValueError(
            "Could not parse the PDF file. It might be corrupted or image-based."
        ) from e


def parse_docx(file_stream: BinaryIO) -> str:
    """Extract text from a DOCX file stream.

    Args:
        file_stream: A file-like object.

    Returns:
        The text of every paragraph in ``doc.paragraphs``, each followed by
        a newline.

    Raises:
        ValueError: If reading the document fails for any reason. The
            underlying exception is chained as ``__cause__``.
    """
    try:
        doc = docx.Document(file_stream)
        return "".join(f"{para.text}\n" for para in doc.paragraphs)
    except Exception as e:
        # Boundary handler: surface one ValueError, keep the cause attached.
        logger.exception("Error reading DOCX: %s", e)
        raise ValueError("Could not parse the DOCX file.") from e


def parse_resume(uploaded_file) -> str:
    """Parse an uploaded resume file (PDF or DOCX) and return its text.

    Args:
        uploaded_file: The file object from Streamlit's file_uploader. It
            must expose a ``name`` attribute and a ``getvalue()`` method
            returning the file's bytes.

    Returns:
        The text content of the resume.

    Raises:
        ValueError: If no file was given, the file type is not supported,
            or parsing fails.
    """
    if uploaded_file is None:
        raise ValueError("No file uploaded.")

    # rpartition gives an empty extension when the name contains no dot, so
    # a file literally named "pdf" is rejected rather than parsed as a PDF.
    _, _, extension = uploaded_file.name.rpartition(".")
    file_extension = extension.lower()

    if file_extension not in ("pdf", "docx"):
        raise ValueError(
            f"Unsupported file type: '{file_extension}'. Please upload a PDF or DOCX file."
        )

    # We use BytesIO to handle the file in memory.
    file_stream = io.BytesIO(uploaded_file.getvalue())

    if file_extension == "pdf":
        return parse_pdf(file_stream)
    return parse_docx(file_stream)