"""Helpers for extracting plain text from resume files (PDF, DOCX, TXT)."""

import io
from pathlib import Path

# The third-party parsers are imported defensively so that formats which do
# not need them (for example plain text) keep working when a parser is not
# installed. The module-level names are still defined either way.
try:
    import PyPDF2
except ImportError:
    PyPDF2 = None

try:
    from docx import Document
except ImportError:
    Document = None


def extract_text_from_pdf(file_content: bytes) -> str:
    """Extract text from PDF file.

    Args:
        file_content: Binary content of the PDF file.

    Returns:
        The text of all pages joined by newlines, with leading and trailing
        whitespace removed.

    Raises:
        Exception: If the PDF cannot be read; the message is prefixed with
            "Error reading PDF: " and the original error is chained.
    """
    try:
        if PyPDF2 is None:
            raise ImportError("PyPDF2 is not installed")
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # Defensive: treat a page that yields no text (None) as empty rather
        # than failing the whole document on the string concatenation.
        page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
        return "\n".join(page_texts).strip()
    except Exception as e:
        raise Exception(f"Error reading PDF: {str(e)}") from e


def extract_text_from_docx(file_content: bytes) -> str:
    """Extract text from DOCX file.

    Paragraph text comes first (one paragraph per line), followed by table
    content (one row per line, each cell followed by a space).

    Args:
        file_content: Binary content of the DOCX file.

    Returns:
        The extracted text with leading and trailing whitespace removed.

    Raises:
        Exception: If the document cannot be read; the message is prefixed
            with "Error reading DOCX: " and the original error is chained.
    """
    try:
        if Document is None:
            raise ImportError("python-docx is not installed")
        doc = Document(io.BytesIO(file_content))
        pieces = [para.text + "\n" for para in doc.paragraphs]
        for table in doc.tables:
            for row in table.rows:
                row_text = "".join(cell.text + " " for cell in row.cells)
                pieces.append(row_text + "\n")
        return "".join(pieces).strip()
    except Exception as e:
        raise Exception(f"Error reading DOCX: {str(e)}") from e


def extract_text_from_txt(file_content: bytes) -> str:
    """Extract text from TXT file.

    Args:
        file_content: Binary content of the text file, expected to be UTF-8
            (with or without a byte order mark).

    Returns:
        The decoded text with leading and trailing whitespace removed.

    Raises:
        Exception: If the content cannot be decoded; the message is prefixed
            with "Error reading TXT: " and the original error is chained.
    """
    try:
        # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
        # BOM, which str.strip() would otherwise leave in the result.
        return file_content.decode('utf-8-sig').strip()
    except Exception as e:
        raise Exception(f"Error reading TXT: {str(e)}") from e


def parse_resume(file_content: bytes, file_extension: str) -> str:
    """
    Parse resume based on file type

    Args:
        file_content: Binary file content
        file_extension: File extension (.pdf, .docx, .txt); matched
            case-insensitively

    Returns:
        Extracted text from resume

    Raises:
        ValueError: If the extension is not one of the supported formats.
        Exception: If the matching extractor fails to read the content.
    """
    file_extension = file_extension.lower()

    if file_extension == ".pdf":
        return extract_text_from_pdf(file_content)
    # NOTE(review): ".doc" is routed to the DOCX reader. Legacy binary Word
    # files are presumably not readable that way and would surface as an
    # "Error reading DOCX" exception -- confirm whether ".doc" should stay.
    elif file_extension in [".docx", ".doc"]:
        return extract_text_from_docx(file_content)
    elif file_extension == ".txt":
        return extract_text_from_txt(file_content)
    else:
        raise ValueError(f"Unsupported file format: {file_extension}")


def extract_from_uploaded_file(uploaded_file) -> str:
    """
    Extract text from uploaded file object

    Args:
        uploaded_file: Streamlit uploaded file object (anything exposing a
            ``name`` attribute and a ``read()`` method works)

    Returns:
        Extracted text
    """
    file_extension = Path(uploaded_file.name).suffix.lower()

    # Rewind first so that a stream which was already consumed elsewhere is
    # read from the start instead of yielding empty content.
    rewind = getattr(uploaded_file, "seek", None)
    if callable(rewind):
        try:
            rewind(0)
        except (OSError, ValueError):
            # Non-seekable stream: fall back to reading from the current
            # position, as before.
            pass

    file_content = uploaded_file.read()
    return parse_resume(file_content, file_extension)