Spaces:
Sleeping
Sleeping
File size: 2,227 Bytes
# 1aea493
import PyPDF2
from docx import Document
from pathlib import Path
import io
def extract_text_from_pdf(file_content):
    """Return the text of every page of a PDF supplied as raw bytes.

    The text extracted from each page is followed by a newline; the
    combined string is returned with leading and trailing whitespace
    stripped.

    Raises:
        Exception: for any failure while opening or reading the PDF; the
            message is "Error reading PDF: " followed by the original
            error text.
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        # Gather per-page text first, then concatenate once.
        page_chunks = [page.extract_text() + "\n" for page in reader.pages]
        return "".join(page_chunks).strip()
    except Exception as e:
        raise Exception(f"Error reading PDF: {str(e)}")
def extract_text_from_docx(file_content):
    """Return the text of a DOCX document supplied as raw bytes.

    Paragraph text comes first, one paragraph per line. Table content
    follows: every cell's text is followed by a single space and every
    row is terminated by a newline. The combined string is returned with
    leading and trailing whitespace stripped.

    Raises:
        Exception: for any failure while opening or reading the document;
            the message is "Error reading DOCX: " followed by the original
            error text.
    """
    try:
        document = Document(io.BytesIO(file_content))
        # Body paragraphs, each on its own line.
        pieces = [paragraph.text + "\n" for paragraph in document.paragraphs]
        # Tables are appended after all paragraphs, row by row.
        for table in document.tables:
            for row in table.rows:
                pieces.extend(cell.text + " " for cell in row.cells)
                pieces.append("\n")
        return "".join(pieces).strip()
    except Exception as e:
        raise Exception(f"Error reading DOCX: {str(e)}")
def extract_text_from_txt(file_content):
    """Decode a UTF-8 text file supplied as raw bytes and return its text.

    A leading UTF-8 byte-order mark, if present, is dropped so it does not
    end up as an invisible first character of the result. Leading and
    trailing whitespace is stripped.

    Args:
        file_content: Binary file content (UTF-8 encoded, with or without
            a BOM).

    Returns:
        The decoded, stripped text.

    Raises:
        Exception: if the content cannot be decoded; the message is
            "Error reading TXT: " followed by the original error text.
    """
    try:
        # "utf-8-sig" decodes ordinary UTF-8 the same way as "utf-8" but
        # also consumes a leading BOM, which str.strip() would not remove.
        return file_content.decode("utf-8-sig").strip()
    except Exception as e:
        raise Exception(f"Error reading TXT: {str(e)}") from e
def parse_resume(file_content, file_extension):
    """
    Route resume bytes to the extractor that matches the file extension.

    Args:
        file_content: Binary file content
        file_extension: File extension with its leading dot, matched
            case-insensitively (.pdf, .docx, .doc, .txt)

    Returns:
        Extracted text from resume

    Raises:
        ValueError: If the extension is not one of the supported ones.
    """
    ext = file_extension.lower()

    if ext == ".pdf":
        return extract_text_from_pdf(file_content)

    # NOTE(review): ".doc" goes through the DOCX extractor; confirm that
    # legacy binary .doc files are actually readable by it.
    if ext in (".docx", ".doc"):
        return extract_text_from_docx(file_content)

    if ext == ".txt":
        return extract_text_from_txt(file_content)

    raise ValueError(f"Unsupported file format: {ext}")
def extract_from_uploaded_file(uploaded_file):
    """
    Read an uploaded file and return the text extracted from it.

    The extractor is chosen from the lower-cased suffix of the file's
    ``name``; the bytes come from a single ``read()`` call on the object.

    Args:
        uploaded_file: Streamlit uploaded file object (must expose
            ``name`` and ``read()``)

    Returns:
        Extracted text
    """
    suffix = Path(uploaded_file.name).suffix.lower()
    return parse_resume(uploaded_file.read(), suffix)
|