Spaces:
Sleeping
Sleeping
File size: 1,747 Bytes
0326035 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 |
"""
File parsers for resume files: PDF, DOCX, and plain text
"""
from typing import Optional
import io
def extract_text_from_pdf(file_content: bytes) -> str:
    """Extract the text of every page in a PDF document.

    Extraction is best-effort: failures are printed, never raised.

    Args:
        file_content: Raw bytes of the PDF file.

    Returns:
        The page texts joined with newlines, stripped of surrounding
        whitespace. Pages whose extraction fails are skipped. Returns ""
        when PyPDF2 cannot be imported or the document cannot be opened.
    """
    try:
        # Imported inside the try so a missing PyPDF2 is reported and
        # handled like any other extraction failure.
        import PyPDF2

        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_content))
        page_texts = []
        for page_number, page in enumerate(pdf_reader.pages, start=1):
            try:
                # `or ""` keeps the join safe if a page yields no string.
                page_texts.append(page.extract_text() or "")
            except Exception as e:
                # One malformed page must not discard the text already
                # recovered from the other pages.
                print(f"PDF extraction error on page {page_number}: {e}")
        return "\n".join(page_texts).strip()
    except Exception as e:
        print(f"PDF extraction error: {e}")
        return ""
def extract_text_from_docx(file_content: bytes) -> str:
    """Extract text from a DOCX document, including text inside tables.

    Extraction is best-effort: failures are printed, never raised.

    Args:
        file_content: Raw bytes of the .docx file.

    Returns:
        The body paragraphs followed by the non-empty table cell texts,
        one entry per line, stripped of surrounding whitespace. Returns ""
        when the `docx` package cannot be imported or the document cannot
        be parsed.
    """
    try:
        # Imported inside the try so a missing `docx` package is reported
        # and handled like any other extraction failure.
        import docx

        doc = docx.Document(io.BytesIO(file_content))
        lines = [paragraph.text for paragraph in doc.paragraphs]

        # `doc.paragraphs` only covers top-level body paragraphs; content
        # laid out in tables has to be collected separately.
        for table in doc.tables:
            for row in table.rows:
                previous_text = None
                for cell in row.cells:
                    cell_text = cell.text
                    # A horizontally merged cell is reported once per grid
                    # column it spans, so skip consecutive repeats as well
                    # as empty cells.
                    if cell_text and cell_text != previous_text:
                        lines.append(cell_text)
                    previous_text = cell_text

        return "\n".join(lines).strip()
    except Exception as e:
        print(f"DOCX extraction error: {e}")
        return ""
def extract_text_from_file(file_content: bytes, file_type: str) -> str:
    """Extract text from a file, dispatching on its declared type.

    Args:
        file_content: Raw bytes of the file.
        file_type: A MIME type (e.g. "application/pdf") or a file name
            ending in ".pdf", ".docx" or ".txt". Matching is
            case-insensitive and ignores surrounding whitespace.

    Returns:
        The extracted text. Plain-text content is decoded as UTF-8 with a
        leading byte-order mark removed and undecodable bytes dropped.

    Raises:
        ValueError: If file_type is None, empty, or not a supported type.
    """
    # MIME types and file extensions are case-insensitive; normalizing
    # once lets "resume.PDF" or "Application/PDF" match. A None type falls
    # through to the ValueError below instead of an AttributeError.
    normalized = (file_type or "").strip().lower()

    # NOTE(review): "application/msword" normally denotes legacy .doc
    # files, which the DOCX parser is unlikely to read; such uploads
    # probably come back as "" -- confirm whether they should be rejected.
    docx_mime_types = (
        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
        "application/msword",
    )

    if normalized == "application/pdf" or normalized.endswith(".pdf"):
        return extract_text_from_pdf(file_content)
    if normalized in docx_mime_types or normalized.endswith(".docx"):
        return extract_text_from_docx(file_content)
    if normalized.startswith("text/") or normalized.endswith(".txt"):
        # "utf-8-sig" strips a leading BOM so it does not leak into the
        # text as "\ufeff"; input without a BOM decodes as before.
        return file_content.decode("utf-8-sig", errors="ignore")

    raise ValueError(f"Unsupported file type: {file_type}")
|