# File size: 2,987 Bytes | bd91918
import fitz  # PyMuPDF
import docx
import pandas as pd
from pptx import Presentation
from typing import BinaryIO
class DocumentParser:
    """Extract plain text from PDF, Word, Excel, PowerPoint and text files.

    Every extractor takes a binary file-like object and returns the text it
    found as a single string. Any failure inside an extractor is re-raised
    as ``ValueError`` with the original exception attached as ``__cause__``.
    """

    @staticmethod
    def extract_text_from_pdf(file: BinaryIO) -> str:
        """Return the text of every page of a PDF, joined by newlines.

        Args:
            file: Binary stream containing the PDF data; it is read fully.

        Returns:
            The page texts in document order, separated by ``"\\n"``.

        Raises:
            ValueError: If the data cannot be opened or read as a PDF.
        """
        try:
            # The context manager closes the document even when a page
            # fails to render, so the underlying handle is never leaked.
            with fitz.open(stream=file.read(), filetype="pdf") as pdf_document:
                return "\n".join(page.get_text() for page in pdf_document)
        except Exception as e:
            raise ValueError(f"Error extracting text from PDF: {e}") from e

    @staticmethod
    def extract_text_from_word(file: BinaryIO) -> str:
        """Return the paragraph and table text of a Word document.

        Non-blank paragraphs come first, followed by one line per table row
        in which the non-empty cells are joined with ``" | "``. Rows whose
        cells are all empty are skipped.

        Args:
            file: Binary stream containing the ``.docx`` data.

        Returns:
            The collected lines joined by ``"\\n"``.

        Raises:
            ValueError: If the document cannot be opened or read.
        """
        try:
            doc = docx.Document(file)
            full_text = [
                paragraph.text
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            ]
            for table in doc.tables:
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells]
                    row_text = " | ".join(cell for cell in cells if cell)
                    if row_text:
                        full_text.append(row_text)
            return "\n".join(full_text)
        except Exception as e:
            raise ValueError(f"Error extracting text from Word: {e}") from e

    @staticmethod
    def extract_text_from_excel(file: BinaryIO) -> str:
        """Return a worksheet rendered as a plain-text table.

        The workbook is read with pandas' default arguments, so only the
        default (first) worksheet is included in the output.

        Args:
            file: Binary stream containing the workbook data.

        Returns:
            The DataFrame rendered by ``DataFrame.to_string(index=False)``.

        Raises:
            ValueError: If the workbook cannot be read.
        """
        try:
            df = pd.read_excel(file)
            return df.to_string(index=False)
        except Exception as e:
            raise ValueError(f"Error extracting text from Excel: {e}") from e

    @staticmethod
    def extract_text_from_ppt(file: BinaryIO) -> str:
        """Return the text of every text-bearing shape in a presentation.

        Args:
            file: Binary stream containing the ``.pptx`` data.

        Returns:
            One line per paragraph, in slide and shape order, joined by
            ``"\\n"``.

        Raises:
            ValueError: If the presentation cannot be opened or read.
        """
        try:
            presentation = Presentation(file)
            text = []
            for slide in presentation.slides:
                for shape in slide.shapes:
                    # Pictures, connectors and similar shapes carry no text
                    # frame; the flag is read defensively so an unexpected
                    # shape type is skipped instead of raising.
                    if not getattr(shape, "has_text_frame", False):
                        continue
                    text.extend(
                        paragraph.text
                        for paragraph in shape.text_frame.paragraphs
                    )
            return "\n".join(text)
        except Exception as e:
            raise ValueError(
                f"Error extracting text from PowerPoint: {e}"
            ) from e

    @staticmethod
    def extract_text_from_txt(file: BinaryIO) -> str:
        """Return the contents of a UTF-8 encoded text file.

        Args:
            file: Binary stream containing the text data; it is read fully.

        Returns:
            The decoded text, without a leading byte-order mark.

        Raises:
            ValueError: If the data is not valid UTF-8.
        """
        try:
            # "utf-8-sig" decodes plain UTF-8 identically but drops a
            # leading BOM instead of leaving U+FEFF at the start of the text.
            return file.read().decode('utf-8-sig')
        except Exception as e:
            raise ValueError(f"Error extracting text from TXT: {e}") from e

    @classmethod
    def parse_document(cls, file: BinaryIO, file_type: str) -> str:
        """Extract text from ``file`` using the parser for ``file_type``.

        Args:
            file: Binary stream containing the document data.
            file_type: One of ``pdf``, ``docx``, ``xlsx``, ``pptx`` or
                ``txt``. Matching ignores case, surrounding whitespace and
                leading dots, so ``".PDF"`` selects the PDF parser.

        Returns:
            The text returned by the selected extractor.

        Raises:
            ValueError: If ``file_type`` is not supported, or if the
                selected extractor fails.
        """
        parsers = {
            'pdf': cls.extract_text_from_pdf,
            'docx': cls.extract_text_from_word,
            'xlsx': cls.extract_text_from_excel,
            'pptx': cls.extract_text_from_ppt,
            'txt': cls.extract_text_from_txt,
        }
        normalized_type = file_type.strip().lower().lstrip(".")
        parser = parsers.get(normalized_type)
        if parser is None:
            # Echo the caller's original value so the message matches what
            # they actually passed in.
            raise ValueError(f"Unsupported file type: {file_type}")
        return parser(file)
|