| | import fitz |
| | import docx |
| | import pandas as pd |
| | from pptx import Presentation |
| | from typing import BinaryIO |
| |
|
| |
|
class DocumentParser:
    """Extract plain text from uploaded documents of several file formats.

    Each ``extract_text_from_*`` static method takes a binary file-like
    object and returns its textual content as one string.  Any failure
    inside an extractor is re-raised as ``ValueError`` with the original
    exception attached as ``__cause__``.  ``parse_document`` dispatches to
    the right extractor based on a file-type string.
    """

    @staticmethod
    def extract_text_from_pdf(file: BinaryIO) -> str:
        """Return the text of every page of a PDF, joined by newlines.

        Args:
            file: Binary stream holding the PDF bytes; it is read in full.

        Returns:
            The per-page text (``page.get_text()``) joined with ``"\\n"``.

        Raises:
            ValueError: If the stream cannot be read or parsed as a PDF.
        """
        try:
            # The context manager closes the document even when a page
            # fails to extract, so the handle is never leaked.
            with fitz.open(stream=file.read(), filetype="pdf") as pdf_document:
                return "\n".join(page.get_text() for page in pdf_document)
        except Exception as e:
            raise ValueError(f"Error extracting text from PDF: {e}") from e

    @staticmethod
    def extract_text_from_word(file: BinaryIO) -> str:
        """Return the text of a Word document's paragraphs and tables.

        Non-blank paragraphs come first (unstripped), followed by one line
        per table row in which the stripped, non-empty cell texts are
        joined with ``" | "``.  Rows with no text are skipped.

        Args:
            file: Binary stream holding the ``.docx`` content.

        Returns:
            All collected lines joined with ``"\\n"``.

        Raises:
            ValueError: If the document cannot be opened or read.
        """
        try:
            doc = docx.Document(file)

            # Body paragraphs, skipping those that are only whitespace.
            full_text = [
                paragraph.text
                for paragraph in doc.paragraphs
                if paragraph.text.strip()
            ]

            # Tables are appended after all paragraphs, one line per row.
            for table in doc.tables:
                for row in table.rows:
                    cells = [cell.text.strip() for cell in row.cells]
                    row_text = " | ".join(cell for cell in cells if cell)
                    if row_text:
                        full_text.append(row_text)

            return "\n".join(full_text)
        except Exception as e:
            raise ValueError(f"Error extracting text from Word: {e}") from e

    @staticmethod
    def extract_text_from_excel(file: BinaryIO) -> str:
        """Return an Excel sheet rendered as a plain-text table.

        NOTE(review): ``pd.read_excel`` is called with default arguments,
        which per the pandas documentation loads only the first worksheet;
        confirm that ignoring further sheets is intended.

        Args:
            file: Binary stream holding the workbook.

        Returns:
            The DataFrame formatted by ``to_string(index=False)``.

        Raises:
            ValueError: If the workbook cannot be read.
        """
        try:
            df = pd.read_excel(file)
            return df.to_string(index=False)
        except Exception as e:
            raise ValueError(f"Error extracting text from Excel: {e}") from e

    @staticmethod
    def extract_text_from_ppt(file: BinaryIO) -> str:
        """Return the text of every text-bearing shape in a presentation.

        Slides and shapes are visited in order; every paragraph of every
        shape exposing a ``text_frame`` attribute is collected, including
        empty paragraphs.

        Args:
            file: Binary stream holding the ``.pptx`` content.

        Returns:
            The collected paragraph texts joined with ``"\\n"``.

        Raises:
            ValueError: If the presentation cannot be opened or read.
        """
        try:
            presentation = Presentation(file)
            text = []
            for slide in presentation.slides:
                for shape in slide.shapes:
                    # Shapes without a text frame carry no paragraphs.
                    if not hasattr(shape, "text_frame"):
                        continue
                    text.extend(
                        paragraph.text
                        for paragraph in shape.text_frame.paragraphs
                    )
            return "\n".join(text)
        except Exception as e:
            raise ValueError(
                f"Error extracting text from PowerPoint: {e}"
            ) from e

    @staticmethod
    def extract_text_from_txt(file: BinaryIO) -> str:
        """Return the content of a text file decoded as UTF-8.

        Args:
            file: Binary stream holding UTF-8 encoded text.

        Returns:
            The decoded text.

        Raises:
            ValueError: If reading fails or the bytes are not valid UTF-8.
        """
        try:
            return file.read().decode('utf-8')
        except Exception as e:
            raise ValueError(f"Error extracting text from TXT: {e}") from e

    @classmethod
    def parse_document(cls, file: BinaryIO, file_type: str) -> str:
        """Extract text from ``file`` using the extractor for ``file_type``.

        Args:
            file: Binary stream holding the document.
            file_type: One of ``pdf``, ``docx``, ``xlsx``, ``pptx`` or
                ``txt``; matched case-insensitively.

        Returns:
            The text produced by the selected extractor.

        Raises:
            ValueError: If ``file_type`` is not supported, or if the
                selected extractor fails.
        """
        parsers = {
            'pdf': cls.extract_text_from_pdf,
            'docx': cls.extract_text_from_word,
            'xlsx': cls.extract_text_from_excel,
            'pptx': cls.extract_text_from_ppt,
            'txt': cls.extract_text_from_txt,
        }

        parser = parsers.get(file_type.lower())
        if parser is None:
            raise ValueError(f"Unsupported file type: {file_type}")

        return parser(file)
| |
|