from io import BytesIO
from pathlib import Path

import docx
import openpyxl
import pdfplumber
from pptx import Presentation
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas

# Layout used when rendering plain text onto PDF pages (canvas coordinates).
_PDF_LEFT_X = 72
_PDF_TOP_Y = 750
_PDF_LINE_STEP = 12
_PDF_BOTTOM_Y = 50


def extract_text(file_path: Path, file_type: str) -> str:
    """Return the plain text found in *file_path*, stripped of outer whitespace.

    Args:
        file_path: Location of the document to read.
        file_type: One of ``"txt"``, ``"docx"``, ``"xlsx"``, ``"pptx"`` or
            ``"pdf"``. Any other value yields an empty string.

    Returns:
        The extracted text:

        * txt  - the file contents, decoded as UTF-8.
        * docx - non-empty paragraphs joined by newlines.
        * xlsx - non-``None`` cell values of the active sheet, in row order,
          separated by single spaces.
        * pptx - non-blank paragraphs of text frames and non-blank table
          cells, one per line, in slide/shape order.
        * pdf  - the text of every page that yields any, joined by newlines.
    """
    text = ""
    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    elif file_type == "docx":
        doc = docx.Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs if para.text)
    elif file_type == "xlsx":
        wb = openpyxl.load_workbook(file_path)
        sheet = wb.active
        # Joining once avoids rebuilding the string for every cell; the
        # final strip() makes this equivalent to appending "value " per cell.
        text = " ".join(
            str(cell.value)
            for row in sheet.rows
            for cell in row
            if cell.value is not None
        )
    elif file_type == "pptx":
        text = "\n".join(_iter_pptx_text(Presentation(file_path)))
    elif file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Bind the page text once: extracting it is the expensive step,
            # so it must not run a second time just to produce the value.
            text = "\n".join(
                page_text
                for page in pdf.pages
                if (page_text := page.extract_text())
            )
    return text.strip()


def _iter_pptx_text(prs):
    """Yield each non-blank, stripped text fragment of a presentation."""
    for slide in prs.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                for paragraph in shape.text_frame.paragraphs:
                    if (clean_text := paragraph.text.strip()):
                        yield clean_text
            elif shape.has_table:
                for row in shape.table.rows:
                    for cell in row.cells:
                        if (cell_text := cell.text.strip()):
                            yield cell_text


def save_file(text: str, original_path: Path, file_type: str, output_path: Path):
    """Write *text* to *output_path* as a document of the given type.

    Args:
        text: The content to store.
        original_path: Not used by this function; kept so existing callers
            keep working.
        file_type: ``"docx"``, ``"xlsx"``, ``"pptx"`` or ``"pdf"``. Any other
            value writes *text* as a UTF-8 plain-text file.
        output_path: Destination file.

    The generated documents are minimal:

    * docx - a single paragraph holding the whole text.
    * xlsx - one line of text per row in column 1 of the active sheet.
    * pptx - one slide (layout index 1) with the text in placeholder 1.
    * pdf  - letter-size pages, one ``drawString`` per line; lines are not
      wrapped, and a new page starts when the cursor drops below the
      bottom margin.
    """
    if file_type == "docx":
        doc = docx.Document()
        doc.add_paragraph(text)
        doc.save(output_path)
    elif file_type == "xlsx":
        wb = openpyxl.Workbook()
        sheet = wb.active
        for i, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=i, column=1, value=line)
        wb.save(output_path)
    elif file_type == "pptx":
        prs = Presentation()
        slide_layout = prs.slide_layouts[1]
        slide = prs.slides.add_slide(slide_layout)
        content = slide.shapes.placeholders[1]
        content.text = text
        prs.save(output_path)
    elif file_type == "pdf":
        # Render into memory first so that a failure while drawing does not
        # leave an empty or truncated file at output_path.
        pdf_buffer = BytesIO()
        c = canvas.Canvas(pdf_buffer, pagesize=letter)
        y = _PDF_TOP_Y
        for line in text.split("\n"):
            c.drawString(_PDF_LEFT_X, y, line)
            y -= _PDF_LINE_STEP
            if y < _PDF_BOTTOM_Y:
                c.showPage()
                y = _PDF_TOP_Y
        c.save()
        with open(output_path, "wb") as f:
            f.write(pdf_buffer.getvalue())
    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)