Spaces:
Sleeping
Sleeping
| from pptx import Presentation | |
| import pdfplumber | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.pdfgen import canvas | |
| from io import BytesIO | |
| import docx | |
| from pathlib import Path | |
| import openpyxl | |
def extract_text(file_path: Path, file_type: str) -> str:
    """Return the plain text found in a document, stripped of outer whitespace.

    Args:
        file_path: Location of the document to read.
        file_type: Format selector; one of ``"txt"``, ``"docx"``, ``"xlsx"``,
            ``"pptx"`` or ``"pdf"``.

    Returns:
        The extracted text. Per format:

        * ``txt``  - the file content, decoded as UTF-8.
        * ``docx`` - non-empty paragraphs joined by newlines.
        * ``xlsx`` - non-empty cells of the active worksheet, joined by
          single spaces.
        * ``pptx`` - non-blank paragraphs of text frames and non-blank table
          cells, one per line.
        * ``pdf``  - text of every page that yields any, joined by newlines.

        An unrecognised ``file_type`` yields an empty string.
    """
    text = ""

    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

    elif file_type == "docx":
        doc = docx.Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs if para.text)

    elif file_type == "xlsx":
        wb = openpyxl.load_workbook(file_path)
        sheet = wb.active
        cell_values = []
        for row in sheet.rows:
            for cell in row:
                if cell.value is not None:
                    cell_values.append(str(cell.value))
        # Joining once replaces the repeated `text += ...`; the trailing
        # separator it used to leave was removed by the final strip anyway.
        text = " ".join(cell_values)

    elif file_type == "pptx":
        prs = Presentation(file_path)
        fragments = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        if (clean_text := paragraph.text.strip()):
                            fragments.append(clean_text)
                elif shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            if (cell_text := cell.text.strip()):
                                fragments.append(cell_text)
        text = "\n".join(fragments)

    elif file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Extract each page exactly once; running the extraction in both
            # the filter and the value position doubled the work per page.
            page_texts = (page.extract_text() for page in pdf.pages)
            text = "\n".join(
                page_text for page_text in page_texts if page_text
            )

    return text.strip()
def save_file(text: str, original_path: Path, file_type: str, output_path: Path):
    """Write *text* to *output_path* in the format named by *file_type*.

    Args:
        text: Content to store.
        original_path: Not used by this function; kept in the signature for
            existing callers.
        file_type: ``"docx"``, ``"xlsx"``, ``"pptx"`` or ``"pdf"`` select the
            matching writer; any other value writes a UTF-8 text file.
        output_path: Destination file.
    """
    if file_type == "docx":
        document = docx.Document()
        document.add_paragraph(text)
        document.save(output_path)
        return

    if file_type == "xlsx":
        workbook = openpyxl.Workbook()
        worksheet = workbook.active
        # One line of text per row, all in the first column.
        for row_number, row_text in enumerate(text.split("\n"), start=1):
            worksheet.cell(row=row_number, column=1, value=row_text)
        workbook.save(output_path)
        return

    if file_type == "pptx":
        presentation = Presentation()
        new_slide = presentation.slides.add_slide(presentation.slide_layouts[1])
        # The whole text goes into placeholder 1 of the single new slide.
        new_slide.shapes.placeholders[1].text = text
        presentation.save(output_path)
        return

    if file_type == "pdf":
        with open(output_path, "wb") as out:
            buffer = BytesIO()
            pdf_canvas = canvas.Canvas(buffer, pagesize=letter)
            cursor_y = 750
            for row_text in text.split("\n"):
                pdf_canvas.drawString(72, cursor_y, row_text)
                cursor_y -= 12
                # Start a fresh page once the cursor drops below y = 50.
                if cursor_y < 50:
                    pdf_canvas.showPage()
                    cursor_y = 750
            pdf_canvas.save()
            out.write(buffer.getvalue())
        return

    # Every remaining type (including "txt") is stored as plain UTF-8 text.
    with open(output_path, "w", encoding="utf-8") as out:
        out.write(text)