Spaces:
Sleeping
Sleeping
File size: 3,058 Bytes
04ac6c3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 | from pptx import Presentation
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
import docx
from pathlib import Path
import openpyxl
def extract_text(file_path: Path, file_type: str) -> str:
    """Extract the plain-text content of a document.

    Args:
        file_path: Location of the file to read.
        file_type: Format selector: "txt", "docx", "xlsx", "pptx" or "pdf".
            For any other value the file is not opened at all.

    Returns:
        The extracted text with leading and trailing whitespace removed.
        An empty string is returned when ``file_type`` is not one of the
        handled formats or when nothing could be extracted.
    """
    text = ""
    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    elif file_type == "docx":
        doc = docx.Document(file_path)
        # Empty paragraphs are dropped so they do not produce blank lines.
        text = "\n".join(para.text for para in doc.paragraphs if para.text)
    elif file_type == "xlsx":
        wb = openpyxl.load_workbook(file_path)
        # Only the active worksheet is read; other sheets are ignored.
        sheet = wb.active
        values = []
        for row in sheet.rows:
            for cell in row:
                if cell.value is not None:
                    values.append(str(cell.value))
        # Cells are flattened row by row into one space-separated string.
        text = " ".join(values)
    elif file_type == "pptx":
        prs = Presentation(file_path)
        pieces = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        if (clean_text := paragraph.text.strip()):
                            pieces.append(clean_text)
                elif shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            if (cell_text := cell.text.strip()):
                                pieces.append(cell_text)
        # One line per non-empty paragraph or table cell.
        text = "\n".join(pieces)
    elif file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Bind each page's text once: extraction is the expensive step,
            # so it must not be repeated for the filter and for the value.
            # Pages that yield no text are skipped.
            text = "\n".join(
                page_text
                for page in pdf.pages
                if (page_text := page.extract_text())
            )
    return text.strip()
def save_file(text: str, original_path: Path, file_type: str, output_path: Path) -> None:
    """Write ``text`` to ``output_path`` in the requested document format.

    Args:
        text: Content to store.
        original_path: Path of the source document. It is accepted for
            interface compatibility but is not read by this function.
        file_type: Target format: "docx", "xlsx", "pptx" or "pdf". Any other
            value writes ``text`` as a UTF-8 plain-text file.
        output_path: Destination file; an existing file is overwritten.
    """
    if file_type == "docx":
        doc = docx.Document()
        # The whole text goes into a single paragraph.
        doc.add_paragraph(text)
        doc.save(output_path)
    elif file_type == "xlsx":
        wb = openpyxl.Workbook()
        sheet = wb.active
        # One line of text per row, all in the first column.
        for row_index, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=row_index, column=1, value=line)
        wb.save(output_path)
    elif file_type == "pptx":
        prs = Presentation()
        # Layout index 1 of the default template; the text is placed in the
        # placeholder at index 1 of the new slide.
        slide_layout = prs.slide_layouts[1]
        slide = prs.slides.add_slide(slide_layout)
        content = slide.shapes.placeholders[1]
        content.text = text
        prs.save(output_path)
    elif file_type == "pdf":
        # Page layout in points for the letter-sized canvas.
        left_margin = 72
        top_y = 750
        bottom_y = 50
        line_height = 12

        # Render the whole document in memory first, so a failure while
        # drawing does not leave a truncated file at output_path.
        pdf_buffer = BytesIO()
        c = canvas.Canvas(pdf_buffer, pagesize=letter)
        y = top_y
        for line in text.split("\n"):
            c.drawString(left_margin, y, line)
            y -= line_height
            if y < bottom_y:
                # Out of vertical space: start a new page at the top.
                c.showPage()
                y = top_y
        c.save()
        with open(output_path, "wb") as f:
            f.write(pdf_buffer.getvalue())
    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
|