WallTD-v.1 / backend /utils.py
Feriel080's picture
Upload 2 files
04ac6c3 verified
raw
history blame
3.06 kB
from pptx import Presentation
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
import docx
from pathlib import Path
import openpyxl
def extract_text(file_path: Path, file_type: str) -> str:
    """Extract the plain-text content of a document.

    Args:
        file_path: Location of the document to read.
        file_type: Format selector, compared case-sensitively. Handled
            values are "txt", "docx", "xlsx", "pptx" and "pdf".

    Returns:
        The extracted text with leading and trailing whitespace removed.
        An unrecognised ``file_type`` yields an empty string rather than
        an error.
    """
    text = ""
    if file_type == "txt":
        # "utf-8-sig" decodes plain UTF-8 as well, but additionally drops a
        # leading byte-order mark, which str.strip() would not remove.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            text = f.read()
    elif file_type == "docx":
        doc = docx.Document(file_path)
        # Empty paragraphs are skipped so they do not produce blank lines.
        text = "\n".join(para.text for para in doc.paragraphs if para.text)
    elif file_type == "xlsx":
        wb = openpyxl.load_workbook(file_path)
        # Only the workbook's active sheet is read.
        sheet = wb.active
        cell_values = []
        for row in sheet.rows:
            for cell in row:
                if cell.value is not None:
                    cell_values.append(str(cell.value))
        # Cells are flattened row by row into one space-separated string.
        text = " ".join(cell_values)
    elif file_type == "pptx":
        prs = Presentation(file_path)
        fragments = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        if (clean_text := paragraph.text.strip()):
                            fragments.append(clean_text)
                elif shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            if (cell_text := cell.text.strip()):
                                fragments.append(cell_text)
        text = "\n".join(fragments)
    elif file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Extract each page exactly once; pages that yield no text
            # (None or "") are skipped.
            text = "\n".join(
                page_text
                for page in pdf.pages
                if (page_text := page.extract_text())
            )
    return text.strip()
def save_file(text: str, original_path: Path, file_type: str, output_path: Path) -> None:
    """Write ``text`` to ``output_path`` in the requested document format.

    Args:
        text: Content to store. Newlines ("\\n") separate rows in the xlsx
            output and drawn lines in the pdf output.
        original_path: Not used by this function; kept so existing callers
            keep working.
        file_type: Format selector, compared case-sensitively. "docx",
            "xlsx", "pptx" and "pdf" produce a new document of that type;
            any other value writes ``text`` as a UTF-8 text file.
        output_path: Destination file; overwritten if it already exists.
    """
    if file_type == "docx":
        doc = docx.Document()
        doc.add_paragraph(text)
        doc.save(output_path)
    elif file_type == "xlsx":
        wb = openpyxl.Workbook()
        sheet = wb.active
        # One line of text per row, all in the first column.
        for row_index, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=row_index, column=1, value=line)
        wb.save(output_path)
    elif file_type == "pptx":
        prs = Presentation()
        # Layout 1 of the default template; the whole text goes into its
        # placeholder at index 1 on a single slide.
        slide_layout = prs.slide_layouts[1]
        slide = prs.slides.add_slide(slide_layout)
        content = slide.shapes.placeholders[1]
        content.text = text
        prs.save(output_path)
    elif file_type == "pdf":
        # Layout values passed to the canvas (reportlab's default unit).
        left_margin = 72
        top_y = 750
        bottom_y = 50
        line_height = 12

        # Render the whole document in memory first so that a failure while
        # drawing never leaves a truncated file at output_path.
        pdf_buffer = BytesIO()
        c = canvas.Canvas(pdf_buffer, pagesize=letter)
        y = top_y
        for line in text.split("\n"):
            c.drawString(left_margin, y, line)
            y -= line_height
            if y < bottom_y:
                # Out of vertical space: start a new page at the top.
                c.showPage()
                y = top_y
        c.save()

        with open(output_path, "wb") as f:
            f.write(pdf_buffer.getvalue())
    else:
        # Fallback for "txt" and any unrecognised type: plain UTF-8 text.
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)