# WallTD-v.1 / utils.py
# Last change: "Update utils.py" (commit 30476c0, verified; author Feriel080; 5.09 kB).
from pptx import Presentation
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
import docx
from pathlib import Path
import openpyxl
import re
def extract_text(file_path: Path, file_type: str) -> str:
    """Extract the plain text content of a document.

    Args:
        file_path: Location of the document to read.
        file_type: Format selector, compared exactly (case-sensitive) against
            "txt", "docx", "xlsx", "pptx" and "pdf".

    Returns:
        The extracted text with leading and trailing whitespace removed.
        An unrecognised ``file_type`` yields an empty string.

    Per-format behaviour:
        * txt  - the file is read as UTF-8.
        * docx - non-empty paragraphs joined with newlines.
        * xlsx - every non-empty cell of the active worksheet, in row order,
          joined with single spaces (other worksheets are not read).
        * pptx - non-blank paragraphs of text frames and non-blank table
          cells, one per line.
        * pdf  - text of every page that yields any, joined with newlines.
    """
    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    elif file_type == "docx":
        doc = docx.Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs if para.text)
    elif file_type == "xlsx":
        wb = openpyxl.load_workbook(file_path)
        sheet = wb.active
        # Collect the pieces and join once instead of growing a string with +=.
        cell_values = []
        for row in sheet.rows:
            for cell in row:
                if cell.value is not None:
                    cell_values.append(str(cell.value))
        text = " ".join(cell_values)
    elif file_type == "pptx":
        prs = Presentation(file_path)
        fragments = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        if (clean_text := paragraph.text.strip()):
                            fragments.append(clean_text)
                elif shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            if (cell_text := cell.text.strip()):
                                fragments.append(cell_text)
        text = "\n".join(fragments)
    elif file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Extract each page exactly once; the walrus keeps the value that
            # the filter already computed instead of extracting it again.
            text = "\n".join(
                page_text
                for page in pdf.pages
                if (page_text := page.extract_text())
            )
    else:
        text = ""
    return text.strip()
def save_file(text: str, original_path: Path, file_type: str, output_path: Path) -> None:
    """Write ``text`` to ``output_path`` in the format named by ``file_type``.

    Args:
        text: Content to store.
        original_path: Not used by this function; kept so the signature
            stays compatible with existing callers.
        file_type: "docx", "xlsx", "pptx" or "pdf" (exact, case-sensitive).
            Any other value writes a UTF-8 plain-text file.
        output_path: Destination file.

    Per-format behaviour:
        * docx - a new document holding ``text`` as a single paragraph.
        * xlsx - a new workbook; each line of ``text`` goes into column 1 of
          the active sheet, one row per line starting at row 1.
        * pptx - a new presentation with one slide (layout index 1) whose
          placeholder index 1 receives ``text``.
        * pdf  - letter-size pages, one drawn string per line of ``text``.
    """
    if file_type == "docx":
        doc = docx.Document()
        doc.add_paragraph(text)
        doc.save(output_path)
    elif file_type == "xlsx":
        wb = openpyxl.Workbook()
        sheet = wb.active
        for row_index, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=row_index, column=1, value=line)
        wb.save(output_path)
    elif file_type == "pptx":
        prs = Presentation()
        slide_layout = prs.slide_layouts[1]
        slide = prs.slides.add_slide(slide_layout)
        content = slide.shapes.placeholders[1]
        content.text = text
        prs.save(output_path)
    elif file_type == "pdf":
        # Render completely in memory BEFORE touching the destination, so a
        # failure while drawing cannot leave an empty/truncated file behind.
        pdf_buffer = BytesIO()
        c = canvas.Canvas(pdf_buffer, pagesize=letter)
        y = 750  # starting baseline for the first line on each page
        for line in text.split("\n"):
            # NOTE(review): each line is drawn as-is at x=72 with no wrapping;
            # long lines presumably run past the right edge - confirm.
            c.drawString(72, y, line)
            y -= 12  # step down one line
            if y < 50:
                # Bottom margin reached: start a new page.
                c.showPage()
                y = 750
        c.save()
        with open(output_path, "wb") as f:
            f.write(pdf_buffer.getvalue())
    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
def verify_summary(summary: str, original: str) -> str:
    """Keep only the summary sentences that are supported by the original text.

    The summary is split on '.'; for each non-empty sentence the words of at
    least three characters are compared (case-insensitively) against the set
    of words occurring in ``original``. A sentence is kept when at least 30%
    of those words occur in the original as whole words.

    Args:
        summary: Candidate summary text.
        original: Source text the summary should be grounded in.

    Returns:
        The kept sentences joined with '. ' (no trailing period is added).
        If no sentence qualifies, the first 500 characters of ``summary``
        are returned unchanged as a fallback.
    """
    find_words = re.compile(r"\w+").findall
    # Whole-word lookup: a plain substring test on the full text would let
    # "art" match inside "party", and punctuation glued to a token
    # ("barn,") would block genuine matches.
    original_words = set(find_words(original.lower()))
    verified = []
    for sentence in summary.split('.'):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Only words of 3+ characters count as significant.
        significant = [w for w in find_words(sentence.lower()) if len(w) >= 3]
        if not significant:
            continue
        matches = sum(1 for w in significant if w in original_words)
        # Keep sentence if at least 30% of significant words match
        if matches / len(significant) >= 0.3:
            verified.append(sentence)
    return '. '.join(verified) if verified else summary[:500]
def ensure_complete_sentences(text: str) -> str:
    """Return ``text`` reduced to its complete sentences.

    Whitespace runs are collapsed to single spaces, the text is split after
    every '.', '!' or '?' that is followed by whitespace, and only the pieces
    that themselves end in one of those marks are kept. An unterminated
    fragment (typically a truncated final sentence) is dropped.

    Args:
        text: Text to clean up. Non-string or empty values are accepted.

    Returns:
        The kept sentences joined with single spaces. The result is an empty
        string when ``text`` is empty, is not a ``str``, or contains no
        terminated sentence.
    """
    if not text or not isinstance(text, str):
        return ""
    # Normalize whitespace so the split below only ever sees single spaces.
    normalized = ' '.join(text.split())
    # Split on sentence boundaries (lookbehind keeps the punctuation).
    sentences = re.split(r'(?<=[.!?])\s+', normalized)
    # Every kept piece ends with a terminator, so the joined result is either
    # empty or already properly terminated; no further trimming is needed.
    return ' '.join(s for s in sentences if s.endswith(('.', '!', '?')))