Spaces:

Tech-di
/

WallTD-v.1

Sleeping

App Files Files Community

WallTD-v.1 / utils.py

Feriel080

Update utils.py

30476c0 verified 10 months ago

raw

history blame

5.09 kB

	from pptx import Presentation
	import pdfplumber
	from reportlab.lib.pagesizes import letter
	from reportlab.pdfgen import canvas
	from io import BytesIO
	import docx
	from pathlib import Path
	import openpyxl
	import re

	def extract_text(file_path: Path, file_type: str) -> str:
	    """Return the plain text found in a document, stripped of outer whitespace.

	    Args:
	        file_path: Location of the document to read.
	        file_type: Document kind: ``"txt"``, ``"docx"``, ``"xlsx"``,
	            ``"pptx"`` or ``"pdf"``. Matching ignores case and a leading
	            dot, so ``"PDF"`` and ``".pdf"`` are accepted as well.

	    Returns:
	        The extracted text, or an empty string when ``file_type`` is not one
	        of the supported kinds.

	    Notes:
	        * ``xlsx``: only the active worksheet is read; non-empty cell values
	          are joined with single spaces.
	        * ``pptx``: text frames and tables of top-level shapes are read, one
	          paragraph / cell per line.
	        * ``pdf``: pages that yield no text are skipped.
	    """
	    kind = str(file_type or "").strip().lower().lstrip(".")

	    if kind == "txt":
	        with open(file_path, "r", encoding="utf-8") as handle:
	            return handle.read().strip()

	    if kind == "docx":
	        document = docx.Document(file_path)
	        paragraphs = [para.text for para in document.paragraphs if para.text]
	        return "\n".join(paragraphs).strip()

	    if kind == "xlsx":
	        workbook = openpyxl.load_workbook(file_path)
	        sheet = workbook.active
	        values = [
	            str(cell.value)
	            for row in sheet.rows
	            for cell in row
	            if cell.value is not None
	        ]
	        return " ".join(values).strip()

	    if kind == "pptx":
	        lines = []
	        for slide in Presentation(file_path).slides:
	            for shape in slide.shapes:
	                if shape.has_text_frame:
	                    for paragraph in shape.text_frame.paragraphs:
	                        if (clean_text := paragraph.text.strip()):
	                            lines.append(clean_text)
	                # getattr keeps this safe for shape classes that may not
	                # expose ``has_table`` (e.g. pictures in some versions).
	                elif getattr(shape, "has_table", False):
	                    for row in shape.table.rows:
	                        for cell in row.cells:
	                            if (cell_text := cell.text.strip()):
	                                lines.append(cell_text)
	        return "\n".join(lines).strip()

	    if kind == "pdf":
	        with pdfplumber.open(file_path) as pdf:
	            # Text extraction is the expensive step, so run it exactly once
	            # per page and reuse the result for both the filter and the
	            # output.
	            page_texts = [
	                page_text
	                for page in pdf.pages
	                if (page_text := page.extract_text())
	            ]
	        return "\n".join(page_texts).strip()

	    # Unsupported kinds have always produced an empty result, not an error.
	    return ""

	def save_file(text: str, original_path: Path, file_type: str, output_path: Path):
	    """Write ``text`` to ``output_path`` in the format named by ``file_type``.

	    Args:
	        text: Content to store.
	        original_path: Path of the source document. Not used by this
	            function; kept so existing callers keep working.
	        file_type: Target format: ``"docx"``, ``"xlsx"``, ``"pptx"`` or
	            ``"pdf"``. Matching ignores case and a leading dot. Any other
	            value writes ``text`` as a UTF-8 plain-text file.
	        output_path: Destination file.

	    Returns:
	        None.

	    Notes:
	        * ``xlsx``: one line of ``text`` per row, in column A.
	        * ``pptx``: a single slide whose body placeholder holds ``text``.
	        * ``pdf``: letter-size pages, one text line per 12 pt; lines wider
	          than the printable area are wrapped at word boundaries.
	    """
	    kind = str(file_type or "").strip().lower().lstrip(".")

	    if kind == "docx":
	        document = docx.Document()
	        document.add_paragraph(text)
	        document.save(output_path)

	    elif kind == "xlsx":
	        workbook = openpyxl.Workbook()
	        sheet = workbook.active
	        for row_number, line in enumerate(text.split("\n"), start=1):
	            sheet.cell(row=row_number, column=1, value=line)
	        workbook.save(output_path)

	    elif kind == "pptx":
	        presentation = Presentation()
	        slide_layout = presentation.slide_layouts[1]
	        slide = presentation.slides.add_slide(slide_layout)
	        content = slide.shapes.placeholders[1]
	        content.text = text
	        presentation.save(output_path)

	    elif kind == "pdf":
	        # Render fully in memory first so a rendering failure cannot leave
	        # an empty or truncated file behind at output_path.
	        pdf_bytes = _render_pdf(text)
	        with open(output_path, "wb") as handle:
	            handle.write(pdf_bytes)

	    else:
	        with open(output_path, "w", encoding="utf-8") as handle:
	            handle.write(text)


	def _render_pdf(text: str) -> bytes:
	    """Render ``text`` onto letter-size pages and return the PDF bytes."""
	    from reportlab.lib.utils import simpleSplit
	    from reportlab.pdfbase.pdfmetrics import stringWidth

	    font_name, font_size = "Helvetica", 12
	    left_margin, top_y, bottom_y, line_step = 72, 750, 50, 12
	    # Same margin on both sides of the page.
	    max_width = letter[0] - 2 * left_margin

	    buffer = BytesIO()
	    pdf = canvas.Canvas(buffer, pagesize=letter)
	    # Set the font explicitly so the width measurements below are made with
	    # the same font that is used for drawing.
	    pdf.setFont(font_name, font_size)

	    y = top_y
	    for line in text.split("\n"):
	        if stringWidth(line, font_name, font_size) <= max_width:
	            # Short lines are drawn untouched (spacing preserved).
	            pieces = [line]
	        else:
	            # drawString never wraps, so an over-wide line would run off
	            # the right edge of the page; break it at word boundaries.
	            pieces = simpleSplit(line, font_name, font_size, max_width) or [line]

	        for piece in pieces:
	            pdf.drawString(left_margin, y, piece)
	            y -= line_step
	            if y < bottom_y:
	                pdf.showPage()
	                # Re-apply the font after the page break so drawing keeps
	                # matching the measurements.
	                pdf.setFont(font_name, font_size)
	                y = top_y

	    pdf.save()
	    return buffer.getvalue()


	def verify_summary(
	    summary: str,
	    original: str,
	    *,
	    min_ratio: float = 0.3,
	    min_word_length: int = 3,
	) -> str:
	    """Keep only the summary sentences that are supported by the original.

	    The summary is split on ``"."``. For each sentence, the words of at
	    least ``min_word_length`` characters (after surrounding punctuation is
	    stripped) are looked up, case-insensitively, as substrings of
	    ``original``. A sentence is kept when the share of such words found is
	    at least ``min_ratio``.

	    Args:
	        summary: Generated summary to check.
	        original: Source text the summary should be grounded in.
	        min_ratio: Minimum fraction of significant words that must occur in
	            ``original`` for a sentence to be kept.
	        min_word_length: Shortest word that counts as significant.

	    Returns:
	        The kept sentences joined with single spaces, each with the period
	        it had in ``summary``. If no sentence qualifies, the first 500
	        characters of ``summary`` are returned unchanged.
	    """
	    import string

	    original_lower = original.lower()
	    fragments = summary.split(".")
	    # Every fragment except the last one was followed by a period in the
	    # summary; the last one is whatever trails the final period.
	    last_index = len(fragments) - 1
	    verified = []

	    for index, fragment in enumerate(fragments):
	        sentence = fragment.strip()
	        if not sentence:
	            continue

	        # Strip punctuation glued to a word ("growth," / "(growth)") so it
	        # cannot prevent a match against the original text.
	        bare_words = (
	            raw.strip(string.punctuation) for raw in sentence.lower().split()
	        )
	        # Only significant words enter the ratio, in both numerator and
	        # denominator; short filler words must not dilute the score.
	        significant = [word for word in bare_words if len(word) >= min_word_length]
	        if not significant:
	            continue

	        matches = sum(word in original_lower for word in significant)
	        if matches / len(significant) >= min_ratio:
	            verified.append(sentence + "." if index < last_index else sentence)

	    return " ".join(verified) if verified else summary[:500]

	def ensure_complete_sentences(text: str) -> str:
	    """Return ``text`` reduced to complete, properly terminated sentences.

	    Whitespace is collapsed to single spaces, the text is split after
	    ``.``, ``!`` or ``?`` followed by whitespace, and every piece that does
	    not end with one of those terminators (optionally followed by closing
	    quotes or brackets) is dropped.

	    Args:
	        text: Text to clean up. Anything that is not a non-empty string
	            yields an empty result.

	    Returns:
	        The complete sentences joined with single spaces. If the text
	        contains no complete sentence at all, the whole normalised text is
	        returned with a period appended, so the content is not lost.
	        Empty or whitespace-only input returns ``""``.
	    """
	    if not text or not isinstance(text, str):
	        return ""

	    terminators = (".", "!", "?")
	    # Closing quotes/brackets that may legitimately follow the terminator,
	    # as in: She said "go."
	    closers = "\"')]}\u201d\u2019"

	    normalized = " ".join(text.split())
	    if not normalized:
	        return ""

	    candidates = re.split(r"(?<=[.!?])\s+", normalized)
	    complete = [
	        sentence
	        for sentence in candidates
	        if sentence.rstrip(closers).endswith(terminators)
	    ]
	    if complete:
	        return " ".join(complete)

	    # No terminated sentence anywhere: terminate the text, do not discard
	    # it. Dangling separators are trimmed first to avoid results like ",.".
	    trimmed = normalized.rstrip(" ,;:")
	    return trimmed + "." if trimmed else ""