Spaces:
Sleeping
Sleeping
File size: 5,086 Bytes
04ac6c3 da5250c 04ac6c3 61c5b48 30476c0 61c5b48 30476c0 61c5b48 30476c0 61c5b48 30476c0 da5250c 30476c0 da5250c 30476c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 |
from pptx import Presentation
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
import docx
from pathlib import Path
import openpyxl
import re
def extract_text(file_path: Path, file_type: str) -> str:
    """Extract the plain-text content of a document.

    Args:
        file_path: Location of the file to read.
        file_type: Format key; one of ``"txt"``, ``"docx"``, ``"xlsx"``,
            ``"pptx"`` or ``"pdf"`` (compared case-sensitively).

    Returns:
        The extracted text with leading and trailing whitespace removed.
        An unrecognised ``file_type`` yields an empty string.

    Per-format behaviour:
        * txt  - the whole file, decoded as UTF-8.
        * docx - non-empty paragraphs, one per line.
        * xlsx - every non-empty cell of the active sheet, space separated.
        * pptx - non-blank paragraphs of text frames and non-blank table
          cells of every slide, one per line.
        * pdf  - the text of every page that yields any, one page per line
          group.
    """
    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            return f.read().strip()

    if file_type == "docx":
        document = docx.Document(file_path)
        return "\n".join(
            para.text for para in document.paragraphs if para.text
        ).strip()

    if file_type == "xlsx":
        workbook = openpyxl.load_workbook(file_path)
        # Only the active worksheet is read; other sheets are ignored.
        values = []
        for row in workbook.active.rows:
            for cell in row:
                if cell.value is not None:
                    values.append(str(cell.value))
        return " ".join(values).strip()

    if file_type == "pptx":
        lines = []
        for slide in Presentation(file_path).slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    candidates = [p.text for p in shape.text_frame.paragraphs]
                elif shape.has_table:
                    candidates = [
                        cell.text
                        for row in shape.table.rows
                        for cell in row.cells
                    ]
                else:
                    continue
                for candidate in candidates:
                    if (cleaned := candidate.strip()):
                        lines.append(cleaned)
        return "\n".join(lines).strip()

    if file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Extract each page exactly once: text extraction is the
            # expensive step, so do not repeat it just to filter.
            page_texts = [page.extract_text() for page in pdf.pages]
        return "\n".join(t for t in page_texts if t).strip()

    return ""
def save_file(text: str, original_path: Path, file_type: str, output_path: Path) -> None:
    """Write ``text`` to ``output_path`` in the format named by ``file_type``.

    Args:
        text: Content to store.
        original_path: Accepted for interface compatibility; this function
            does not read it.
        file_type: ``"docx"``, ``"xlsx"``, ``"pptx"`` or ``"pdf"``. Any other
            value (including ``"txt"``) writes a UTF-8 text file.
        output_path: Destination file.

    Per-format behaviour:
        * docx - a new document containing ``text`` as a single paragraph.
        * xlsx - one line of ``text`` per row in column 1 of the active sheet.
        * pptx - one slide (layout index 1) whose second placeholder holds
          ``text``.
        * pdf  - one drawn line per line of ``text``; a new page is started
          when the cursor drops below the bottom margin. Lines are not
          wrapped.
    """
    if file_type == "docx":
        document = docx.Document()
        document.add_paragraph(text)
        document.save(output_path)
    elif file_type == "xlsx":
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        for row_index, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=row_index, column=1, value=line)
        workbook.save(output_path)
    elif file_type == "pptx":
        prs = Presentation()
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        body = slide.shapes.placeholders[1]
        body.text = text
        prs.save(output_path)
    elif file_type == "pdf":
        # Page geometry in canvas units (points).
        left_margin = 72
        top_y = 750
        bottom_y = 50
        line_height = 12

        # Render completely in memory BEFORE opening the destination, so a
        # rendering error cannot leave a truncated/empty output file behind.
        pdf_buffer = BytesIO()
        pdf_canvas = canvas.Canvas(pdf_buffer, pagesize=letter)
        y = top_y
        for line in text.split("\n"):
            pdf_canvas.drawString(left_margin, y, line)
            y -= line_height
            if y < bottom_y:
                pdf_canvas.showPage()
                y = top_y
        pdf_canvas.save()

        with open(output_path, "wb") as f:
            f.write(pdf_buffer.getvalue())
    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)
def verify_summary(summary: str, original: str) -> str:
    """Keep only the summary sentences whose words occur in the original.

    The summary is split on ``'.'``. A sentence is kept when at least 30% of
    its words are "matching" words: at least three characters long and present
    as a whole word (case-insensitively) in ``original``. Shorter words count
    toward the sentence's word total but never toward its matches.

    Args:
        summary: Candidate summary text.
        original: Source text the summary is checked against.

    Returns:
        The kept sentences joined by single spaces, each ending in terminal
        punctuation. If no sentence passes, the first 500 characters of
        ``summary`` are returned unchanged as a fallback.

    Note:
        Splitting on ``'.'`` also splits decimals and abbreviations; this is
        a deliberately simple heuristic.
    """
    word_pattern = re.compile(r"\w+")
    # Whole-word set: avoids substring false positives such as "cat" being
    # "found" inside "concatenate", and makes each lookup O(1).
    original_words = set(word_pattern.findall(original.lower()))

    verified = []
    for sentence in summary.split('.'):
        sentence = sentence.strip()
        if not sentence:
            continue

        # Tokenising with the same pattern drops attached punctuation, so
        # "jumps," still matches "jumps".
        words = word_pattern.findall(sentence.lower())
        if not words:
            continue

        matches = sum(
            1 for word in words if len(word) >= 3 and word in original_words
        )
        if matches / len(words) >= 0.3:
            # split('.') removed the terminator; restore one so the result
            # consists of complete sentences.
            if sentence.endswith(('!', '?')):
                verified.append(sentence)
            else:
                verified.append(sentence + '.')

    return ' '.join(verified) if verified else summary[:500]
def ensure_complete_sentences(text: str) -> str:
    """Return ``text`` reduced to complete, properly terminated sentences.

    Whitespace runs are collapsed to single spaces, the text is split after
    each ``.``, ``!`` or ``?`` that is followed by whitespace, and any
    fragment that does not end in one of those characters (in practice a
    trailing, cut-off sentence) is dropped.

    Args:
        text: Text to clean up. Non-string or empty input is tolerated.

    Returns:
        The complete sentences joined by single spaces. If the text contains
        no complete sentence at all, the normalised text with a period
        appended is returned instead of discarding everything. Empty,
        whitespace-only or non-string input yields ``""``.
    """
    if not text or not isinstance(text, str):
        return ""

    normalized = ' '.join(text.split())
    if not normalized:
        return ""

    terminators = ('.', '!', '?')
    fragments = re.split(r'(?<=[.!?])\s+', normalized)
    complete = [fragment for fragment in fragments if fragment.endswith(terminators)]

    if complete:
        return ' '.join(complete)

    # Nothing ended with a terminator: keep the content and terminate it
    # rather than silently returning an empty string.
    return normalized + '.'
|