"""Read text out of office documents, write text back, and tidy summaries."""

import re
from io import BytesIO
from pathlib import Path

import docx
import openpyxl
import pdfplumber
from pptx import Presentation
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas

# PDF page layout used by save_file, in points.
_PDF_FONT_NAME = "Helvetica"
_PDF_FONT_SIZE = 12
_PDF_LEFT_MARGIN = 72
_PDF_TOP_Y = 750
_PDF_BOTTOM_Y = 50
_PDF_LINE_STEP = 12

_WORD_RE = re.compile(r"\w+")
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.!?])\s+")
# A terminator, optionally followed by closing quotes/brackets, at the end.
_SENTENCE_END_RE = re.compile(r"[.!?][\"'\u201d\u2019)\]]*$")


def _normalize_file_type(file_type: str) -> str:
    """Return *file_type* lower-cased, trimmed, and without a leading dot."""
    return file_type.strip().lower().lstrip(".")


def _extract_xlsx_text(file_path) -> str:
    """Join every non-empty cell of the active worksheet with spaces."""
    sheet = openpyxl.load_workbook(file_path).active
    return " ".join(
        str(cell.value)
        for row in sheet.rows
        for cell in row
        if cell.value is not None
    )


def _extract_pptx_text(file_path) -> str:
    """Collect non-blank paragraph and table-cell text, one item per line."""
    pieces = []
    for slide in Presentation(file_path).slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                candidates = (
                    paragraph.text for paragraph in shape.text_frame.paragraphs
                )
            # Not every shape class defines has_table (it lives on graphic
            # frames), so a plain attribute access can raise AttributeError.
            elif getattr(shape, "has_table", False):
                candidates = (
                    cell.text for row in shape.table.rows for cell in row.cells
                )
            else:
                continue
            pieces.extend(
                cleaned for raw in candidates if (cleaned := raw.strip())
            )
    return "\n".join(pieces)


def _extract_pdf_text(file_path) -> str:
    """Join the text of every page that yields any, one page per line block."""
    with pdfplumber.open(file_path) as pdf:
        # Bind the result so each page is only parsed once.
        return "\n".join(
            page_text
            for page in pdf.pages
            if (page_text := page.extract_text())
        )


def extract_text(file_path: Path, file_type: str) -> str:
    """Return the plain text contained in a document.

    Args:
        file_path: Location of the document to read.
        file_type: One of ``txt``, ``docx``, ``xlsx``, ``pptx`` or ``pdf``.
            Case and a leading dot are ignored.

    Returns:
        The extracted text with surrounding whitespace removed. An
        unrecognised ``file_type`` yields an empty string.
    """
    kind = _normalize_file_type(file_type)
    if kind == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()
    elif kind == "docx":
        document = docx.Document(file_path)
        text = "\n".join(para.text for para in document.paragraphs if para.text)
    elif kind == "xlsx":
        text = _extract_xlsx_text(file_path)
    elif kind == "pptx":
        text = _extract_pptx_text(file_path)
    elif kind == "pdf":
        text = _extract_pdf_text(file_path)
    else:
        text = ""
    return text.strip()


def _render_pdf(text: str) -> bytes:
    """Lay *text* out on letter-size pages and return the PDF bytes."""
    buffer = BytesIO()
    pdf = canvas.Canvas(buffer, pagesize=letter)
    pdf.setFont(_PDF_FONT_NAME, _PDF_FONT_SIZE)
    # Same margin on the right as on the left.
    max_width = letter[0] - 2 * _PDF_LEFT_MARGIN
    y = _PDF_TOP_Y
    for line in text.split("\n"):
        if pdf.stringWidth(line, _PDF_FONT_NAME, _PDF_FONT_SIZE) <= max_width:
            # Lines that fit are drawn untouched, spacing included.
            segments = [line]
        else:
            # Wrap on whitespace so long paragraphs stay on the page.
            segments = simpleSplit(
                line, _PDF_FONT_NAME, _PDF_FONT_SIZE, max_width
            ) or [line]
        for segment in segments:
            if y < _PDF_BOTTOM_Y:
                pdf.showPage()
                # Re-select the font explicitly for the new page.
                pdf.setFont(_PDF_FONT_NAME, _PDF_FONT_SIZE)
                y = _PDF_TOP_Y
            pdf.drawString(_PDF_LEFT_MARGIN, y, segment)
            y -= _PDF_LINE_STEP
    pdf.save()
    return buffer.getvalue()


def save_file(
    text: str, original_path: Path, file_type: str, output_path: Path
) -> None:
    """Write *text* to *output_path* in the requested format.

    Args:
        text: Content to store.
        original_path: Path of the source document. Not used by this
            function; kept so existing callers keep working.
        file_type: ``docx``, ``xlsx``, ``pptx`` or ``pdf`` (case and a
            leading dot are ignored). Anything else is written as UTF-8
            plain text.
        output_path: Destination file.
    """
    kind = _normalize_file_type(file_type)
    if kind == "docx":
        document = docx.Document()
        document.add_paragraph(text)
        document.save(output_path)
    elif kind == "xlsx":
        workbook = openpyxl.Workbook()
        sheet = workbook.active
        # One line of text per row, all in the first column.
        for row_number, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=row_number, column=1, value=line)
        workbook.save(output_path)
    elif kind == "pptx":
        presentation = Presentation()
        slide = presentation.slides.add_slide(presentation.slide_layouts[1])
        slide.shapes.placeholders[1].text = text
        presentation.save(output_path)
    elif kind == "pdf":
        # Render fully in memory first so a rendering failure does not leave
        # an empty or truncated file behind.
        pdf_bytes = _render_pdf(text)
        with open(output_path, "wb") as f:
            f.write(pdf_bytes)
    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)


def verify_summary(
    summary: str,
    original: str,
    *,
    min_word_length: int = 3,
    match_threshold: float = 0.3,
    fallback_chars: int = 500,
) -> str:
    """Keep the summary sentences whose words appear in the original text.

    The summary is split on ``.``. For each sentence, words of at least
    ``min_word_length`` characters are compared, case-insensitively and as
    whole words, against the words of ``original``.

    Args:
        summary: Text to check.
        original: Source text the summary should be grounded in.
        min_word_length: Shorter words are ignored when scoring.
        match_threshold: Minimum share of scored words that must occur in
            ``original`` for a sentence to be kept.
        fallback_chars: Length of the summary prefix returned when no
            sentence passes.

    Returns:
        The kept sentences joined with ``". "`` (no trailing period is
        added), or the first ``fallback_chars`` characters of ``summary``
        when nothing was kept.
    """
    # Whole-word lookup: a substring test would accept "her" for "other".
    known_words = set(_WORD_RE.findall(original.lower()))
    verified = []
    for raw_sentence in summary.split("."):
        sentence = raw_sentence.strip()
        if not sentence:
            continue
        significant = [
            word
            for word in _WORD_RE.findall(sentence.lower())
            if len(word) >= min_word_length
        ]
        if not significant:
            continue
        matches = sum(word in known_words for word in significant)
        if matches / len(significant) >= match_threshold:
            verified.append(sentence)
    return ". ".join(verified) if verified else summary[:fallback_chars]


def ensure_complete_sentences(text: str) -> str:
    """Return *text* reduced to its complete sentences.

    Whitespace is collapsed to single spaces and the text is split after
    ``.``, ``!`` or ``?`` followed by whitespace. Fragments that do not end
    in one of those terminators (optionally followed by closing quotes or
    brackets) are dropped.

    Args:
        text: Text to clean up.

    Returns:
        The complete sentences joined by single spaces. If the text holds
        no complete sentence at all, the normalised text with a period
        appended. An empty string for empty, whitespace-only or non-string
        input.
    """
    if not text or not isinstance(text, str):
        return ""
    normalized = " ".join(text.split())
    if not normalized:
        return ""
    complete = [
        fragment
        for fragment in _SENTENCE_SPLIT_RE.split(normalized)
        if _SENTENCE_END_RE.search(fragment)
    ]
    if complete:
        return " ".join(complete)
    # Nothing is terminated: close the text rather than discard all of it.
    return normalized + "."