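# Spaces: Sleeping
# NOTE(review): the line above appears to be page-status text captured together
# with the file; it is not part of the module.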
import re
from io import BytesIO
from pathlib import Path

import docx
import openpyxl
import pdfplumber
from pptx import Presentation
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas
def extract_text(file_path: Path, file_type: str) -> str:
    """Return the plain text contained in a document.

    Recognised ``file_type`` values are ``"txt"``, ``"docx"``, ``"xlsx"``,
    ``"pptx"`` and ``"pdf"``. Any other value yields an empty string.

    Args:
        file_path: Location of the document to read.
        file_type: Type tag selecting the reader (compared exactly, no
            leading dot, lower case).

    Returns:
        The extracted text with leading and trailing whitespace removed.
    """
    text = ""

    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

    elif file_type == "docx":
        doc = docx.Document(file_path)
        # Empty paragraphs are skipped so they do not produce blank lines.
        text = "\n".join(para.text for para in doc.paragraphs if para.text)

    elif file_type == "xlsx":
        wb = openpyxl.load_workbook(file_path)
        # Only the active worksheet is read; cell values are flattened in
        # row-major order and separated by single spaces.
        sheet = wb.active
        text = " ".join(
            str(cell.value)
            for row in sheet.rows
            for cell in row
            if cell.value is not None
        )

    elif file_type == "pptx":
        prs = Presentation(file_path)
        fragments = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    candidates = (
                        paragraph.text
                        for paragraph in shape.text_frame.paragraphs
                    )
                elif shape.has_table:
                    candidates = (
                        cell.text
                        for row in shape.table.rows
                        for cell in row.cells
                    )
                else:
                    continue
                # Keep only non-blank fragments, one per output line.
                fragments.extend(
                    stripped
                    for stripped in (value.strip() for value in candidates)
                    if stripped
                )
        text = "\n".join(fragments)

    elif file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Extract each page exactly once: the call performs the layout
            # analysis, so doing it again just to test for emptiness doubles
            # the cost of reading the document.
            page_texts = (page.extract_text() for page in pdf.pages)
            text = "\n".join(
                page_text for page_text in page_texts if page_text
            )

    return text.strip()
def save_file(text: str, original_path: Path, file_type: str, output_path: Path) -> None:
    """Write ``text`` to ``output_path`` in the format named by ``file_type``.

    Args:
        text: Content to store.
        original_path: Accepted for interface compatibility; this function
            does not read it.
        file_type: ``"docx"``, ``"xlsx"``, ``"pptx"`` or ``"pdf"``. Any other
            value writes ``text`` verbatim as a UTF-8 text file.
        output_path: Destination file.

    Format details:
        * docx - the whole text is added as a single paragraph.
        * xlsx - one row per line of ``text``, all in the first column.
        * pptx - one slide built from layout index 1, with the text placed
          in placeholder index 1.
        * pdf  - letter-size pages, lines wrapped to the printable width.
    """
    if file_type == "docx":
        doc = docx.Document()
        doc.add_paragraph(text)
        doc.save(output_path)
    elif file_type == "xlsx":
        wb = openpyxl.Workbook()
        sheet = wb.active
        for row_index, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=row_index, column=1, value=line)
        wb.save(output_path)
    elif file_type == "pptx":
        prs = Presentation()
        slide = prs.slides.add_slide(prs.slide_layouts[1])
        slide.shapes.placeholders[1].text = text
        prs.save(output_path)
    elif file_type == "pdf":
        _write_text_pdf(text, output_path)
    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)


def _write_text_pdf(text: str, output_path: Path) -> None:
    """Render ``text`` as left-aligned, word-wrapped lines on letter pages."""
    font_name, font_size = "Helvetica", 12
    left_margin, top_y, bottom_y, line_height = 72, 750, 50, 12
    # Printable width: page width minus equal left and right margins.
    max_width = letter[0] - 2 * left_margin

    # The canvas writes the file itself on save(), so the destination is not
    # created or truncated until rendering has finished.
    pdf = canvas.Canvas(str(output_path), pagesize=letter)
    pdf.setFont(font_name, font_size)
    y = top_y
    for raw_line in text.split("\n"):
        # drawString never wraps, so an unwrapped paragraph would run off the
        # right edge. simpleSplit returns [] for a blank line; substitute one
        # empty line so paragraph spacing is preserved.
        wrapped = simpleSplit(raw_line, font_name, font_size, max_width) or [""]
        for line in wrapped:
            pdf.drawString(left_margin, y, line)
            y -= line_height
            if y < bottom_y:
                pdf.showPage()
                # Re-select the font explicitly at the start of each page.
                pdf.setFont(font_name, font_size)
                y = top_y
    pdf.save()
def verify_summary(
    summary: str,
    original: str,
    *,
    min_word_length: int = 3,
    min_match_ratio: float = 0.3,
) -> str:
    """Keep only the summary sentences whose wording is found in ``original``.

    The summary is split on ``.``. For each sentence, the "significant" words
    (at least ``min_word_length`` characters) are compared, case-insensitively
    and as whole words, against the words of ``original``. A sentence is kept
    when at least ``min_match_ratio`` of its significant words occur there.

    Args:
        summary: Generated summary to check.
        original: Source text the summary was derived from.
        min_word_length: Shortest word that counts as significant.
        min_match_ratio: Fraction of significant words that must be present
            in ``original`` for a sentence to be kept.

    Returns:
        The kept sentences joined with ``". "`` (no trailing period is added),
        or the first 500 characters of ``summary`` when no sentence is kept.
    """
    # Whole-word vocabulary of the source. A set gives O(1) membership and
    # avoids substring false positives such as "bud" matching "budget".
    source_words = set(re.findall(r"\w+", original.lower()))

    verified = []
    for sentence in summary.split("."):
        sentence = sentence.strip()
        if not sentence:
            continue
        # Tokenise with the same pattern as the source so attached punctuation
        # ("budget,") does not prevent a match.
        significant = [
            word
            for word in re.findall(r"\w+", sentence.lower())
            if len(word) >= min_word_length
        ]
        if not significant:
            continue
        matches = sum(word in source_words for word in significant)
        if matches / len(significant) >= min_match_ratio:
            verified.append(sentence)

    return ". ".join(verified) if verified else summary[:500]
def ensure_complete_sentences(text: str) -> str:
    """Return ``text`` trimmed so that it ends on a complete sentence.

    Whitespace runs (including newlines) are collapsed to single spaces. The
    text is then split after ``.``, ``!`` or ``?`` followed by whitespace, and
    any piece that does not end in one of those marks is dropped. Only a
    trailing, cut-off fragment can fail that check.

    If the text contains no complete sentence at all, the normalised text is
    returned with a period appended rather than being discarded.

    Args:
        text: Text to clean up. Non-string or empty input yields ``""``.

    Returns:
        The cleaned text, or ``""`` when there is nothing but whitespace.
    """
    if not text or not isinstance(text, str):
        return ""

    sentence_endings = (".", "!", "?")

    normalized = " ".join(text.split())
    if not normalized:
        return ""

    # Splitting requires whitespace after the mark, so decimals such as "3.5"
    # stay inside their sentence.
    pieces = re.split(r"(?<=[.!?])\s+", normalized)
    complete = [piece for piece in pieces if piece.endswith(sentence_endings)]
    if complete:
        return " ".join(complete)

    # No terminated sentence anywhere: keep the content and close it, instead
    # of silently returning an empty result.
    return normalized + "."