Spaces:

Tech-di
/

WallTD-v.1

Sleeping

File size: 5,086 Bytes

from pptx import Presentation
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
import docx
from pathlib import Path
import openpyxl
import re

def extract_text(file_path: Path, file_type: str) -> str:
    """Extract the plain-text content of a document.

    Args:
        file_path: Location of the file to read.
        file_type: Format key selecting the reader; one of ``"txt"``,
            ``"docx"``, ``"xlsx"``, ``"pptx"`` or ``"pdf"``.

    Returns:
        The extracted text with leading and trailing whitespace removed.
        An unrecognised ``file_type`` yields an empty string.

    Per-format behaviour:
        * txt  -- file contents decoded as UTF-8.
        * docx -- non-empty paragraphs joined by newlines.
        * xlsx -- non-empty cells of the active worksheet, row by row,
          separated by single spaces.
        * pptx -- stripped, non-empty paragraphs of every text frame and
          cells of every table on each slide, one per line.
        * pdf  -- text of every page that yields any, joined by newlines.
    """
    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

    elif file_type == "docx":
        doc = docx.Document(file_path)
        text = "\n".join(para.text for para in doc.paragraphs if para.text)

    elif file_type == "xlsx":
        wb = openpyxl.load_workbook(file_path)
        sheet = wb.active
        cell_values = []
        for row in sheet.rows:
            for cell in row:
                if cell.value is not None:
                    cell_values.append(str(cell.value))
        text = " ".join(cell_values)

    elif file_type == "pptx":
        prs = Presentation(file_path)
        lines = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        if (clean_text := paragraph.text.strip()):
                            lines.append(clean_text)
                elif shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            if (cell_text := cell.text.strip()):
                                lines.append(cell_text)
        text = "\n".join(lines)

    elif file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            # Extract each page exactly once; the walrus keeps the result so
            # the filter and the output share a single extraction.
            text = "\n".join(
                page_text
                for page in pdf.pages
                if (page_text := page.extract_text())
            )

    else:
        # Unknown formats are not an error here: callers get an empty string.
        text = ""

    return text.strip()

def save_file(
    text: str, original_path: Path, file_type: str, output_path: Path
) -> None:
    """Write ``text`` to ``output_path`` in the format named by ``file_type``.

    Args:
        text: Content to store.
        original_path: Accepted for interface compatibility; this function
            does not read it.
        file_type: ``"docx"``, ``"xlsx"``, ``"pptx"`` or ``"pdf"`` select the
            matching writer. Any other value writes plain UTF-8 text.
        output_path: Destination file.

    Per-format behaviour:
        * docx -- a new document holding ``text`` as a single paragraph.
        * xlsx -- one line of ``text`` per row in column 1 of the active sheet.
        * pptx -- one slide built from layout index 1 with ``text`` placed in
          placeholder index 1.
        * pdf  -- letter-size pages, one ``drawString`` per line of ``text``.
    """
    if file_type == "docx":
        doc = docx.Document()
        doc.add_paragraph(text)
        doc.save(output_path)

    elif file_type == "xlsx":
        wb = openpyxl.Workbook()
        sheet = wb.active
        for row_number, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=row_number, column=1, value=line)
        wb.save(output_path)

    elif file_type == "pptx":
        prs = Presentation()
        slide_layout = prs.slide_layouts[1]
        slide = prs.slides.add_slide(slide_layout)
        content = slide.shapes.placeholders[1]
        content.text = text
        prs.save(output_path)

    elif file_type == "pdf":
        # Layout constants, in canvas coordinate units.
        left_x = 72
        top_y = 750
        bottom_y = 50
        line_height = 12

        # Render the whole document in memory first. The destination is only
        # opened once rendering has succeeded, so a failure while drawing
        # never leaves an empty or truncated file behind.
        pdf_buffer = BytesIO()
        c = canvas.Canvas(pdf_buffer, pagesize=letter)
        y = top_y
        for line in text.split("\n"):
            c.drawString(left_x, y, line)
            y -= line_height
            if y < bottom_y:
                # Page is full: start a new one and reset the cursor.
                c.showPage()
                y = top_y
        c.save()

        with open(output_path, "wb") as f:
            f.write(pdf_buffer.getvalue())

    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)


def verify_summary(
    summary: str,
    original: str,
    *,
    min_word_length: int = 3,
    threshold: float = 0.3,
) -> str:
    """Keep only the summary sentences whose vocabulary appears in the source.

    A sentence is kept when at least ``threshold`` of its significant words
    (words of ``min_word_length`` characters or more) occur as whole words in
    ``original``. Comparison is case-insensitive and ignores punctuation.

    Args:
        summary: Text to filter, sentence by sentence.
        original: Source text the summary is checked against.
        min_word_length: Minimum length for a word to count as significant.
        threshold: Minimum fraction of significant words that must match.

    Returns:
        The kept sentences, each with its original terminator, joined by
        single spaces. If no sentence qualifies, the first 500 characters of
        ``summary`` are returned unchanged.
    """
    # Whole-word vocabulary, built once. A substring test against the raw
    # text would let "art" match "party".
    original_words = set(re.findall(r"\w+", original.lower()))
    verified = []

    # Split after a terminator followed by whitespace, so decimals such as
    # "3.5" stay intact and each sentence keeps its own punctuation.
    for sentence in re.split(r"(?<=[.!?])\s+", summary.strip()):
        if not sentence:
            continue

        significant = [
            word
            for word in re.findall(r"\w+", sentence.lower())
            if len(word) >= min_word_length
        ]
        if not significant:
            continue

        # The ratio is taken over significant words only, so short filler
        # words do not dilute the score.
        matches = sum(word in original_words for word in significant)
        if matches / len(significant) >= threshold:
            verified.append(sentence)

    return " ".join(verified) if verified else summary[:500]

def ensure_complete_sentences(text: str) -> str:
    """Return ``text`` reduced to its complete sentences.

    Whitespace runs are collapsed to single spaces, the text is split after
    each ``.``, ``!`` or ``?`` that is followed by whitespace, and every
    fragment that does not end in one of those terminators is discarded.
    A terminator may be followed by closing quotes or brackets, so
    ``She said "go."`` counts as complete.

    Args:
        text: Text to clean up.

    Returns:
        The complete sentences joined by single spaces. Returns an empty
        string when ``text`` is empty, is not a ``str``, or contains no
        complete sentence.
    """
    if not text or not isinstance(text, str):
        return ""

    terminators = (".", "!", "?")
    # Characters allowed to trail the terminator of a finished sentence.
    closers = "\"')]}\u201d\u2019"

    normalized = " ".join(text.split())
    complete = [
        sentence
        for sentence in re.split(r"(?<=[.!?])\s+", normalized)
        if sentence.rstrip(closers).endswith(terminators)
    ]

    # Every kept fragment already ends with a terminator, so the joined
    # result needs no further trimming.
    return " ".join(complete)