File size: 5,086 Bytes
04ac6c3
 
 
 
 
 
 
 
da5250c
04ac6c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61c5b48
 
 
30476c0
 
61c5b48
 
 
30476c0
 
61c5b48
 
30476c0
 
 
 
 
 
 
 
 
 
 
61c5b48
30476c0
da5250c
 
30476c0
 
 
da5250c
30476c0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from pptx import Presentation
import pdfplumber
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from io import BytesIO
import docx
from pathlib import Path
import openpyxl
import re

def extract_text(file_path: Path, file_type: str) -> str:
    """Return the plain text contained in a document.

    Args:
        file_path: Location of the document to read.
        file_type: Format selector compared verbatim against ``"txt"``,
            ``"docx"``, ``"xlsx"``, ``"pptx"`` and ``"pdf"``.

    Returns:
        The extracted text with leading and trailing whitespace removed.
        Any other ``file_type`` yields an empty string.
    """
    text = ""

    if file_type == "txt":
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

    elif file_type == "docx":
        doc = docx.Document(file_path)
        # Empty paragraphs are skipped so they do not produce blank lines.
        text = "\n".join(para.text for para in doc.paragraphs if para.text)

    elif file_type == "xlsx":
        wb = openpyxl.load_workbook(file_path)
        # Only the active worksheet is read.
        sheet = wb.active
        cell_values = []
        for row in sheet.rows:
            for cell in row:
                if cell.value is not None:
                    cell_values.append(str(cell.value))
        # Cells are flattened row by row into one space-separated string.
        text = " ".join(cell_values)

    elif file_type == "pptx":
        prs = Presentation(file_path)
        fragments = []
        for slide in prs.slides:
            for shape in slide.shapes:
                if shape.has_text_frame:
                    for paragraph in shape.text_frame.paragraphs:
                        if (clean_text := paragraph.text.strip()):
                            fragments.append(clean_text)

                elif shape.has_table:
                    for row in shape.table.rows:
                        for cell in row.cells:
                            if (cell_text := cell.text.strip()):
                                fragments.append(cell_text)
        # One non-empty paragraph or table cell per line.
        text = "\n".join(fragments)

    elif file_type == "pdf":
        with pdfplumber.open(file_path) as pdf:
            page_texts = []
            for page in pdf.pages:
                # Extract once per page instead of once to test for content
                # and a second time to collect it.
                page_text = page.extract_text()
                if page_text:
                    page_texts.append(page_text)
            text = "\n".join(page_texts)

    return text.strip()

def save_file(text: str, original_path: Path, file_type: str, output_path: Path) -> None:
    """Write ``text`` to ``output_path`` in the format named by ``file_type``.

    Args:
        text: Content to store.
        original_path: Accepted for interface compatibility; not read here.
        file_type: ``"docx"``, ``"xlsx"``, ``"pptx"`` or ``"pdf"`` select the
            matching writer. Any other value writes UTF-8 plain text.
        output_path: Destination file.
    """
    if file_type == "docx":
        doc = docx.Document()
        doc.add_paragraph(text)
        doc.save(output_path)

    elif file_type == "xlsx":
        wb = openpyxl.Workbook()
        sheet = wb.active
        # One line of text per row, all in the first column.
        for row_index, line in enumerate(text.split("\n"), start=1):
            sheet.cell(row=row_index, column=1, value=line)
        wb.save(output_path)

    elif file_type == "pptx":
        prs = Presentation()
        # Layout index 1 of the default template; its placeholder 1 receives
        # the whole text on a single slide.
        slide_layout = prs.slide_layouts[1]
        slide = prs.slides.add_slide(slide_layout)
        content = slide.shapes.placeholders[1]
        content.text = text
        prs.save(output_path)

    elif file_type == "pdf":
        # Page layout in canvas units.
        left_margin = 72
        top_y = 750
        bottom_y = 50
        line_height = 12

        # Render into memory first so a failure while drawing does not leave
        # a truncated file at output_path.
        pdf_buffer = BytesIO()
        c = canvas.Canvas(pdf_buffer, pagesize=letter)
        y = top_y
        for line in text.split("\n"):
            c.drawString(left_margin, y, line)
            y -= line_height
            if y < bottom_y:
                # Out of vertical space: start a new page at the top.
                c.showPage()
                y = top_y
        c.save()

        with open(output_path, "wb") as f:
            f.write(pdf_buffer.getvalue())

    else:
        with open(output_path, "w", encoding="utf-8") as f:
            f.write(text)


def verify_summary(summary: str, original: str) -> str:
    """Keep only the summary sentences that are supported by the original.

    The summary is split on ``.``. For each piece, every word of at least
    three characters (after trimming surrounding punctuation) is looked up,
    case-insensitively, as a substring of ``original``. A piece is kept when
    at least 30% of those significant words are found.

    Args:
        summary: Generated summary text to check.
        original: Source text the summary should be grounded in.

    Returns:
        The kept sentences joined with ``". "`` and terminated with a period
        (unless the text already ends in ``!`` or ``?``). If no sentence
        passes, the first 500 characters of ``summary`` are returned as-is.
    """
    # Punctuation that may cling to a whitespace-delimited token.
    edge_punctuation = "\"'`()[]{}<>,;:!?-"
    min_word_length = 3
    min_match_ratio = 0.3

    original_lower = original.lower()
    verified = []

    for sentence in summary.split('.'):
        sentence = sentence.strip()
        if not sentence:
            continue

        matches = 0
        significant_words = 0
        for raw_word in sentence.lower().split():
            # "jumps," must be compared as "jumps", otherwise trailing
            # punctuation prevents an otherwise valid match.
            word = raw_word.strip(edge_punctuation)
            if len(word) < min_word_length:
                continue
            significant_words += 1
            if word in original_lower:
                matches += 1

        # The ratio is taken over significant words only, so short filler
        # words do not dilute it.
        if significant_words and (matches / significant_words) >= min_match_ratio:
            verified.append(sentence)

    if not verified:
        return summary[:500]

    result = '. '.join(verified)
    # Splitting on '.' removed the final period; restore it so the last
    # sentence is still a complete sentence.
    if not result.endswith(('!', '?')):
        result += '.'
    return result

def ensure_complete_sentences(text: str) -> str:
    """Return ``text`` reduced to complete, properly terminated sentences.

    Whitespace runs are collapsed to single spaces, the text is split after
    each ``.``, ``!`` or ``?`` that is followed by whitespace, and any piece
    that does not end in one of those characters (a trailing fragment) is
    dropped.

    Args:
        text: Text to clean up. Non-string or empty input is tolerated.

    Returns:
        The complete sentences joined by single spaces. If the text contains
        no complete sentence at all, the whole normalized text is returned
        with a period appended rather than being discarded. Empty,
        whitespace-only or non-string input yields ``""``.
    """
    if not text or not isinstance(text, str):
        return ""

    terminators = ('.', '!', '?')

    # Normalize whitespace
    normalized = ' '.join(text.split())
    if not normalized:
        return ""

    # Split on sentence boundaries (terminator followed by whitespace)
    sentences = re.split(r'(?<=[.!?])\s+', normalized)

    # Only the final piece can lack a terminator; drop it if so.
    complete = [s for s in sentences if s.endswith(terminators)]
    if complete:
        return ' '.join(complete)

    # Nothing was terminated: treat the whole text as one sentence instead
    # of returning an empty string and losing the content.
    return normalized + '.'