File size: 5,778 Bytes
1d8ed3b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
import fitz  # PyMuPDF
import warnings
import streamlit as st


def extract_page_data_fitz(doc):
    """
    Extracts page numbers and text from a PDF document using PyMuPDF.

    For every page, the text found in the top 15% and bottom 15% of the page
    is split on whitespace, and the first purely decimal token (top region
    first, then bottom region) is taken as the printed page number.

    Args:
        doc: An open PyMuPDF document; it is iterated page by page.

    Returns:
        list: One dictionary per page with the keys:
            "index"   - 0-based position of the page in the document,
            "number"  - detected page number as int, or None if no decimal
                        token was found in the scanned regions,
            "content" - the full text of the page.
    """
    pages_data = []

    for i, page in enumerate(doc):
        height = page.rect.height
        width = page.rect.width

        # Header and footer strips where a printed page number is searched.
        top_rect = fitz.Rect(0, 0, width, height * 0.15)
        bottom_rect = fitz.Rect(0, height * 0.85, width, height)

        top_text = page.get_text("text", clip=top_rect).split()
        bottom_text = page.get_text("text", clip=bottom_rect).split()

        # str.isdecimal() is used instead of str.isdigit(): isdigit() also
        # accepts characters such as superscript digits ("²") that int()
        # refuses to parse, which would raise ValueError and abort the scan.
        found_number = next(
            (int(token) for token in top_text + bottom_text if token.isdecimal()),
            None,
        )
        full_text = page.get_text("text")

        pages_data.append({
            "index": i,
            "number": found_number,
            "content": full_text
        })

    return pages_data


def correct_page_numbers(pages_data, sequence_length=10):
    """
    Corrects the detected page numbers in place, anchored on a reliable run.

    The function looks for the first run of `sequence_length` detected numbers
    that increase by one on consecutive pages (page position and page number
    advance in lockstep). That run fixes the offset between list position and
    page number; every entry of `pages_data` is then renumbered from that
    offset, and numbers lower than 1 are replaced by None.

    Args:
        pages_data (list): Page dictionaries; the "number" key is read and
            rewritten, the "index" key is read for the return value.
        sequence_length (int): Number of consecutive pages that must agree
            before the numbering is trusted. Values below 1 are treated as 1.

    Returns:
        The "index" value of the page that ends up numbered 1, or None when no
        qualifying run exists, when no page ends up numbered 1, or when the
        input is malformed (missing keys / wrong types). `pages_data` is left
        unmodified when no run is found.
    """
    try:
        # (position in pages_data, detected number) for pages with a detection.
        detected = [
            (position, page["number"])
            for position, page in enumerate(pages_data)
            if isinstance(page["number"], int)
        ]
        window = max(1, sequence_length)

        anchor = None
        for start in range(len(detected) - window + 1):
            first_position, first_number = detected[start]
            # Both the page position and the number must advance by one per
            # step: consecutive values found on non-adjacent pages are not a
            # trustworthy run, because the renumbering below is index-based.
            if all(
                detected[start + step] == (first_position + step, first_number + step)
                for step in range(window)
            ):
                anchor = (first_position, first_number)
                break

        if anchor is None:
            return None

        base_index, base_number = anchor
        shift = base_number - base_index  # page number = list position + shift

        for position, page in enumerate(pages_data):
            number = position + shift
            # Pages before printed page 1 (front matter) carry no number.
            page["number"] = number if number >= 1 else None

        return next((page["index"] for page in pages_data if page["number"] == 1), None)

    except (KeyError, TypeError, IndexError):
        # Best effort: malformed input is reported as "no sequence found".
        return None


def extract_text(doc, start_chapter=None):
    """
    Returns the document text as a single newline-joined string.

    When `start_chapter` is given, only the pages from that 0-based index to
    the end of `doc` are included. When it is None, a UserWarning is emitted
    and the text of every page is returned.
    """
    if start_chapter is None:
        # No starting point known: fall back to the whole document.
        warnings.warn(
            "No chapter start has been detected: extracting text from the entire PDF.",
            UserWarning
        )
        return "\n".join(page.get_text("text") for page in doc)

    page_texts = []
    for page_index in range(start_chapter, len(doc)):
        page_texts.append(doc[page_index].get_text("text"))
    return "\n".join(page_texts)
    

def process_pdf():
    """
    Runs the extraction pipeline on the PDF stored in the Streamlit session.

    Reads the bytes under "uploaded_pdf_bytes"; if they are missing or empty,
    an error is shown and nothing else happens. Otherwise the page data, the
    index of the first numbered page and the extracted text are computed and
    saved in session state under 'full_text', 'pages_data_infos' and
    'chapters_starting_page'.
    """
    uploaded = st.session_state.get("uploaded_pdf_bytes")
    if not uploaded:
        st.error("No PDF uploaded.")
        return

    with st.spinner("Processing uploaded file..."):
        with fitz.open(stream=uploaded, filetype="pdf") as pdf:
            page_records = extract_page_data_fitz(pdf)
            first_chapter_index = correct_page_numbers(page_records)
            book_text = extract_text(pdf, first_chapter_index)

        # Publish the results once the document has been closed.
        results = {
            'full_text': book_text,
            'pages_data_infos': page_records,
            'chapters_starting_page': first_chapter_index,
        }
        for key, value in results.items():
            st.session_state[key] = value


def extract_toc(page_range):
    """
    Extracts the table-of-contents text from specific pages of the uploaded PDF.

    The PDF bytes are read from st.session_state["uploaded_pdf_bytes"]. The
    text of every requested page is joined with newlines, stored in
    st.session_state["toc"] and returned.

    Args:
        page_range: Iterable of 0-based page indices chosen by the user.
            Indices outside the document are skipped with a UserWarning.

    Returns:
        str: The joined text of the requested pages, or "" when no PDF has
        been uploaded (in which case an error is shown and session state is
        not modified).
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    # Falsy check (not just `is None`) so empty bytes are rejected as well,
    # matching process_pdf instead of failing inside fitz.open.
    if not pdf_bytes:
        st.error("No PDF uploaded.")
        return ""

    toc_pages = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = len(doc)
        for page_num in page_range:
            if 0 <= page_num < page_count:
                toc_pages.append(doc[page_num].get_text("text"))
            else:
                # Reported through the warnings machinery, like extract_text,
                # rather than printed to the server's stdout.
                warnings.warn(
                    f"Page number {page_num} is out of bounds.",
                    UserWarning,
                    stacklevel=2,
                )

    toc_text = "\n".join(toc_pages)
    st.session_state["toc"] = toc_text
    return toc_text


def extract_chapters(chapters_dict, pages_data_corrected):
    """
    Assemble per-chapter records and store them in the Streamlit session.
    Args:
        chapters_dict (iterable): Chapter entries; each is read for the keys
            'chapter_number', 'chapter_title', 'start_page' and 'end_page'.
        pages_data_corrected (list): Page dictionaries; the 'content' of the
            entries at positions start_page-1 .. end_page-1 is used.
    Returns:
        None. The resulting list of chapter dictionaries is written to
        st.session_state['chapters_extracted'].
    """
    def _build_chapter(entry):
        first_page, last_page = entry['start_page'], entry['end_page']

        # start_page is treated as 1-based, end_page as inclusive.
        body = ' '.join(
            pages_data_corrected[position]['content']
            for position in range(first_page - 1, last_page)
        )

        return {
            'chapter_number': entry['chapter_number'],
            'chapter_title': entry['chapter_title'],
            'start_page': first_page,
            'end_page': last_page,
            'content': body
        }

    st.session_state['chapters_extracted'] = [
        _build_chapter(entry) for entry in chapters_dict
    ]