File size: 4,066 Bytes
eeeaa67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# app.py
import streamlit as st
import fitz

from pdf_counter import count_characters


# Page-level configuration comes first, ahead of every rendered element.
st.set_page_config(page_title="PDF Character Counter", layout="wide")

# Static heading and a one-line description of what the tool does.
st.title("PDF Character Counter")
st.write(
    "Counts characters including spaces and can automatically remove headers, footers, and page numbers."
)

# Upload widget restricted to the "pdf" type; the rest of the script checks
# this value's truthiness before doing any work.
uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])

if uploaded_file:
    # Everything below needs an uploaded PDF; without one only the title and
    # the uploader above are rendered.

    # getvalue() returns the complete buffer content independent of the
    # stream's current read position, unlike read().
    pdf_bytes = uploaded_file.getvalue()

    # The document is opened here only to learn the page count. The context
    # manager closes it right away instead of leaving it open for the rest
    # of the script run.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = len(doc)

    # ------------------------------------------------------------------
    # Settings
    # ------------------------------------------------------------------
    st.subheader("Settings")

    # Page numbers are offered 1-based: 1..page_count inclusive.
    excluded_pages = st.multiselect(
        "Exclude pages",
        options=list(range(1, page_count + 1)),
        default=[],
    )

    # Three toggles side by side, all enabled by default.
    col1, col2, col3 = st.columns(3)

    with col1:
        remove_headers = st.checkbox("Remove headers", value=True)

    with col2:
        remove_footers = st.checkbox("Remove footers", value=True)

    with col3:
        remove_page_numbers = st.checkbox("Remove page numbers", value=True)

    # min_value=1 guarantees the division further down never divides by zero.
    characters_per_normal_page = st.number_input(
        "Tegn pr. normalside",
        min_value=1,
        value=2400,
        step=100,
    )

    # ------------------------------------------------------------------
    # Counting
    # ------------------------------------------------------------------
    # count_characters is imported from pdf_counter. This script reads the
    # keys "total_characters", "removed_items", "page_results" and
    # "included_text" from its return value.
    result = count_characters(
        pdf_bytes=pdf_bytes,
        excluded_pages=set(excluded_pages),
        remove_headers=remove_headers,
        remove_footers=remove_footers,
        remove_page_numbers=remove_page_numbers,
    )

    normal_pages = result["total_characters"] / characters_per_normal_page

    st.divider()

    col1, col2 = st.columns(2)

    with col1:
        # Format with "," as thousands separator, then swap it for ".".
        st.metric(
            "Characters including spaces",
            f"{result['total_characters']:,}".replace(",", "."),
        )

    with col2:
        # Two decimals, with "," as the decimal mark instead of ".".
        st.metric(
            "Normalsider",
            f"{normal_pages:.2f}".replace(".", ","),
        )

    st.divider()

    # ------------------------------------------------------------------
    # Removed elements
    # ------------------------------------------------------------------
    st.subheader("Elements removed from the count")

    removed_items = result["removed_items"]

    # Items are grouped by their "Type" value; two distinct type labels both
    # count as headers.
    removed_headers = [
        item
        for item in removed_items
        if item["Type"] in ("Sidehoved", "Løbende sidehoved")
    ]
    removed_footers = [
        item for item in removed_items if item["Type"] == "Sidefod"
    ]
    removed_page_numbers = [
        item for item in removed_items if item["Type"] == "Sidetal"
    ]

    # One row per category:
    # (metric label, expander label, message when empty, items).
    removed_sections = (
        (
            "Headers removed",
            "Show removed headers",
            "No headers were removed.",
            removed_headers,
        ),
        (
            "Footers removed",
            "Show removed footers",
            "No footers were removed.",
            removed_footers,
        ),
        (
            "Page numbers removed",
            "Show removed page numbers",
            "No page numbers were removed.",
            removed_page_numbers,
        ),
    )

    # First the three counters in a row ...
    for column, (metric_label, _, _, items) in zip(
        st.columns(3), removed_sections
    ):
        with column:
            st.metric(metric_label, len(items))

    # ... then one expander per category listing what was removed.
    for _, expander_label, empty_message, items in removed_sections:
        with st.expander(expander_label):
            if items:
                st.dataframe(items, use_container_width=True)
            else:
                st.info(empty_message)

    st.divider()

    # ------------------------------------------------------------------
    # Per-page results and counted text
    # ------------------------------------------------------------------
    st.subheader("Result per page")

    st.dataframe(
        result["page_results"],
        use_container_width=True,
    )

    st.divider()

    with st.expander("View text included in the count"):
        st.text_area(
            "Text",
            result["included_text"],
            height=400,
        )

    # Offers the same text shown in the expander above as a plain-text file.
    st.download_button(
        label="Download text as TXT",
        data=result["included_text"],
        file_name="counted_text.txt",
        mime="text/plain",
    )