danielhjerresen's picture
Upload 2 files
eeeaa67 verified
Raw
History Blame Contribute Delete
4.07 kB
# app.py
import streamlit as st
import fitz
from pdf_counter import count_characters
def _count_pdf_pages(pdf_bytes: bytes) -> int:
    """Return the number of pages in the PDF held in *pdf_bytes*.

    The document is opened only to read its page count. The ``with`` block
    closes the PyMuPDF handle again before the function returns, so it does
    not stay open for the rest of the script run.
    """
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        return len(doc)


def _show_removed_items(expander_label: str, items: list, empty_message: str) -> None:
    """Render one expander listing removed items, or a note when there are none.

    Args:
        expander_label: Title shown on the expander.
        items: Entries to display as a table; an empty list shows
            *empty_message* instead.
        empty_message: Info text shown when *items* is empty.
    """
    with st.expander(expander_label):
        if items:
            st.dataframe(
                items,
                use_container_width=True,
            )
        else:
            st.info(empty_message)


def main() -> None:
    """Build the PDF character counter page.

    Shows the upload widget, and once a PDF is uploaded: the counting
    settings, the character / normal-page totals, the removed headers,
    footers and page numbers, the per-page results, and the counted text
    with a download button.
    """
    st.set_page_config(
        page_title="PDF Character Counter",
        layout="wide",
    )

    st.title("PDF Character Counter")
    st.write(
        "Counts characters including spaces and can automatically remove headers, footers, and page numbers."
    )

    uploaded_file = st.file_uploader(
        "Upload PDF",
        type=["pdf"],
    )
    if uploaded_file is None:
        # Nothing uploaded yet: only the uploader is shown.
        return

    # getvalue() returns the whole buffer regardless of the current stream
    # position, unlike read(), which returns only what is left after it.
    pdf_bytes = uploaded_file.getvalue()

    try:
        page_count = _count_pdf_pages(pdf_bytes)
    except (RuntimeError, ValueError):
        # NOTE(review): PyMuPDF's file errors are expected to derive from
        # RuntimeError; confirm against the installed PyMuPDF version.
        st.error("The uploaded file could not be opened as a PDF.")
        return

    # --- Settings -----------------------------------------------------
    st.subheader("Settings")

    excluded_pages = st.multiselect(
        "Exclude pages",
        options=list(range(1, page_count + 1)),  # 1-based page numbers
        default=[],
    )

    col1, col2, col3 = st.columns(3)
    with col1:
        remove_headers = st.checkbox(
            "Remove headers",
            value=True,
        )
    with col2:
        remove_footers = st.checkbox(
            "Remove footers",
            value=True,
        )
    with col3:
        remove_page_numbers = st.checkbox(
            "Remove page numbers",
            value=True,
        )

    # min_value=1 guarantees the division below never divides by zero.
    characters_per_normal_page = st.number_input(
        "Tegn pr. normalside",
        min_value=1,
        value=2400,
        step=100,
    )

    # --- Counting -----------------------------------------------------
    result = count_characters(
        pdf_bytes=pdf_bytes,
        excluded_pages=set(excluded_pages),
        remove_headers=remove_headers,
        remove_footers=remove_footers,
        remove_page_numbers=remove_page_numbers,
    )
    normal_pages = result["total_characters"] / characters_per_normal_page

    st.divider()

    col1, col2 = st.columns(2)
    with col1:
        st.metric(
            "Characters including spaces",
            # Use "." as the thousands separator instead of ",".
            f"{result['total_characters']:,}".replace(",", "."),
        )
    with col2:
        st.metric(
            "Normalsider",
            # Use "," as the decimal separator instead of ".".
            f"{normal_pages:.2f}".replace(".", ","),
        )

    # --- Removed elements ---------------------------------------------
    st.divider()
    st.subheader("Elements removed from the count")

    removed_items = result["removed_items"]
    # Each removed item carries a "Type" label; group the items by it.
    removed_headers = [
        item
        for item in removed_items
        if item["Type"] in ("Sidehoved", "Løbende sidehoved")
    ]
    removed_footers = [
        item
        for item in removed_items
        if item["Type"] == "Sidefod"
    ]
    removed_page_numbers = [
        item
        for item in removed_items
        if item["Type"] == "Sidetal"
    ]

    col1, col2, col3 = st.columns(3)
    with col1:
        st.metric(
            "Headers removed",
            len(removed_headers),
        )
    with col2:
        st.metric(
            "Footers removed",
            len(removed_footers),
        )
    with col3:
        st.metric(
            "Page numbers removed",
            len(removed_page_numbers),
        )

    _show_removed_items(
        "Show removed headers",
        removed_headers,
        "No headers were removed.",
    )
    _show_removed_items(
        "Show removed footers",
        removed_footers,
        "No footers were removed.",
    )
    _show_removed_items(
        "Show removed page numbers",
        removed_page_numbers,
        "No page numbers were removed.",
    )

    # --- Per-page results ---------------------------------------------
    st.divider()
    st.subheader("Result per page")
    st.dataframe(
        result["page_results"],
        use_container_width=True,
    )

    # --- Counted text -------------------------------------------------
    st.divider()
    with st.expander("View text included in the count"):
        st.text_area(
            "Text",
            result["included_text"],
            height=400,
        )

    st.download_button(
        label="Download text as TXT",
        data=result["included_text"],
        file_name="counted_text.txt",
        mime="text/plain",
    )


# Called unconditionally so the page is built whenever this file is executed,
# exactly as the original flat script did.
main()