"""Streamlit front end for counting the characters in an uploaded PDF.

The user uploads a PDF, optionally excludes pages and toggles removal of
headers, footers and page numbers. The counting itself is delegated to
``pdf_counter.count_characters``; this script only collects the settings
and renders the returned result (total count, removed elements, per-page
results and the text that was counted).
"""

import streamlit as st
import fitz

from pdf_counter import count_characters


# Values of an item's "Type" field, grouped the way the UI reports them.
# The strings are matched against data produced by count_characters, so
# they must stay exactly as they are.
HEADER_TYPES = ("Sidehoved", "Løbende sidehoved")
FOOTER_TYPES = ("Sidefod",)
PAGE_NUMBER_TYPES = ("Sidetal",)


def _get_page_count(pdf_bytes):
    """Return the number of pages in the PDF held in *pdf_bytes*.

    The document is opened only to read its length and is always closed
    again, so no PyMuPDF handle outlives the call.
    """
    doc = fitz.open(stream=pdf_bytes, filetype="pdf")
    try:
        return len(doc)
    finally:
        doc.close()


def _items_of_type(items, types):
    """Return the entries of *items* whose "Type" value is in *types*."""
    return [item for item in items if item["Type"] in types]


def _show_removed(label, items, empty_message):
    """Render an expander listing *items*, or *empty_message* if empty."""
    with st.expander(label):
        if items:
            st.dataframe(items, use_container_width=True)
        else:
            st.info(empty_message)


st.set_page_config(
    page_title="PDF Character Counter",
    layout="wide",
)

st.title("PDF Character Counter")
st.write(
    "Counts characters including spaces and can automatically remove headers, footers, and page numbers."
)

uploaded_file = st.file_uploader(
    "Upload PDF",
    type=["pdf"],
)

if uploaded_file:
    # getvalue() returns the whole buffer regardless of the current
    # stream position, unlike read().
    pdf_bytes = uploaded_file.getvalue()

    try:
        page_count = _get_page_count(pdf_bytes)
    except RuntimeError:
        # PyMuPDF reports unreadable or empty data with RuntimeError
        # subclasses; show a message instead of a raw traceback.
        st.error("The uploaded file could not be read as a PDF.")
        st.stop()

    st.subheader("Settings")

    # Pages are offered as 1-based numbers.
    # NOTE(review): assumes count_characters expects 1-based page numbers
    # in excluded_pages -- confirm against pdf_counter.
    excluded_pages = st.multiselect(
        "Exclude pages",
        options=list(range(1, page_count + 1)),
        default=[],
    )

    col1, col2, col3 = st.columns(3)

    with col1:
        remove_headers = st.checkbox(
            "Remove headers",
            value=True,
        )

    with col2:
        remove_footers = st.checkbox(
            "Remove footers",
            value=True,
        )

    with col3:
        remove_page_numbers = st.checkbox(
            "Remove page numbers",
            value=True,
        )

    # min_value=1 guarantees the division below never divides by zero.
    characters_per_normal_page = st.number_input(
        "Tegn pr. normalside",
        min_value=1,
        value=2400,
        step=100,
    )

    result = count_characters(
        pdf_bytes=pdf_bytes,
        excluded_pages=set(excluded_pages),
        remove_headers=remove_headers,
        remove_footers=remove_footers,
        remove_page_numbers=remove_page_numbers,
    )

    total_characters = result["total_characters"]
    normal_pages = total_characters / characters_per_normal_page

    st.divider()

    col1, col2 = st.columns(2)

    with col1:
        # Show the count with "." as the thousands separator.
        st.metric(
            "Characters including spaces",
            f"{total_characters:,}".replace(",", "."),
        )

    with col2:
        # Show the page count with "," as the decimal separator.
        st.metric(
            "Normalsider",
            f"{normal_pages:.2f}".replace(".", ","),
        )

    st.divider()

    st.subheader("Elements removed from the count")

    removed_items = result["removed_items"]
    removed_headers = _items_of_type(removed_items, HEADER_TYPES)
    removed_footers = _items_of_type(removed_items, FOOTER_TYPES)
    removed_page_numbers = _items_of_type(removed_items, PAGE_NUMBER_TYPES)

    col1, col2, col3 = st.columns(3)

    with col1:
        st.metric(
            "Headers removed",
            len(removed_headers),
        )

    with col2:
        st.metric(
            "Footers removed",
            len(removed_footers),
        )

    with col3:
        st.metric(
            "Page numbers removed",
            len(removed_page_numbers),
        )

    _show_removed(
        "Show removed headers",
        removed_headers,
        "No headers were removed.",
    )
    _show_removed(
        "Show removed footers",
        removed_footers,
        "No footers were removed.",
    )
    _show_removed(
        "Show removed page numbers",
        removed_page_numbers,
        "No page numbers were removed.",
    )

    st.divider()

    st.subheader("Result per page")

    st.dataframe(
        result["page_results"],
        use_container_width=True,
    )

    st.divider()

    with st.expander("View text included in the count"):
        st.text_area(
            "Text",
            result["included_text"],
            height=400,
        )

    # Kept outside the expander so the download is reachable without
    # opening it.
    st.download_button(
        label="Download text as TXT",
        data=result["included_text"],
        file_name="counted_text.txt",
        mime="text/plain",
    )