Spaces:

ChinmayBH
/

PDF_DATA_EXTRACTOR_PAGEWISE

Running

App Files Files Community

ChinmayBH commited on Aug 14, 2024

Commit

4516170

verified ·

1 Parent(s): a6cd894

updated app.py

Browse files

Files changed (1) hide show

app.py +250 -0

app.py CHANGED Viewed

	@@ -0,0 +1,250 @@

+import streamlit as st
+import os
+import json
+import fitz
+from io import BytesIO
+from PIL import Image
+import pandas as pd
+import tempfile
+def extract_text_images(
+        pdf_path: str, output_folder: str,
+        minimum_font_size: int,
+        extraction_type: str = 'both'
+        ) -> dict:
+    """
+    Extracts text and/or images from a PDF and organizes them by pages.
+    Params
+    -------
+    pdf_path: str
+        Path to the input PDF file.
+    output_folder: str
+        Path to the output folder where extracted data will be saved.
+    minimum_font_size: int
+        Minimum font size below which the text will be ignored.
+    extraction_type: str
+        Type of extraction, either 'text', 'images', or 'both'.
+    Returns
+    -------
+    dict
+        The extracted data organized by pages.
+    """
+    if not os.path.exists(output_folder):
+        os.makedirs(output_folder)
+    extraction_data = []
+    pdf_document = fitz.open(pdf_path)
+    for page_number in range(pdf_document.page_count):
+        page = pdf_document.load_page(page_number)
+        elements = []
+        if extraction_type in ('text', 'both'):
+            text_blocks = page.get_text("dict")["blocks"]
+            lines = {}
+            for block in text_blocks:
+                if block["type"] == 0:
+                    for line in block["lines"]:
+                        for span in line["spans"]:
+                            font_size = span["size"]
+                            top = span["bbox"][1]
+                            if font_size < minimum_font_size:
+                                continue
+                            if top not in lines:
+                                lines[top] = []
+                            lines[top].append(span)
+            for top in sorted(lines.keys()):
+                line = lines[top]
+                line_text = " ".join([span['text'] for span in line])
+                elements.append({
+                    'type': 'text',
+                    'font_size': line[0]['size'],
+                    'page': page_number + 1,
+                    'content': line_text,
+                    'x0': line[0]['bbox'][0],
+                    'top': top,
+                })
+        if extraction_type in ('images', 'both'):
+            image_list = page.get_images(full=True)
+            for img_index, img in enumerate(image_list):
+                xref = img[0]
+                base_image = pdf_document.extract_image(xref)
+                image_bytes = base_image["image"]
+                image_filename = os.path.join(
+                    output_folder,
+                    f"page_{page_number + 1}_img_{img_index + 1}.png"
+                )
+                with open(image_filename, "wb") as img_file:
+                    img_file.write(image_bytes)
+                img_rect = page.get_image_bbox(img)
+                elements.append({
+                    'type': 'image',
+                    'page': page_number + 1,
+                    'path': image_filename,
+                    'x0': img_rect.x0,
+                    'top': img_rect.y0
+                })
+        elements.sort(key=lambda e: (e['top'], e['x0']))
+        page_content = []
+        for element in elements:
+            if element['type'] == 'text':
+                if page_content and page_content[-1]['type'] == 'text':
+                    page_content[-1]['content'] += " " + element['content']
+                else:
+                    page_content.append({
+                        'type': 'text',
+                        'content': element['content']
+                    })
+            elif element['type'] == 'image':
+                page_content.append({
+                    'type': 'image',
+                    'path': element['path']
+                })
+        extraction_data.append({
+            'page': page_number + 1,
+            'content': page_content
+        })
+    pdf_document.close()
+    return extraction_data
+def convert_to_xlsx(data: dict) -> BytesIO:
+    rows = []
+    for item in data:
+        page_number = item['page']
+        content_list = item['content']
+        for content in content_list:
+            if content['type'] == 'text':
+                rows.append({
+                    'Page': page_number,
+                    'Content': content['content']
+                })
+            elif content['type'] == 'image':
+                rows.append({
+                    'Page': page_number,
+                    'Content': f"[Image: {content['path']}]"
+                })
+    df = pd.DataFrame(rows)
+    output = BytesIO()
+    with pd.ExcelWriter(output, engine='xlsxwriter') as writer:
+        df.to_excel(writer, index=False, sheet_name='Extraction')
+    output.seek(0)
+    return output
+def main():
+    st.markdown("<h1 style='text-align: center; color: blue;'>PDF DATA SNACHER:PAGEWISE</h1>", unsafe_allow_html=True)
+    st.markdown("<h3 style='text-align: center;color: brown;'>Extract valuable text and images from PDFs effortlessly and Convert PDFs into editable text and high-quality images </h3>", unsafe_allow_html=True)
+    st.sidebar.markdown('<p class="sidebar-header">PDF PREVIEW</p>', unsafe_allow_html=True)
+    pdf_file = st.file_uploader("Upload PDF", type="pdf")
+    if pdf_file is not None:
+        num_pages_to_preview = st.sidebar.slider(
+            "Select number of pages to preview:",
+            min_value=1, max_value=5, value=1
+        )
+        pdf_document = fitz.open(stream=pdf_file.read(), filetype="pdf")
+        for page_num in range(min(num_pages_to_preview, pdf_document.page_count)):
+            page = pdf_document.load_page(page_num)
+            pix = page.get_pixmap()
+            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            st.sidebar.image(image, caption=f"Page {page_num + 1} Preview", use_column_width=True)
+    st.info("You can select **only text** or **only images** or **text and images both** to extract form pdf")
+    extraction_type = st.selectbox(
+        "Choose extraction type:",
+        ("text", "images", "both")
+    )
+    st.info("Minimum font size is the size below which size, the text will get ignored for extraction")
+    minimum_font_size = st.number_input(
+        "Minimum font size to extract:",
+        min_value=1, value=2
+    )
+    if st.button("Start Extraction"):
+        if pdf_file is not None:
+            with tempfile.TemporaryDirectory() as output_folder:
+                temp_pdf_path = os.path.join(output_folder, pdf_file.name)
+                with open(temp_pdf_path, "wb") as f:
+                    f.write(pdf_file.getvalue())
+                extraction_data = extract_text_images(
+                    temp_pdf_path,
+                    output_folder,
+                    minimum_font_size,
+                    extraction_type
+                )
+                st.json(extraction_data)
+                xlsx_data = convert_to_xlsx(extraction_data)
+                col1, col2 = st.columns(2)
+                with col1:
+                    st.download_button(
+                        label="Download JSON",
+                        data=json.dumps(extraction_data, ensure_ascii=False, indent=4).encode('utf-8'),
+                        file_name='extraction_data.json',
+                        mime='application/json')
+                with col2:
+                    st.download_button(
+                        label="Download XLSX",
+                        data=xlsx_data,
+                        file_name='extraction_data.xlsx',
+                        mime='application/vnd.openxmlformats-officedocument.spreadsheetml.sheet')
+        else:
+            st.error("Please upload a PDF file.")
+    st.markdown(
+        """
+        <style>
+        .footer {
+            position: fixed;
+            bottom: 0;
+            left: 0;
+            width: 100%;
+            background-color: #F0F0F0;
+            font-family:cursive;
+            text-align: right;
+            padding: 5px 0;
+            font-size:20px;
+            font-weight: bold;
+            color: #FF0000;
+        }
+        </style>
+        <div class="footer">
+            CREATED BY: CHINMAY BHALERAO
+        </div>
+        """,
+        unsafe_allow_html=True
+    )
+# Build the page only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()