Spaces:
Sleeping
Sleeping
File size: 5,778 Bytes
1d8ed3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import fitz # PyMuPDF
import warnings
import streamlit as st
def extract_page_data_fitz(doc):
    """
    Collect per-page metadata from an open PyMuPDF document.

    For each page, the top and bottom 15% bands are scanned for the first
    purely numeric token, which is taken to be the printed page number
    (None when no numeric token appears). The full page text is captured
    as well.

    Args:
        doc: An open ``fitz.Document``.

    Returns:
        list[dict]: One dict per page with keys ``index`` (0-based position
        in the document), ``number`` (detected printed number or None) and
        ``content`` (the page's full text).
    """
    pages_data = []
    for idx, page in enumerate(doc):
        page_w, page_h = page.rect.width, page.rect.height
        # Printed page numbers usually sit in the header or footer bands.
        bands = (
            fitz.Rect(0, 0, page_w, page_h * 0.15),       # top 15%
            fitz.Rect(0, page_h * 0.85, page_w, page_h),  # bottom 15%
        )
        tokens = []
        for band in bands:
            tokens.extend(page.get_text("text", clip=band).split())
        printed_number = None
        for token in tokens:
            if token.isdigit():
                printed_number = int(token)
                break
        pages_data.append({
            "index": idx,
            "number": printed_number,
            "content": page.get_text("text"),
        })
    return pages_data
def correct_page_numbers(pages_data, sequence_length=10):
    """
    Normalize detected page numbers in-place and locate printed page 1.

    Searches for the first run of ``sequence_length`` detected numbers that
    increase by exactly 1 on physically consecutive pages. That run anchors
    a forward and backward fill over the whole document; any resulting
    number < 1 (front matter) is reset to None.

    Bug fix: the run check now also requires the page positions to be
    consecutive, not just the printed numbers. Previously a run of
    consecutive numbers spread over non-adjacent pages (pages with
    undetected numbers in between) would anchor the fill and assign wrong
    numbers to every page.

    Args:
        pages_data: List of dicts with at least ``index`` and ``number``
            keys, as produced by ``extract_page_data_fitz``.
        sequence_length: Minimum run length required to trust the detected
            numbering (default 10).

    Returns:
        int | None: The ``index`` of the page whose corrected number is 1,
        or None when no reliable sequence exists (or on malformed input).
    """
    try:
        # Keep only pages where a number was actually detected.
        seen = [(i, d["number"]) for i, d in enumerate(pages_data)
                if isinstance(d["number"], int)]
        for start in range(len(seen) - sequence_length + 1):
            anchor_index, anchor_number = seen[start]
            # Both the page positions and the printed numbers must step by 1.
            if all(
                seen[start + j][0] == anchor_index + j
                and seen[start + j][1] == anchor_number + j
                for j in range(sequence_length)
            ):
                base_index, base_number = anchor_index, anchor_number
                break
        else:
            # No trustworthy run (also covers len(seen) < sequence_length).
            return None
        # Fill forward from the anchor page...
        for offset, page in enumerate(pages_data[base_index:]):
            page["number"] = base_number + offset
        # ...and backward over the pages before it.
        for offset in range(1, base_index + 1):
            pages_data[base_index - offset]["number"] = base_number - offset
        # Numbers below 1 belong to unnumbered front matter.
        for page in pages_data:
            if page["number"] < 1:
                page["number"] = None
        return next((page["index"] for page in pages_data if page["number"] == 1), None)
    except Exception:
        # Defensive: malformed page dicts must not crash the pipeline.
        return None
def extract_text(doc, start_chapter=None):
    """
    Concatenate the document's page texts, optionally skipping front matter.

    Args:
        doc: An open ``fitz.Document``.
        start_chapter: 0-based page index to start from. When None, the
            whole document is extracted and a UserWarning is emitted.

    Returns:
        str: The newline-joined text of the selected pages.
    """
    if start_chapter is None:
        warnings.warn(
            "No chapter start has been detected: extracting text from the entire PDF.",
            UserWarning
        )
        return "\n".join(page.get_text("text") for page in doc)
    selected_pages = (
        doc[page_index].get_text("text")
        for page_index in range(start_chapter, len(doc))
    )
    return "\n".join(selected_pages)
def process_pdf():
    """
    Run the full extraction pipeline on the uploaded PDF.

    Reads the uploaded bytes from Streamlit session state, extracts
    per-page data, corrects the page numbering, then extracts the book
    text from the detected first chapter onward. Results and
    intermediates are written back into session state. Shows an error
    and returns early when no PDF has been uploaded.
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    if not pdf_bytes:
        st.error("No PDF uploaded.")
        return
    with st.spinner("Processing uploaded file..."):
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            page_infos = extract_page_data_fitz(doc)
            first_chapter_page = correct_page_numbers(page_infos)
            book_text = extract_text(doc, first_chapter_page)
        st.session_state['full_text'] = book_text
        st.session_state['pages_data_infos'] = page_infos
        st.session_state['chapters_starting_page'] = first_chapter_page
def extract_toc(page_range):
    """
    Extract the text of user-selected TOC pages from the uploaded PDF.

    Args:
        page_range: Iterable of 0-based page indices to pull text from.
            Out-of-bounds indices are skipped with a console warning.

    Returns:
        str: The newline-joined text of the valid pages (also stored in
        ``st.session_state["toc"]``), or "" when no PDF is uploaded.
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    if pdf_bytes is None:
        st.error("No PDF uploaded.")
        return ""
    chapters_content_list = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = len(doc)  # hoisted: invariant across the loop
        for page_num in page_range:
            if 0 <= page_num < page_count:
                chapters_content_list.append(doc[page_num].get_text("text"))
            else:
                print(f"Warning: Page number {page_num} is out of bounds.")
    toc_text = "\n".join(chapters_content_list)
    st.session_state["toc"] = toc_text
    # Bug fix: the success path previously returned None implicitly while the
    # error path returned "" — now the extracted text is returned consistently.
    return toc_text
def extract_chapters(chapters_dict, pages_data_corrected):
    """
    Assemble chapter records by slicing page contents per the TOC.

    Args:
        chapters_dict (list): Chapter dicts from the TOC, each with
            ``chapter_number``, ``chapter_title``, ``start_page`` and
            ``end_page`` (1-based, inclusive).
        pages_data_corrected (list): Page dicts with a ``content`` key.
            NOTE(review): indexing assumes page number N sits at list
            position N-1 — confirm against the caller.

    Side effects:
        Stores the resulting list of chapter dicts (each carrying its
        joined ``content``) in ``st.session_state['chapters_extracted']``.
    """
    chapters = []
    for chapter in chapters_dict:
        first, last = chapter['start_page'], chapter['end_page']
        # TOC pages are 1-based; range over the 0-based page list.
        body = ' '.join(
            pages_data_corrected[page_idx]['content']
            for page_idx in range(first - 1, last)
        )
        chapters.append({
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'start_page': first,
            'end_page': last,
            'content': body,
        })
    st.session_state['chapters_extracted'] = chapters