import warnings

import fitz  # PyMuPDF
import streamlit as st


def extract_page_data_fitz(doc):
    """
    Extract page numbers and text from a PDF using PyMuPDF.

    Looks for a printed page number in the top and bottom 15% of each
    page; the first all-digit token found there is taken as the number.

    Args:
        doc: An open ``fitz.Document``.

    Returns:
        list[dict]: One dict per page with keys ``index`` (physical page
        index), ``number`` (detected printed page number, or ``None``)
        and ``content`` (full page text).
    """
    pages_data = []
    for i, page in enumerate(doc):
        height = page.rect.height
        width = page.rect.width
        # Clip regions where printed page numbers usually live
        # (header and footer bands).
        top_rect = fitz.Rect(0, 0, width, height * 0.15)
        bottom_rect = fitz.Rect(0, height * 0.85, width, height)
        top_text = page.get_text("text", clip=top_rect).split()
        bottom_text = page.get_text("text", clip=bottom_rect).split()
        # First purely numeric token in header/footer wins; None if absent.
        found_number = next(
            (int(text) for text in top_text + bottom_text if text.isdigit()),
            None,
        )
        full_text = page.get_text("text")
        pages_data.append({
            "index": i,
            "number": found_number,
            "content": full_text
        })
    return pages_data


def correct_page_numbers(pages_data, sequence_length=10):
    """
    Correct page numbers in place by anchoring on the first run of
    ``sequence_length`` consecutive detected numbers, then filling the
    remaining pages forward and backward from that anchor. Resulting
    values below 1 are reset to ``None``.

    Args:
        pages_data: Page dicts as produced by ``extract_page_data_fitz``;
            mutated in place.
        sequence_length: Length of the consecutive run required to trust
            the detected numbering.

    Returns:
        int | None: Physical index of the page numbered 1, or ``None``
        when no trustworthy sequence is found (or on any error).
    """
    try:
        seen = [
            (i, d["number"])
            for i, d in enumerate(pages_data)
            if isinstance(d["number"], int)
        ]
        for start in range(len(seen) - sequence_length + 1):
            if all(
                seen[start + j][1] == seen[start][1] + j
                for j in range(sequence_length)
            ):
                base_index, base_number = seen[start]
                break
        else:
            # No consecutive run long enough to anchor on.
            return None
        # Fill forward from the anchor, assuming physically consecutive pages.
        for offset, page in enumerate(pages_data[base_index:], start=0):
            page["number"] = base_number + offset
        # Fill backward for the pages before the anchor.
        for offset in range(1, base_index + 1):
            page = pages_data[base_index - offset]
            page["number"] = base_number - offset
        # Front matter may end up numbered <= 0; treat it as unnumbered.
        for page in pages_data:
            if page["number"] < 1:
                page["number"] = None
        return next(
            (page['index'] for page in pages_data if page["number"] == 1),
            None,
        )
    except Exception:
        # Best-effort: malformed page data means the numbering is unusable.
        return None


def extract_text(doc, start_chapter=None):
    """
    Extract the book text starting from the given physical page index.

    Args:
        doc: An open ``fitz.Document``.
        start_chapter: Physical index of the first page to extract. When
            ``None``, the whole document is extracted and a
            ``UserWarning`` is emitted.

    Returns:
        str: Page texts joined with newlines.
    """
    if start_chapter is not None:
        all_pages_text = [
            doc[page_index].get_text("text")
            for page_index in range(start_chapter, len(doc))
        ]
        return "\n".join(all_pages_text)
    warnings.warn(
        "No chapter start has been detected: extracting text from the entire PDF.",
        UserWarning
    )
    return "\n".join(page.get_text("text") for page in doc)


def process_pdf():
    """
    Process the uploaded PDF: extract per-page data, correct the page
    numbering, and store the text starting from the first chapter.

    Reads ``st.session_state['uploaded_pdf_bytes']`` and writes
    ``full_text``, ``pages_data_infos`` and ``chapters_starting_page``
    back into ``st.session_state``.
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    if not pdf_bytes:
        st.error("No PDF uploaded.")
        return
    with st.spinner("Processing uploaded file..."):
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            pages_data_infos = extract_page_data_fitz(doc)
            chapters_starting_page = correct_page_numbers(pages_data_infos)
            full_text = extract_text(doc, chapters_starting_page)
            st.session_state['full_text'] = full_text
            st.session_state['pages_data_infos'] = pages_data_infos
            st.session_state['chapters_starting_page'] = chapters_starting_page


def extract_toc(page_range):
    """
    Extract text from specific pages in the uploaded PDF.

    Used to extract the TOC based on a range of page numbers indicated
    by the user; the result is stored in ``st.session_state['toc']``.

    Args:
        page_range: Iterable of 0-based physical page indices.

    Returns:
        str: Empty string when no PDF is uploaded; otherwise ``None``
        (the TOC text is delivered via session state).
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    if pdf_bytes is None:
        st.error("No PDF uploaded.")
        return ""
    chapters_content_list = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page_num in page_range:
            if 0 <= page_num < len(doc):
                chapters_content_list.append(doc[page_num].get_text("text"))
            else:
                # Surface the problem in the UI, consistent with the
                # st.error/st.warning usage elsewhere (was: print to stdout).
                st.warning(f"Page number {page_num} is out of bounds.")
    toc_text = "\n".join(chapters_content_list)
    st.session_state["toc"] = toc_text


def extract_chapters(chapters_dict, pages_data_corrected):
    """
    Extract chapter contents from the corrected pages data and store
    them in ``st.session_state['chapters_extracted']``.

    Args:
        chapters_dict (list): Chapter dicts from the TOC, each carrying
            ``chapter_number``, ``chapter_title``, ``start_page`` and
            ``end_page`` (1-based, inclusive).
        pages_data_corrected (list): Page dicts with a ``content`` key.
    """
    chapters = []
    for chapter in chapters_dict:
        start_page = chapter['start_page']
        end_page = chapter['end_page']
        # Clamp the 1-based inclusive page range to the available pages so
        # a TOC entry that overruns the document cannot raise IndexError.
        first = max(start_page - 1, 0)
        last = min(end_page, len(pages_data_corrected))
        chapter_text = ' '.join(
            pages_data_corrected[idx]['content'] for idx in range(first, last)
        )
        chapters.append({
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'start_page': start_page,
            'end_page': end_page,
            'content': chapter_text
        })
    st.session_state['chapters_extracted'] = chapters