Spaces:
Sleeping
Sleeping
File size: 5,778 Bytes
1d8ed3b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
import fitz # PyMuPDF
import warnings
import streamlit as st
def extract_page_data_fitz(doc):
    """
    Collect per-page metadata from an open PyMuPDF document.

    For each page, the top and bottom 15% bands are scanned for the first
    purely numeric token, which is taken to be the printed page number
    (None when no numeric token appears). The full page text is captured
    as well.

    Args:
        doc: An open ``fitz.Document``.

    Returns:
        list[dict]: One dict per page with keys ``index`` (0-based position
        in the document), ``number`` (detected printed number or None) and
        ``content`` (the page's full text).
    """
    pages_data = []
    for idx, page in enumerate(doc):
        page_w, page_h = page.rect.width, page.rect.height
        # Printed page numbers usually sit in the header or footer bands.
        bands = (
            fitz.Rect(0, 0, page_w, page_h * 0.15),       # top 15%
            fitz.Rect(0, page_h * 0.85, page_w, page_h),  # bottom 15%
        )
        tokens = []
        for band in bands:
            tokens.extend(page.get_text("text", clip=band).split())
        printed_number = None
        for token in tokens:
            if token.isdigit():
                printed_number = int(token)
                break
        pages_data.append({
            "index": idx,
            "number": printed_number,
            "content": page.get_text("text"),
        })
    return pages_data
def correct_page_numbers(pages_data, sequence_length=10):
    """
    Normalize detected page numbers in-place and locate printed page 1.

    Searches for the first run of ``sequence_length`` detected numbers that
    increase by exactly 1 on physically consecutive pages. That run anchors
    a forward and backward fill over the whole document; any resulting
    number < 1 (front matter) is reset to None.

    Bug fix: the run check now also requires the page positions to be
    consecutive, not just the printed numbers. Previously a run of
    consecutive numbers spread over non-adjacent pages (pages with
    undetected numbers in between) would anchor the fill and assign wrong
    numbers to every page.

    Args:
        pages_data: List of dicts with at least ``index`` and ``number``
            keys, as produced by ``extract_page_data_fitz``.
        sequence_length: Minimum run length required to trust the detected
            numbering (default 10).

    Returns:
        int | None: The ``index`` of the page whose corrected number is 1,
        or None when no reliable sequence exists (or on malformed input).
    """
    try:
        # Keep only pages where a number was actually detected.
        seen = [(i, d["number"]) for i, d in enumerate(pages_data)
                if isinstance(d["number"], int)]
        for start in range(len(seen) - sequence_length + 1):
            anchor_index, anchor_number = seen[start]
            # Both the page positions and the printed numbers must step by 1.
            if all(
                seen[start + j][0] == anchor_index + j
                and seen[start + j][1] == anchor_number + j
                for j in range(sequence_length)
            ):
                base_index, base_number = anchor_index, anchor_number
                break
        else:
            # No trustworthy run (also covers len(seen) < sequence_length).
            return None
        # Fill forward from the anchor page...
        for offset, page in enumerate(pages_data[base_index:]):
            page["number"] = base_number + offset
        # ...and backward over the pages before it.
        for offset in range(1, base_index + 1):
            pages_data[base_index - offset]["number"] = base_number - offset
        # Numbers below 1 belong to unnumbered front matter.
        for page in pages_data:
            if page["number"] < 1:
                page["number"] = None
        return next((page["index"] for page in pages_data if page["number"] == 1), None)
    except Exception:
        # Defensive: malformed page dicts must not crash the pipeline.
        return None
def extract_text(doc, start_chapter=None):
    """
    Concatenate the document's page texts, optionally skipping front matter.

    Args:
        doc: An open ``fitz.Document``.
        start_chapter: 0-based page index to start from. When None, the
            whole document is extracted and a UserWarning is emitted.

    Returns:
        str: The newline-joined text of the selected pages.
    """
    if start_chapter is None:
        warnings.warn(
            "No chapter start has been detected: extracting text from the entire PDF.",
            UserWarning
        )
        return "\n".join(page.get_text("text") for page in doc)
    selected_pages = (
        doc[page_index].get_text("text")
        for page_index in range(start_chapter, len(doc))
    )
    return "\n".join(selected_pages)
def process_pdf():
    """
    Run the full extraction pipeline on the uploaded PDF.

    Reads the uploaded bytes from Streamlit session state, extracts
    per-page data, corrects the page numbering, then extracts the book
    text from the detected first chapter onward. Results and
    intermediates are written back into session state. Shows an error
    and returns early when no PDF has been uploaded.
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    if not pdf_bytes:
        st.error("No PDF uploaded.")
        return
    with st.spinner("Processing uploaded file..."):
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            page_infos = extract_page_data_fitz(doc)
            first_chapter_page = correct_page_numbers(page_infos)
            book_text = extract_text(doc, first_chapter_page)
        st.session_state['full_text'] = book_text
        st.session_state['pages_data_infos'] = page_infos
        st.session_state['chapters_starting_page'] = first_chapter_page
def extract_toc(page_range):
    """
    Extract the text of user-selected TOC pages from the uploaded PDF.

    Args:
        page_range: Iterable of 0-based page indices to pull text from.
            Out-of-bounds indices are skipped with a console warning.

    Returns:
        str: The newline-joined text of the valid pages (also stored in
        ``st.session_state["toc"]``), or "" when no PDF is uploaded.
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    if pdf_bytes is None:
        st.error("No PDF uploaded.")
        return ""
    chapters_content_list = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = len(doc)  # hoisted: invariant across the loop
        for page_num in page_range:
            if 0 <= page_num < page_count:
                chapters_content_list.append(doc[page_num].get_text("text"))
            else:
                print(f"Warning: Page number {page_num} is out of bounds.")
    toc_text = "\n".join(chapters_content_list)
    st.session_state["toc"] = toc_text
    # Bug fix: the success path previously returned None implicitly while the
    # error path returned "" — now the extracted text is returned consistently.
    return toc_text
def extract_chapters(chapters_dict, pages_data_corrected):
    """
    Assemble chapter records by slicing page contents per the TOC.

    Args:
        chapters_dict (list): Chapter dicts from the TOC, each with
            ``chapter_number``, ``chapter_title``, ``start_page`` and
            ``end_page`` (1-based, inclusive).
        pages_data_corrected (list): Page dicts with a ``content`` key.
            NOTE(review): indexing assumes page number N sits at list
            position N-1 — confirm against the caller.

    Side effects:
        Stores the resulting list of chapter dicts (each carrying its
        joined ``content``) in ``st.session_state['chapters_extracted']``.
    """
    chapters = []
    for chapter in chapters_dict:
        first, last = chapter['start_page'], chapter['end_page']
        # TOC pages are 1-based; range over the 0-based page list.
        body = ' '.join(
            pages_data_corrected[page_idx]['content']
            for page_idx in range(first - 1, last)
        )
        chapters.append({
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'start_page': first,
            'end_page': last,
            'content': body,
        })
    st.session_state['chapters_extracted'] = chapters