test2text / app /backend /raw_text_processing.py
Davide Panza
Upload 56 files
1d8ed3b verified
import fitz # PyMuPDF
import warnings
import streamlit as st
def extract_page_data_fitz(doc, margin_ratio=0.15):
    """
    Extract a candidate printed page number and the full text of every page.

    The page number is searched in the header and footer bands of each page
    (by default the top and bottom 15% of the page height). Tokens from the
    top band are scanned before tokens from the bottom band, and the first
    token made only of decimal digits is used.

    Args:
        doc: An open PyMuPDF document (iterated page by page).
        margin_ratio: Fraction of the page height used for the header band
            and for the footer band. Defaults to 0.15.

    Returns:
        list: One dictionary per page with the keys "index" (0-based
        position of the page in the document), "number" (detected page
        number as int, or None when no numeric token was found) and
        "content" (full text of the page).
    """
    pages_data = []
    for i, page in enumerate(doc):
        height = page.rect.height
        width = page.rect.width
        top_rect = fitz.Rect(0, 0, width, height * margin_ratio)
        bottom_rect = fitz.Rect(0, height * (1 - margin_ratio), width, height)

        top_tokens = page.get_text("text", clip=top_rect).split()
        bottom_tokens = page.get_text("text", clip=bottom_rect).split()

        # str.isdecimal() is used instead of str.isdigit(): isdigit() also
        # accepts characters such as superscripts ("²") that int() rejects,
        # which would raise ValueError and abort the whole extraction.
        found_number = next(
            (int(token) for token in top_tokens + bottom_tokens if token.isdecimal()),
            None,
        )

        pages_data.append({
            "index": i,
            "number": found_number,
            "content": page.get_text("text"),
        })
    return pages_data
def correct_page_numbers(pages_data, sequence_length=10):
    """
    Renumber pages in place from the first run of consecutive page numbers.

    The detected integer page numbers are scanned, in document order, for the
    first run of `sequence_length` values that each increase by exactly one.
    The first page of that run becomes the anchor: every page in `pages_data`
    is renumbered relative to it (forward and backward by list position), and
    any resulting number below 1 is replaced with None.

    Args:
        pages_data (list): Page dictionaries with at least the keys "index"
            and "number". The "number" values are modified in place.
        sequence_length (int): Number of consecutive detected page numbers
            required to accept an anchor. Values below 1 behave like 1.

    Returns:
        The "index" value of the first page whose corrected number is 1.
        None is returned when no qualifying run exists, when no page ends up
        numbered 1, or when the page records are malformed (missing keys or
        unsupported types).
    """
    try:
        # A window shorter than one entry cannot be checked; clamping to 1
        # yields the same anchor as an empty check would, without running
        # past the end of an empty list.
        window = max(sequence_length, 1)

        detected = [
            (position, page["number"])
            for position, page in enumerate(pages_data)
            if isinstance(page["number"], int)
        ]

        for start in range(len(detected) - window + 1):
            anchor_position, anchor_number = detected[start]
            if all(
                detected[start + step][1] == anchor_number + step
                for step in range(window)
            ):
                break
        else:
            # No run of consecutive numbers was found: leave pages untouched.
            return None

        # Renumber every page by its distance from the anchor; numbers that
        # would fall before page 1 (front matter) are cleared.
        for position, page in enumerate(pages_data):
            corrected = anchor_number + (position - anchor_position)
            page["number"] = corrected if corrected >= 1 else None

        return next(
            (page["index"] for page in pages_data if page["number"] == 1),
            None,
        )
    except (KeyError, TypeError):
        # Malformed input (missing "number"/"index", non-mapping entries,
        # non-iterable pages_data, non-numeric sequence_length).
        return None
def extract_text(doc, start_chapter=None):
    """
    Return the text of the document as a single newline-joined string.

    When `start_chapter` is given, only the pages from that 0-based index to
    the end of the document are included. When it is None, a UserWarning is
    emitted and the text of every page is returned.
    """
    if start_chapter is None:
        warnings.warn(
            "No chapter start has been detected: extracting text from the entire PDF.",
            UserWarning
        )
        return "\n".join(page.get_text("text") for page in doc)

    # Collect the pages from the chapter start onwards, one entry per page.
    page_texts = []
    for page_index in range(start_chapter, len(doc)):
        page_texts.append(doc[page_index].get_text("text"))
    return "\n".join(page_texts)
def process_pdf():
    """
    Run the extraction pipeline on the PDF stored in the Streamlit session.

    Reads the raw bytes from st.session_state["uploaded_pdf_bytes"]; when
    they are missing or empty an error is shown and nothing else happens.
    Otherwise the per-page data, the index of the first chapter page and the
    text from that page onwards are computed and written to the session state
    under "pages_data_infos", "chapters_starting_page" and "full_text".
    """
    uploaded = st.session_state.get("uploaded_pdf_bytes")
    if not uploaded:
        st.error("No PDF uploaded.")
        return

    with st.spinner("Processing uploaded file..."):
        with fitz.open(stream=uploaded, filetype="pdf") as pdf:
            page_records = extract_page_data_fitz(pdf)
            # Also renumbers page_records in place.
            first_chapter_index = correct_page_numbers(page_records)
            book_text = extract_text(pdf, first_chapter_index)

        st.session_state['full_text'] = book_text
        st.session_state['pages_data_infos'] = page_records
        st.session_state['chapters_starting_page'] = first_chapter_index
def extract_toc(page_range):
    """
    Extract the table-of-contents text from user-selected pages of the PDF.

    The PDF bytes are read from st.session_state["uploaded_pdf_bytes"]. The
    text of every requested page that exists in the document is joined with
    newlines and stored in st.session_state["toc"]. Page numbers outside the
    document are skipped with a printed warning.

    Args:
        page_range: Iterable of 0-based page indices to read.

    Returns:
        An empty string when no PDF has been uploaded (after showing an
        error); otherwise None, with the result available in the session
        state.
    """
    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
    # Treat empty bytes like a missing upload (same check as process_pdf) so
    # an empty stream is reported to the user instead of reaching fitz.open.
    if not pdf_bytes:
        st.error("No PDF uploaded.")
        return ""

    chapters_content_list = []
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        page_count = len(doc)
        for page_num in page_range:
            if 0 <= page_num < page_count:
                chapters_content_list.append(doc[page_num].get_text("text"))
            else:
                print(f"Warning: Page number {page_num} is out of bounds.")

    st.session_state["toc"] = "\n".join(chapters_content_list)
def extract_chapters(chapters_dict, pages_data_corrected):
    """
    Assemble the text of each chapter and store the result in the session.

    Args:
        chapters_dict: Iterable of chapter dictionaries from the TOC, each
            with the keys 'chapter_number', 'chapter_title', 'start_page'
            and 'end_page'. 'start_page' and 'end_page' are used as 1-based,
            inclusive positions into `pages_data_corrected`.
        pages_data_corrected (list): Page dictionaries, each with a
            'content' entry holding the page text.

    Returns:
        None. The list of chapter dictionaries (TOC fields plus 'content',
        the chapter pages joined with single spaces) is written to
        st.session_state['chapters_extracted'].
    """
    chapters = []
    for chapter in chapters_dict:
        start_page = chapter['start_page']
        end_page = chapter['end_page']

        # Clamp the bounds before slicing: a start_page below 1 must not wrap
        # around to the end of the list through a negative index, and an
        # end_page past the last available page is truncated rather than
        # raising IndexError.
        first_index = max(start_page - 1, 0)
        stop_index = max(end_page, first_index)
        chapter_text = ' '.join(
            page['content']
            for page in pages_data_corrected[first_index:stop_index]
        )

        chapters.append({
            'chapter_number': chapter['chapter_number'],
            'chapter_title': chapter['chapter_title'],
            'start_page': start_page,
            'end_page': end_page,
            'content': chapter_text,
        })

    st.session_state['chapters_extracted'] = chapters