Spaces:

davidepanza
/

test2text

Sleeping

test2text / app /backend /raw_text_processing.py

Davide Panza

Upload 56 files

1d8ed3b verified 8 months ago

5.78 kB

	import fitz # PyMuPDF
	import warnings
	import streamlit as st


	def extract_page_data_fitz(doc):
	    """
	    Extract the detected page number and the full text of every page of a PDF.

	    For each page of `doc` (an open PyMuPDF document) the top 15% and the
	    bottom 15% of the page are scanned for whitespace-separated tokens. The
	    first token made up only of decimal digits (top band first, then bottom
	    band) is taken as the page number.

	    Returns:
	        list[dict]: One entry per page, in document order, with keys
	            "index"   - zero-based position of the page in `doc`,
	            "number"  - detected page number as an int, or None if no
	                        numeric token was found in either band,
	            "content" - full text of the page.
	    """
	    pages_data = []

	    for i, page in enumerate(doc):
	        height = page.rect.height
	        width = page.rect.width

	        # Only the header and footer bands are searched for a page number.
	        top_rect = fitz.Rect(0, 0, width, height * 0.15)
	        bottom_rect = fitz.Rect(0, height * 0.85, width, height)

	        top_text = page.get_text("text", clip=top_rect).split()
	        bottom_text = page.get_text("text", clip=bottom_rect).split()

	        # str.isdecimal() rather than str.isdigit(): isdigit() is also true for
	        # characters such as superscript footnote markers ("²") or circled
	        # digits, which int() rejects with ValueError and which would abort
	        # the extraction of the whole document.
	        found_number = next(
	            (int(token) for token in top_text + bottom_text if token.isdecimal()),
	            None,
	        )
	        full_text = page.get_text("text")

	        pages_data.append({
	            "index": i,
	            "number": found_number,
	            "content": full_text,
	        })

	    return pages_data


	def correct_page_numbers(pages_data, sequence_length=10):
	    """
	    Renumber every page entry in place from the first reliable run of numbers.

	    Among the entries whose "number" is an int, the first run of
	    `sequence_length` values that each increase by one is located. The first
	    entry of that run becomes the anchor: every entry of `pages_data` is
	    renumbered relative to it by list position (forward and backward), and
	    any resulting number below 1 is replaced with None.

	    Returns the "index" value of the entry that ends up numbered 1. Returns
	    None when no such run exists, when no entry ends up numbered 1, or when
	    any exception is raised while processing.
	    """
	    try:
	        numbered = [
	            (position, entry["number"])
	            for position, entry in enumerate(pages_data)
	            if isinstance(entry["number"], int)
	        ]

	        # Locate the first window of detected numbers that counts up by one.
	        anchor_slot = None
	        for first in range(len(numbered) - sequence_length + 1):
	            start_value = numbered[first][1]
	            is_run = all(
	                numbered[first + step][1] == start_value + step
	                for step in range(sequence_length)
	            )
	            if is_run:
	                anchor_slot = first
	                break

	        if anchor_slot is None:
	            return None

	        anchor_position, anchor_number = numbered[anchor_slot]

	        # With the anchor fixed, each page number is its list position plus a
	        # constant shift; anything that would fall below 1 has no number.
	        shift = anchor_number - anchor_position
	        for position, entry in enumerate(pages_data):
	            renumbered = position + shift
	            entry["number"] = renumbered if renumbered >= 1 else None

	        for entry in pages_data:
	            if entry["number"] == 1:
	                return entry['index']
	        return None

	    except Exception:
	        return None


	def extract_text(doc, start_chapter=None):
	    """
	    Return the text of `doc` from page index `start_chapter` to the end.

	    The text of each selected page is joined with newlines. When
	    `start_chapter` is None a UserWarning is issued and the text of every
	    page in `doc` is returned instead.
	    """
	    if start_chapter is None:
	        warnings.warn(
	            "No chapter start has been detected: extracting text from the entire PDF.",
	            UserWarning
	        )
	        selected_pages = iter(doc)
	    else:
	        selected_pages = (doc[page_index] for page_index in range(start_chapter, len(doc)))

	    return "\n".join(selected.get_text("text") for selected in selected_pages)


	def process_pdf():
	    """
	    Run the extraction pipeline on the PDF held in the Streamlit session.

	    Reads the bytes stored under st.session_state["uploaded_pdf_bytes"]. If
	    they are missing or empty, an error is shown with st.error and nothing
	    else happens. Otherwise the PDF is opened with PyMuPDF, per-page data is
	    extracted, page numbers are corrected, and the text from the detected
	    first chapter onward is extracted.

	    Results are written to st.session_state under the keys 'full_text',
	    'pages_data_infos' and 'chapters_starting_page'. Returns None.
	    """
	    uploaded = st.session_state.get("uploaded_pdf_bytes")
	    if not uploaded:
	        st.error("No PDF uploaded.")
	        return

	    with st.spinner("Processing uploaded file..."), fitz.open(stream=uploaded, filetype="pdf") as pdf:
	        page_records = extract_page_data_fitz(pdf)
	        first_chapter_index = correct_page_numbers(page_records)
	        book_text = extract_text(pdf, first_chapter_index)

	    # Publish the results for the rest of the app, in a fixed order.
	    results = {
	        'full_text': book_text,
	        'pages_data_infos': page_records,
	        'chapters_starting_page': first_chapter_index,
	    }
	    for key, value in results.items():
	        st.session_state[key] = value


	def extract_toc(page_range):
	    """
	    Extract the text of user-selected pages (the table of contents).

	    Reads the uploaded PDF bytes from st.session_state["uploaded_pdf_bytes"],
	    joins the text of every requested page with newlines and stores the
	    result in st.session_state["toc"].

	    Args:
	        page_range: Iterable of zero-based page indices into the PDF.
	            Indices outside the document are skipped with a printed warning.
	    Returns:
	        str: The joined TOC text, or "" when no PDF has been uploaded
	        (in which case an error is shown and st.session_state["toc"] is
	        left untouched).
	    """
	    pdf_bytes = st.session_state.get("uploaded_pdf_bytes")
	    # Empty bytes are treated like a missing upload; otherwise they would
	    # pass this guard and fail inside fitz.open.
	    if not pdf_bytes:
	        st.error("No PDF uploaded.")
	        return ""

	    chapters_content_list = []
	    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
	        page_count = len(doc)
	        for page_num in page_range:
	            if 0 <= page_num < page_count:
	                chapters_content_list.append(doc[page_num].get_text("text"))
	            else:
	                print(f"Warning: Page number {page_num} is out of bounds.")

	    toc_text = "\n".join(chapters_content_list)
	    st.session_state["toc"] = toc_text
	    # Return the text as well so both exit paths yield a str.
	    return toc_text


	def extract_chapters(chapters_dict, pages_data_corrected):
	    """
	    Build one record per chapter by joining the text of its pages.

	    Args:
	        chapters_dict (list): Chapter dictionaries from the TOC. Each one is
	            read for 'chapter_number', 'chapter_title', 'start_page' and
	            'end_page'; the page bounds are treated as 1-based and inclusive.
	        pages_data_corrected (list): Page data dictionaries with a 'content'
	            key. Page N is looked up at list position N - 1.
	            NOTE(review): the lookup uses list position, not each entry's
	            'number' field - confirm that TOC page numbers line up with
	            list positions in the caller.
	    Returns:
	        list: List of dictionaries, each containing chapter details
	        ('chapter_number', 'chapter_title', 'start_page', 'end_page') and
	        the chapter 'content' (page texts joined with a single space).
	        The same list is stored in st.session_state['chapters_extracted'].
	    Raises:
	        IndexError: If a chapter's end_page lies beyond the last page entry.
	        KeyError: If a chapter or page dictionary lacks a required key.
	    """
	    chapters = []

	    for chapter in chapters_dict:
	        start_page = chapter['start_page']
	        end_page = chapter['end_page']

	        # Clamp at 0: a start_page of 0 or less would otherwise produce a
	        # negative index and silently pull in pages from the end of the list.
	        first_index = max(start_page - 1, 0)
	        chapter_text = ' '.join(
	            pages_data_corrected[page_index]['content']
	            for page_index in range(first_index, end_page)
	        )

	        chapters.append({
	            'chapter_number': chapter['chapter_number'],
	            'chapter_title': chapter['chapter_title'],
	            'start_page': start_page,
	            'end_page': end_page,
	            'content': chapter_text,
	        })

	    st.session_state['chapters_extracted'] = chapters
	    # Returned as well, as the docstring has always promised.
	    return chapters