# doc_knowledge_base/pdf_processor.py
"""
PDF processing utilities for extracting text, sections, and structured data from clinical documents.
"""
import os
import re
import fitz # PyMuPDF
from typing import Dict, List, Tuple, Optional, Any
import json
from collections import defaultdict
from langchain.text_splitter import RecursiveCharacterTextSplitter
class PDFProcessor:
    """Extract text, sections, and metadata from clinical PDFs and chunk them for embedding."""

    def __init__(self, upload_dir="./data/uploads"):
        """Create a processor whose uploaded PDFs live in ``upload_dir`` (created if absent)."""
        self.upload_dir = upload_dir
        # Ensure the target directory exists up front so later saves cannot
        # fail on a missing path.
        os.makedirs(self.upload_dir, exist_ok=True)
def save_uploaded_file(self, uploaded_file) -> str:
    """Persist an uploaded file object into the upload directory.

    The object must expose ``.name`` and ``.getbuffer()`` (e.g. a Streamlit
    upload). Returns the full path the bytes were written to.
    """
    destination = os.path.join(self.upload_dir, uploaded_file.name)
    payload = uploaded_file.getbuffer()
    with open(destination, "wb") as out_file:
        out_file.write(payload)
    return destination
def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, List[Dict]]:
    """
    Extract text from a PDF, page by page.

    Args:
        pdf_path: Path to the PDF file on disk.

    Returns:
        Tuple containing:
        - Full text string (pages separated by blank lines)
        - List of dicts with 1-based "page_num" and per-page "text"
        On any extraction error, returns ("", []) so callers degrade gracefully.
    """
    try:
        page_texts = []
        pages = []
        # Context manager guarantees the document handle is closed even if
        # a page fails mid-extraction (the original leaked the handle on error).
        with fitz.open(pdf_path) as doc:
            for page_num, page in enumerate(doc):
                text = page.get_text()
                page_texts.append(text)
                pages.append({
                    "page_num": page_num + 1,  # 1-based for human-facing citations
                    "text": text
                })
        # Single join instead of repeated += (quadratic on large documents);
        # preserves the original trailing "\n\n" after the last page.
        full_text = "\n\n".join(page_texts) + ("\n\n" if page_texts else "")
        return full_text, pages
    except Exception as e:
        # Best-effort: log and return empty results rather than crash the pipeline.
        print(f"Error extracting text from PDF {pdf_path}: {e}")
        return "", []
def identify_section_titles(self, text: str) -> List[Dict]:
    """
    Identify potential section titles based on common patterns in clinical documents.

    Args:
        text: Full document text to scan, line by line.

    Returns:
        List of dicts with "section_num" (None for unnumbered headers),
        "section_title", 0-based "line_num", and the raw "text" of the line.
    """
    # Common patterns for section headers in protocols and SAPs
    patterns = [
        # Numbered sections like "1. INTRODUCTION" or "2.3 Statistical Analysis".
        # BUG FIX: the original pattern had no optional dot after the section
        # number, so "1. INTRODUCTION" (dot + space) never matched.
        r'^(\d+(?:\.\d+)*)\.?\s+([A-Z][A-Za-z\s]+)$',
        # ALL CAPS headers like "OBJECTIVES AND ENDPOINTS"
        r'^([A-Z][A-Z\s]{3,})$',
        # Title case headers with optional trailing colon
        r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5}):?$'
    ]
    sections = []
    for line_num, line in enumerate(text.split('\n')):
        line = line.strip()
        if not line:
            continue
        for pattern in patterns:
            matches = re.match(pattern, line)
            if not matches:
                continue
            if len(matches.groups()) > 1:
                # Numbered pattern: first group is the section number.
                section_num, section_title = matches.groups()
            else:
                # Unnumbered pattern: no section number available.
                section_num, section_title = None, matches.group(1)
            sections.append({
                "section_num": section_num,
                "section_title": section_title.strip(),
                "line_num": line_num,
                "text": line
            })
            break  # First matching pattern wins for this line.
    return sections
def split_into_sections(self, full_text: str, filename: str) -> Dict[str, str]:
    """
    Split the full text into logical sections using detected section titles.

    Args:
        full_text: Complete document text.
        filename: Source filename (kept for interface compatibility; unused here).

    Returns:
        Mapping of section title -> section text (header line included).
        Falls back to a single "document" entry when no headers are found.
    """
    lines = full_text.split('\n')
    markers = self.identify_section_titles(full_text)
    # No recognizable headers: treat the whole document as one section.
    if not markers:
        return {"document": full_text}
    markers = sorted(markers, key=lambda marker: marker["line_num"])
    # Pair each marker with the line where the next one starts (or EOF),
    # so each section spans [its header line, next header line).
    boundaries = [marker["line_num"] for marker in markers[1:]] + [len(lines)]
    sections = {}
    for marker, end_line in zip(markers, boundaries):
        body = '\n'.join(lines[marker["line_num"]:end_line])
        sections[marker["section_title"]] = body
    return sections
def chunk_text(self, text: str, metadata: Dict[str, Any],
chunk_size: int = 1000, overlap: int = 200) -> List[Dict]:
"""
Split text into chunks suitable for embedding.
Args:
text: Text to chunk
metadata: Metadata to include with each chunk
chunk_size: Maximum size of each chunk
overlap: Overlap between chunks
Returns:
List of dictionaries with page_content and metadata
"""
text_splitter = RecursiveCharacterTextSplitter(
chunk_size=chunk_size,
chunk_overlap=overlap,
length_function=len,
)
chunks = text_splitter.create_documents(
[text],
metadatas=[metadata]
)
return [{"page_content": chunk.page_content, "metadata": chunk.metadata} for chunk in chunks]
def process_document_for_vector_store(self, pdf_path: str,
                                      document_metadata: Dict[str, Any]) -> List[Dict]:
    """
    Prepare a document for the vector store: extract its text, split it
    into sections, chunk each section, and stamp metadata onto every chunk.

    Args:
        pdf_path: Path to the PDF file.
        document_metadata: Document-level metadata copied into each chunk.

    Returns:
        List of {"page_content", "metadata"} dicts ready for indexing.
    """
    basename = os.path.basename(pdf_path)
    full_text, _pages = self.extract_text_from_pdf(pdf_path)
    sections = self.split_into_sections(full_text, basename)
    all_chunks = []
    for section_name, section_text in sections.items():
        # Copy so per-section keys never leak into the caller's dict.
        per_section = dict(document_metadata)
        per_section["section"] = section_name
        per_section["source"] = basename
        all_chunks.extend(self.chunk_text(section_text, per_section))
    return all_chunks
def extract_tables_from_pdf(self, pdf_path: str) -> List[Dict]:
    """
    Attempt to extract tables from the PDF.

    Not implemented: robust table extraction requires dedicated tooling
    (e.g. Camelot, Tabula, or commercial OCR APIs), so this stub returns
    an empty list and callers treat the document as table-free.

    Returns:
        Empty list (placeholder for future table records with page number
        and content).
    """
    return []
def identify_document_type(self, text: str, filename: str) -> str:
    """
    Identify the document type (Protocol, SAP, etc.) from filename and content.

    Args:
        text: Document text (a leading sample is sufficient).
        filename: Original filename.

    Returns:
        One of "Protocol", "Statistical Analysis Plan",
        "Clinical Study Report", "Investigator Brochure", or "Unknown".
    """
    lower_text = text.lower()
    lower_filename = filename.lower()

    # Filename cues take priority over content cues.
    if "protocol" in lower_filename or "prot" in lower_filename:
        return "Protocol"
    if "sap" in lower_filename or "analysis plan" in lower_filename:
        return "Statistical Analysis Plan"
    if "csr" in lower_filename or "study report" in lower_filename:
        return "Clinical Study Report"
    # BUG FIX: the original tested `"ib" in lower_filename`, which matched the
    # substring "ib" inside unrelated words (e.g. "feasibility", "library")
    # and misclassified those files. Require "ib" to stand alone as a token;
    # the investigator+brochure combination is kept as before.
    if re.search(r'(?<![a-z0-9])ib(?![a-z0-9])', lower_filename) or \
            ("investigator" in lower_filename and "brochure" in lower_filename):
        return "Investigator Brochure"

    # Content cues.
    if "statistical analysis plan" in lower_text:
        return "Statistical Analysis Plan"
    if "clinical study protocol" in lower_text or "study protocol" in lower_text:
        return "Protocol"
    if "clinical study report" in lower_text:
        return "Clinical Study Report"
    if "investigator's brochure" in lower_text or "investigator brochure" in lower_text:
        return "Investigator Brochure"

    # Default when nothing matched.
    return "Unknown"
def extract_protocol_id(self, text: str, filename: str) -> Optional[str]:
    """
    Attempt to extract the protocol ID from document text or filename.

    Args:
        text: Document text; only the first 1000 characters are scanned,
              since IDs usually appear on the title page.
        filename: Fallback source when the text yields nothing.

    Returns:
        The first matching protocol ID string, or None.
    """
    # Ordered from most to least specific ID style.
    id_patterns = (
        r'[Pp]rotocol\s+([A-Z][0-9]{5,}[A-Z0-9]*)',   # e.g. "Protocol B9531002"
        r'([A-Z][0-9]{4,}-[0-9]{3})',                 # e.g. "C5161-001"
        r'([A-Z]{2,5}-[0-9]{2,3}-[0-9]{2,3})',        # e.g. "ABC-123-456"
        r'([A-Z]{2,5}[0-9]{4,6})',                    # e.g. "XYZ12345"
    )
    # Scan the title-page sample first, then fall back to the filename;
    # within each source, try every pattern before moving on.
    for candidate_source in (text[:1000], filename):
        for pattern in id_patterns:
            found = re.search(pattern, candidate_source)
            if found:
                return found.group(1)
    return None
def extract_basic_metadata(self, pdf_path: str) -> Dict[str, Any]:
    """
    Extract lightweight metadata from a PDF without structural parsing.

    Args:
        pdf_path: Path to the PDF on disk.

    Returns:
        Dict with document_id, filename, protocol_id, type, title, and path.
    """
    filename = os.path.basename(pdf_path)
    full_text, _ = self.extract_text_from_pdf(pdf_path)
    # The opening pages carry the title/ID, so a 5000-char sample is plenty.
    sample_text = full_text[:5000]
    protocol_id = self.extract_protocol_id(sample_text, filename)
    doc_type = self.identify_document_type(sample_text, filename)
    # Heuristic title: first reasonably long (but not huge) line near the top.
    title = "Unknown Title"
    for raw_line in sample_text.split('\n'):
        candidate = raw_line.strip()
        if 20 < len(candidate) < 200:
            title = candidate
            break
    return {
        "document_id": os.path.splitext(filename)[0],
        "filename": filename,
        "protocol_id": protocol_id,
        "type": doc_type,
        "title": title,
        "path": pdf_path,
    }
def process_complete_document(self, pdf_path: str) -> Dict[str, Any]:
    """
    Process a complete document: metadata, sections, and vector-store chunks.
    This is the main entry point for document processing.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Dict with "status" ("success" or "error"), plus metadata, sections,
        page_count, chunks and chunk_count on success, or an "error" message.
    """
    basename = os.path.basename(pdf_path)
    results = {
        "status": "success",
        "pdf_path": pdf_path,
        "filename": basename,
    }
    try:
        # 1) Lightweight metadata (id, type, title).
        metadata = self.extract_basic_metadata(pdf_path)
        results["metadata"] = metadata

        # 2) Full text split into named sections.
        full_text, pages = self.extract_text_from_pdf(pdf_path)
        results["sections"] = self.split_into_sections(full_text, basename)
        results["page_count"] = len(pages)

        # 3) Embedding-ready chunks.
        chunks = self.process_document_for_vector_store(pdf_path, metadata)
        results["chunk_count"] = len(chunks)
        results["chunks"] = chunks
    except Exception as e:
        # Report failures in-band so callers always get a uniform result shape.
        results["status"] = "error"
        results["error"] = str(e)
    return results