Spaces:
Runtime error
Runtime error
File size: 13,240 Bytes
46eb9e8 88bb2e2 46eb9e8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 |
"""
PDF processing utilities for extracting text, sections, and structured data from clinical documents.
"""
import os
import re
import fitz # PyMuPDF
from typing import Dict, List, Tuple, Optional, Any
import json
from collections import defaultdict
from langchain.text_splitter import RecursiveCharacterTextSplitter
class PDFProcessor:
    """Extract text, sections, and structured metadata from clinical PDF documents.

    Pipeline: save an uploaded file -> extract text with PyMuPDF -> detect
    section headings -> split into sections -> chunk each section for a
    vector store. Also provides lightweight heuristics for document type
    (Protocol / SAP / CSR / IB) and protocol-ID extraction.
    """

    # Heading patterns common in protocols and SAPs, compiled once at class
    # creation instead of re-parsing the regexes for every scanned line.
    _SECTION_PATTERNS = [
        # Numbered headings such as "2.3 Statistical Analysis".
        re.compile(r'^(\d+(?:\.\d+)*)\s+([A-Z][A-Za-z\s]+)$'),
        # ALL-CAPS headings such as "OBJECTIVES AND ENDPOINTS".
        re.compile(r'^([A-Z][A-Z\s]{3,})$'),
        # Title Case headings with an optional trailing colon.
        re.compile(r'^([A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,5}):?$'),
    ]

    # Candidate protocol-ID formats, tried in order (most specific first).
    _PROTOCOL_ID_PATTERNS = [
        re.compile(r'[Pp]rotocol\s+([A-Z][0-9]{5,}[A-Z0-9]*)'),  # "Protocol B9531002"
        re.compile(r'([A-Z][0-9]{4,}-[0-9]{3})'),                # "C5161-001"
        re.compile(r'([A-Z]{2,5}-[0-9]{2,3}-[0-9]{2,3})'),       # "ABC-123-456"
        re.compile(r'([A-Z]{2,5}[0-9]{4,6})'),                   # "XYZ12345"
    ]

    # "ib" only as a standalone token: a bare substring test would classify
    # e.g. "feasibility.pdf" or "library.pdf" as an Investigator Brochure.
    _IB_FILENAME_RE = re.compile(r'(?<![a-z0-9])ib(?![a-z0-9])')

    def __init__(self, upload_dir="./data/uploads"):
        """Initialize with the directory for uploaded PDFs (created if missing)."""
        self.upload_dir = upload_dir
        os.makedirs(upload_dir, exist_ok=True)

    def save_uploaded_file(self, uploaded_file) -> str:
        """Save an uploaded file to disk and return its path.

        Args:
            uploaded_file: File-like upload object exposing ``.name`` and
                ``.getbuffer()`` (e.g. a Streamlit ``UploadedFile``).

        Returns:
            Absolute or relative path of the written file inside ``upload_dir``.
        """
        # Reduce the client-supplied name to its basename so a crafted name
        # like "../../etc/x" cannot escape the upload directory.
        safe_name = os.path.basename(uploaded_file.name)
        file_path = os.path.join(self.upload_dir, safe_name)
        with open(file_path, "wb") as f:
            f.write(uploaded_file.getbuffer())
        return file_path

    def extract_text_from_pdf(self, pdf_path: str) -> Tuple[str, List[Dict]]:
        """Extract text from a PDF, page by page.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Tuple of:
                - Full text string (pages joined with blank lines)
                - List of dicts with 1-based ``page_num`` and ``text``
            On any extraction failure, returns ``("", [])`` (best-effort by
            design — callers treat an unreadable PDF as empty).
        """
        try:
            page_texts: List[str] = []
            pages: List[Dict] = []
            # Context manager guarantees the document handle is closed even
            # if get_text() raises mid-way (the original leaked it on error).
            with fitz.open(pdf_path) as doc:
                for page_index, page in enumerate(doc):
                    text = page.get_text()
                    page_texts.append(text)
                    pages.append({
                        "page_num": page_index + 1,
                        "text": text,
                    })
            # join is O(total length); repeated += would be quadratic.
            full_text = "".join(t + "\n\n" for t in page_texts)
            return full_text, pages
        except Exception as e:
            # Deliberate broad catch: a corrupt upload must not crash the app.
            print(f"Error extracting text from PDF {pdf_path}: {e}")
            return "", []

    def identify_section_titles(self, text: str) -> List[Dict]:
        """Identify potential section headings via common clinical-doc patterns.

        Args:
            text: Full document text.

        Returns:
            List of dicts with ``section_num`` (None for unnumbered headings),
            ``section_title``, ``line_num`` (0-based line index into ``text``
            split on newlines), and the raw heading ``text``.
        """
        sections = []
        for line_num, raw_line in enumerate(text.split('\n')):
            line = raw_line.strip()
            if not line:
                continue
            for pattern in self._SECTION_PATTERNS:
                match = pattern.match(line)
                if not match:
                    continue
                if len(match.groups()) > 1:
                    # Numbered heading: (number, title).
                    section_num, section_title = match.groups()
                else:
                    # Unnumbered heading: single title group.
                    section_num, section_title = None, match.group(1)
                sections.append({
                    "section_num": section_num,
                    "section_title": section_title.strip(),
                    "line_num": line_num,
                    "text": line,
                })
                break  # first matching pattern wins for this line
        return sections

    def split_into_sections(self, full_text: str, filename: str) -> Dict[str, str]:
        """Split the full text into sections keyed by detected heading titles.

        Args:
            full_text: Complete document text.
            filename: Source filename (unused; kept for interface
                compatibility with existing callers).

        Returns:
            Dict mapping section title -> section text. Falls back to
            ``{"document": full_text}`` when no headings are found.
        """
        lines = full_text.split('\n')
        markers = self.identify_section_titles(full_text)
        if not markers:
            return {"document": full_text}

        markers.sort(key=lambda m: m["line_num"])
        sections: Dict[str, str] = {}

        # BUG FIX: preserve front matter before the first heading — the
        # original silently dropped everything above the first marker.
        first_line = markers[0]["line_num"]
        if first_line > 0:
            preamble = '\n'.join(lines[:first_line])
            if preamble.strip():
                sections["preamble"] = preamble

        for i, marker in enumerate(markers):
            start = marker["line_num"]
            # Section runs until the next heading, or end of document.
            end = markers[i + 1]["line_num"] if i + 1 < len(markers) else len(lines)
            body = '\n'.join(lines[start:end])
            title = marker["section_title"]
            # BUG FIX: merge repeated titles — dict assignment used to
            # overwrite (and lose) the earlier occurrence's text.
            if title in sections:
                sections[title] += '\n' + body
            else:
                sections[title] = body
        return sections

    def chunk_text(self, text: str, metadata: Dict[str, Any],
                   chunk_size: int = 1000, overlap: int = 200) -> List[Dict]:
        """Split text into overlapping chunks suitable for embedding.

        Args:
            text: Text to chunk.
            metadata: Metadata attached to every chunk.
            chunk_size: Maximum characters per chunk.
            overlap: Character overlap between consecutive chunks.

        Returns:
            List of ``{"page_content": str, "metadata": dict}`` dicts.
        """
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=overlap,
            length_function=len,
        )
        chunks = text_splitter.create_documents([text], metadatas=[metadata])
        return [
            {"page_content": chunk.page_content, "metadata": chunk.metadata}
            for chunk in chunks
        ]

    def process_document_for_vector_store(self, pdf_path: str,
                                          document_metadata: Dict[str, Any]) -> List[Dict]:
        """Extract, section, and chunk a document for vector-store ingestion.

        Args:
            pdf_path: Path to the PDF file.
            document_metadata: Base metadata copied into every chunk.

        Returns:
            List of chunk dicts ready for the vector store, each tagged with
            its ``section`` title and ``source`` filename.
        """
        full_text, _pages = self.extract_text_from_pdf(pdf_path)
        source_name = os.path.basename(pdf_path)
        sections = self.split_into_sections(full_text, source_name)

        all_chunks: List[Dict] = []
        for section_name, section_text in sections.items():
            # Copy so per-section keys never leak into the caller's dict.
            section_metadata = document_metadata.copy()
            section_metadata.update({
                "section": section_name,
                "source": source_name,
            })
            all_chunks.extend(self.chunk_text(section_text, section_metadata))
        return all_chunks

    def extract_tables_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Attempt to extract tables from the PDF (not yet implemented).

        Table extraction from PDFs is complex and usually needs specialized
        tooling (Camelot, Tabula, OCR, or commercial APIs).

        Returns:
            Empty list — placeholder until a real implementation exists.
        """
        return []

    def identify_document_type(self, text: str, filename: str) -> str:
        """Classify the document type from filename patterns, then content.

        Args:
            text: Sample of the document text.
            filename: Source filename.

        Returns:
            One of "Protocol", "Statistical Analysis Plan",
            "Clinical Study Report", "Investigator Brochure", or "Unknown".
        """
        lower_text = text.lower()
        lower_filename = filename.lower()

        # Filename hints take priority over content.
        # NOTE(review): "prot" is a broad substring (matches e.g. "protein");
        # kept for backward compatibility — confirm before tightening.
        if "protocol" in lower_filename or "prot" in lower_filename:
            return "Protocol"
        if "sap" in lower_filename or "analysis plan" in lower_filename:
            return "Statistical Analysis Plan"
        if "csr" in lower_filename or "study report" in lower_filename:
            return "Clinical Study Report"
        # BUG FIX: the original relied on `or`/`and` precedence with a bare
        # `"ib" in filename` substring test, so any filename containing the
        # letters "ib" (e.g. "feasibility.pdf") was classified as an IB.
        if self._IB_FILENAME_RE.search(lower_filename) or (
            "investigator" in lower_filename and "brochure" in lower_filename
        ):
            return "Investigator Brochure"

        # Fall back to characteristic phrases in the text.
        if "statistical analysis plan" in lower_text:
            return "Statistical Analysis Plan"
        if "clinical study protocol" in lower_text or "study protocol" in lower_text:
            return "Protocol"
        if "clinical study report" in lower_text:
            return "Clinical Study Report"
        if "investigator's brochure" in lower_text or "investigator brochure" in lower_text:
            return "Investigator Brochure"

        return "Unknown"

    def extract_protocol_id(self, text: str, filename: str) -> Optional[str]:
        """Extract a protocol ID from the document text or filename.

        Args:
            text: Document text (only the first 1000 chars are scanned —
                protocol IDs typically appear on the title page).
            filename: Source filename, checked as a fallback.

        Returns:
            The first matching protocol ID, or None if nothing matches.
        """
        sample_text = text[:1000]
        for candidate in (sample_text, filename):
            for pattern in self._PROTOCOL_ID_PATTERNS:
                match = pattern.search(candidate)
                if match:
                    return match.group(1)
        return None

    def extract_basic_metadata(self, pdf_path: str) -> Dict[str, Any]:
        """Extract basic document metadata without detailed structure parsing.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with ``document_id``, ``filename``, ``protocol_id``,
            ``type``, ``title``, and ``path``.
        """
        filename = os.path.basename(pdf_path)
        full_text, _ = self.extract_text_from_pdf(pdf_path)

        # Only the head of the document — titles/IDs live on the first pages.
        sample_text = full_text[:5000]

        protocol_id = self.extract_protocol_id(sample_text, filename)
        doc_type = self.identify_document_type(sample_text, filename)

        # Heuristic title: first line of plausible title length (21-199 chars).
        title = "Unknown Title"
        for line in sample_text.split('\n'):
            stripped = line.strip()
            if 20 < len(stripped) < 200:
                title = stripped
                break

        return {
            "document_id": os.path.splitext(filename)[0],
            "filename": filename,
            "protocol_id": protocol_id,
            "type": doc_type,
            "title": title,
            "path": pdf_path,
        }

    def process_complete_document(self, pdf_path: str) -> Dict[str, Any]:
        """Process a document end-to-end: metadata, sections, and chunks.

        Main entry point for document processing.

        Args:
            pdf_path: Path to the PDF file.

        Returns:
            Dict with ``status`` ("success"/"error"), ``metadata``,
            ``sections``, ``page_count``, ``chunk_count``, and ``chunks``;
            on failure, ``status`` is "error" and ``error`` holds the message.
        """
        results: Dict[str, Any] = {
            "status": "success",
            "pdf_path": pdf_path,
            "filename": os.path.basename(pdf_path),
        }
        try:
            # Step 1: basic metadata (type, protocol ID, title).
            metadata = self.extract_basic_metadata(pdf_path)
            results["metadata"] = metadata

            # Step 2: full text and section split.
            # NOTE(review): the PDF is re-parsed here and again in step 3;
            # acceptable for moderate files — TODO: thread full_text through.
            full_text, pages = self.extract_text_from_pdf(pdf_path)
            sections = self.split_into_sections(full_text, os.path.basename(pdf_path))
            results["sections"] = sections
            results["page_count"] = len(pages)

            # Step 3: chunks for the vector store.
            chunks = self.process_document_for_vector_store(pdf_path, metadata)
            results["chunk_count"] = len(chunks)
            results["chunks"] = chunks
            return results
        except Exception as e:
            results["status"] = "error"
            results["error"] = str(e)
            return results