# bioethics-rag/src/document_processor.py
# Last commit by ciorant (4f2c271): "New regex matching for years and fixed UI issues"
import fitz
import re
from typing import List, Dict
from pathlib import Path
import logging
import PyPDF2
# Root logging is configured at import time so that the INFO-level progress
# messages emitted by this module are visible without extra caller setup.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class DocumentProcessor:
    """Turn PDF files into cleaned text chunks that carry document metadata.

    The pipeline for one document is: extract metadata, extract raw text,
    clean the text, then split it into chunks of roughly ``chunk_size``
    characters where consecutive chunks share ``chunk_overlap`` characters.
    """

    # A space that directly follows a period. Splitting here (instead of on
    # the literal ". ") keeps the period attached to the sentence it ends.
    _SENTENCE_BOUNDARY = re.compile(r"(?<=\.) ")

    # A whole line that starts with "Page N", optionally preceded by dashes
    # (this covers the "--- Page N ---" markers added during extraction).
    _PAGE_LINE = re.compile(
        r"^[ \t]*(?:-+[ \t]*)?Page \d+[^\n]*(?:\n|$)", re.MULTILINE
    )

    # Inline references such as "[Figure 3]" or "[Table 12]".
    _FIGURE_TABLE_REF = re.compile(r"\[Figure \d+\]|\[Table \d+\]")

    # Year patterns, tried in order; group 1 is always the 4-digit year.
    _YEAR_PATTERNS = (
        re.compile(r"\b((?:19|20)\d{2})\b"),    # Basic year
        re.compile(r"©\s*((?:19|20)\d{2})"),    # Copyright year
        re.compile(r"\(((?:19|20)\d{2})\)"),    # Year in parentheses
        re.compile(r"((?:19|20)\d{2})[,.)]"),   # Year followed by , . or )
    )

    # Embedded metadata values that carry no real information.
    _PLACEHOLDER_TITLES = frozenset({"untitled", "unknown"})
    _PLACEHOLDER_AUTHORS = frozenset({"anonymous", "unknown"})

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        """Store the chunking parameters.

        Args:
            chunk_size: Target maximum number of characters per chunk.
            chunk_overlap: Number of trailing characters of a chunk that are
                repeated at the start of the next one. ``0`` disables overlap.
        """
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract the text of every page of a PDF.

        A ``--- Page N ---`` marker line is appended after each page's text.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The concatenated text, or ``""`` if extraction fails (the error
            is logged rather than raised).
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                parts = []
                for page in doc:
                    parts.append(page.get_text())
                    # page.number is 0-indexed
                    parts.append(f"\n--- Page {page.number + 1} ---\n")
                page_count = len(doc)
            finally:
                # Close the document even when a page fails to extract.
                doc.close()
            text = "".join(parts)
            logger.info(
                "Extracted text from %s: %d characters, %d pages",
                pdf_path, len(text), page_count,
            )
            return text
        except Exception as e:
            # Best-effort: a broken PDF yields no text instead of an exception.
            logger.error("Error extracting text from %s: %s", pdf_path, e)
            return ""

    def clean_text(self, text: str) -> str:
        """Normalise whitespace and strip page markers and figure/table refs.

        Args:
            text: Raw text as extracted from a PDF.

        Returns:
            The cleaned text with runs of spaces/tabs collapsed to one space,
            blank lines removed, lines starting with ``Page N`` (optionally
            prefixed by dashes) dropped, and ``[Figure N]`` / ``[Table N]``
            references removed.
        """
        # Remove references first so the gap they leave is collapsed below.
        text = self._FIGURE_TABLE_REF.sub("", text)
        text = re.sub(r"[ \t]+", " ", text)  # collapse spaces/tabs
        # Only whole header/footer lines are removed; "Page N" in the middle
        # of a sentence is ordinary prose and is kept.
        text = self._PAGE_LINE.sub("", text)
        # Keep single newlines; also drops lines that contain only spaces.
        text = re.sub(r"\n(?:[ \t]*\n)+", "\n", text)
        return text.strip()

    @staticmethod
    def _append_chunk(chunks: List[Dict], body: str, metadata: Dict = None) -> None:
        """Append a chunk record for ``body`` unless it is blank."""
        body = body.strip()
        if not body:
            return
        chunks.append({
            "text": body,
            # Each chunk gets its own copy so that editing one chunk's
            # metadata does not silently change every other chunk.
            "metadata": dict(metadata) if metadata else {},
            "chunk_id": len(chunks),
        })

    def chunk_text(self, text: str, metadata: Dict = None) -> List[Dict]:
        """Split text into overlapping, sentence-aligned chunks.

        Sentences are delimited by a period followed by a space. Sentences
        are packed into a chunk until adding the next one would exceed
        ``self.chunk_size``; a single sentence longer than that still
        becomes one (oversized) chunk.

        Args:
            text: The text to split.
            metadata: Optional metadata copied into every chunk.

        Returns:
            A list of dicts with keys ``"text"``, ``"metadata"`` and
            ``"chunk_id"`` (0-based position in the returned list). Empty
            input yields an empty list.
        """
        if not text:
            return []

        sentences = self._SENTENCE_BOUNDARY.split(text)
        # A non-positive overlap means "no overlap". Note that ``s[-0:]`` is
        # the whole string, so zero must be handled explicitly.
        overlap = max(self.chunk_overlap, 0)

        chunks: List[Dict] = []
        current = ""
        for sentence in sentences:
            if not current:
                current = sentence
                continue
            # +1 accounts for the space that joins the two pieces.
            if len(current) + 1 + len(sentence) > self.chunk_size:
                self._append_chunk(chunks, current, metadata)
                tail = current[-overlap:] if overlap > 0 else ""
                current = f"{tail} {sentence}" if tail else sentence
            else:
                current = f"{current} {sentence}"

        # Add final chunk
        if current:
            self._append_chunk(chunks, current, metadata)

        logger.info("Created %d chunks", len(chunks))
        return chunks

    @staticmethod
    def _meta_text(value) -> str:
        """Return an embedded metadata value as stripped text ('' if absent)."""
        return "" if value is None else str(value).strip()

    @staticmethod
    def _first_page_text(reader) -> str:
        """Return the text of the first page, or '' if it cannot be read."""
        try:
            pages = reader.pages
            if len(pages) == 0:
                return ""
            return pages[0].extract_text() or ""
        except Exception as e:
            # Best-effort fallback: metadata defaults are used instead.
            logger.warning("Could not read first page for metadata: %s", e)
            return ""

    @classmethod
    def _find_year(cls, text: str):
        """Return the first 4-digit year (19xx/20xx) found in text, or None."""
        for pattern in cls._YEAR_PATTERNS:
            match = pattern.search(text)
            if match:
                return match.group(1)
        return None

    def extract_metadata(self, pdf_path: str) -> dict:
        """Extract metadata (title, authors, year, filename, file_size) from a PDF.

        Embedded PDF metadata is preferred for title and authors; missing
        values fall back to crude heuristics on the first page's text. The
        year is always searched for in the first page's text.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            A dict with keys ``"filename"``, ``"file_size"``, ``"title"``
            (``"Unknown Title"`` if not found), ``"authors"`` (``None`` if
            not found) and ``"year"`` (a 4-digit string, or ``"n.d."``).

        Raises:
            OSError: If the file cannot be stat'ed or opened.
        """
        path = Path(pdf_path)
        metadata = {
            "filename": path.name,
            "file_size": path.stat().st_size,
            "title": None,
            "authors": None,
            "year": None,
        }

        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)

            # 1. Try embedded PDF metadata
            pdf_meta = reader.metadata
            if pdf_meta:
                title = self._meta_text(pdf_meta.get("/Title"))
                author = self._meta_text(pdf_meta.get("/Author"))
                if title and title.lower() not in self._PLACEHOLDER_TITLES:
                    metadata["title"] = title
                if author and author.lower() not in self._PLACEHOLDER_AUTHORS:
                    metadata["authors"] = author

            # 2. Read the first page while the file is still open.
            first_page = self._first_page_text(reader)

        lines = [line.strip() for line in first_page.split("\n") if line.strip()]

        # crude heuristic: first line = title
        if not metadata["title"] and lines:
            metadata["title"] = lines[0]

        # crude heuristic: authors in the line after the title
        if not metadata["authors"] and len(lines) > 1:
            possible_authors = lines[1]
            if re.search(r"[A-Z][a-z]+(?: [A-Z][a-z]+)*", possible_authors):
                metadata["authors"] = possible_authors

        # crude heuristic: find year (e.g., 2023, 2024). This runs even when
        # title and authors came from embedded metadata, because embedded
        # metadata never supplies the year here.
        metadata["year"] = self._find_year(first_page)

        # Defaults if missing
        metadata["title"] = metadata["title"] or "Unknown Title"
        metadata["authors"] = metadata["authors"] or None
        metadata["year"] = metadata["year"] or "n.d."
        return metadata

    def process_document(self, pdf_path: str) -> List[Dict]:
        """Run the complete pipeline for one PDF.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            The list of chunk dicts produced by ``chunk_text``; empty if no
            text could be extracted.

        Raises:
            TypeError: If ``pdf_path`` is not a valid path type.
            OSError: Propagated from ``extract_metadata`` when the file
                cannot be accessed.
        """
        try:
            # Constructing a Path only validates the argument type; it does
            # not touch the filesystem.
            Path(pdf_path)
        except TypeError as e:
            logger.error("Invalid path type: %s: %s", pdf_path, e)
            raise

        metadata = self.extract_metadata(pdf_path)
        raw_text = self.extract_text_from_pdf(pdf_path)
        clean_text = self.clean_text(raw_text)
        chunks = self.chunk_text(clean_text, metadata)
        logger.info("Processed %s: %d chunks created", pdf_path, len(chunks))
        return chunks

    def process_documents(self, pdf_paths: List[str]) -> List[Dict]:
        """Process several PDFs and return all their chunks in one flat list.

        Chunk ids restart at 0 for every document. An exception raised
        while processing any document propagates to the caller.

        Args:
            pdf_paths: Paths of the PDF files, processed in order.

        Returns:
            The concatenated chunk lists of all documents.
        """
        documents: List[Dict] = []
        for path in pdf_paths:
            documents.extend(self.process_document(path))
        return documents