# Document processing utilities: PDF text extraction, cleaning, and chunking.
import fitz  # PyMuPDF: per-page text extraction
import re
from typing import List, Dict
from pathlib import Path
import logging
import PyPDF2  # used only for metadata extraction (title/author/year)
# NOTE(review): configuring the root logger at import time affects the host
# application's logging setup — confirm this is intended for library use.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class DocumentProcessor:
    """Extracts, cleans, and chunks text from PDF files.

    Pipeline (see process_document): extract_metadata -> extract_text_from_pdf
    -> clean_text -> chunk_text.
    """

    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        # chunk_size: approximate maximum characters per chunk
        self.chunk_size = chunk_size
        # chunk_overlap: trailing characters of a closed chunk carried into the next
        self.chunk_overlap = chunk_overlap
def extract_text_from_pdf(self, pdf_path: str) -> str:
    """Extract plain text from every page of a PDF.

    A ``--- Page N ---`` marker is appended after each page's text so page
    boundaries survive into later processing steps.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The concatenated page text, or "" on failure (best-effort: errors
        are logged, never raised).
    """
    try:
        # "with" guarantees the document is closed even if a page read
        # raises (the original leaked the handle on mid-iteration errors).
        with fitz.open(pdf_path) as doc:
            parts = []
            for page in doc:
                parts.append(page.get_text())
                parts.append(f"\n--- Page {page.number + 1} ---\n")  # page.number is 0-indexed
            # Join once instead of quadratic "+=" in the loop.
            text = "".join(parts)
            logger.info(f"Extracted text from {pdf_path}: {len(text)} characters, {len(doc)} pages")
        return text
    except Exception as e:
        logger.error(f"Error extracting text from {pdf_path}: {e}")
        return ""
def clean_text(self, text: str) -> str:
    """Normalize raw PDF text.

    Removes figure/table references and page header/footer lines (including
    the ``--- Page N ---`` markers added by extract_text_from_pdf), then
    collapses runs of whitespace.

    Args:
        text: Raw text, typically from extract_text_from_pdf.

    Returns:
        The cleaned, stripped text.
    """
    # Drop inline figure/table references first so the whitespace collapse
    # below absorbs the gaps they leave (the old order ran the collapse
    # first and left double spaces in place of each removed reference).
    text = re.sub(r'\[Figure \d+\]|\[Table \d+\]', '', text)
    # Remove page header/footer lines; the optional leading dashes also
    # consume this class's own "--- Page N ---" markers instead of leaving
    # orphan "--- " fragments behind.
    text = re.sub(r'(?:-{2,}\s*)?Page \d+.*?\n', '', text)
    text = re.sub(r'[ \t]+', ' ', text)   # collapse spaces/tabs
    text = re.sub(r'\n{2,}', '\n', text)  # keep single newlines
    return text.strip()
def chunk_text(self, text: str, metadata: Dict = None) -> List[Dict]:
    """Split text into overlapping chunks.

    Sentences (split on ". ") are greedily packed into chunks of at most
    ~chunk_size characters; when a chunk is closed, its last chunk_overlap
    characters seed the next chunk so context carries across boundaries.

    Args:
        text: The (cleaned) document text.
        metadata: Optional metadata dict attached to every chunk.

    Returns:
        A list of {"text", "metadata", "chunk_id"} dicts; [] for empty input.
    """
    if not text:
        return []

    chunks: List[Dict] = []
    current = ""

    def close_current():
        # Snapshot the working buffer as the next chunk.
        chunks.append({
            "text": current.strip(),
            "metadata": metadata or {},
            "chunk_id": len(chunks),
        })

    for sentence in text.split('. '):
        if len(current) + len(sentence) <= self.chunk_size:
            # Still room: append, restoring the ". " separator lost by split.
            current = f"{current}. {sentence}" if current else sentence
            continue
        if not current:
            # A single oversized sentence becomes its own chunk.
            current = sentence
            continue
        close_current()
        # Seed the next chunk with the tail of the one just closed.
        tail = current if len(current) <= self.chunk_overlap else current[-self.chunk_overlap:]
        current = tail + " " + sentence

    if current:
        close_current()
    logger.info(f"Created {len(chunks)} chunks")
    return chunks
def extract_metadata(self, pdf_path: str) -> dict:
    """Extract metadata (title, authors, year, filename, file_size) from a PDF.

    Embedded PDF metadata is preferred; missing fields fall back to crude
    first-page heuristics. Fields that cannot be determined get defaults:
    title "Unknown Title", authors None, year "n.d.".

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        Dict with keys: filename, file_size, title, authors, year.

    Raises:
        OSError: If the file does not exist / cannot be stat'ed.
    """
    metadata = {
        "filename": Path(pdf_path).name,
        "file_size": Path(pdf_path).stat().st_size,
        "title": None,
        "authors": None,
        "year": None
    }
    try:
        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)
            # 1. Try embedded PDF metadata. Keys may exist with a None
            # value, so coalesce before strip() (the old code crashed
            # with AttributeError on None).
            pdf_meta = reader.metadata
            if pdf_meta:
                title = (pdf_meta.get("/Title") or "").strip()
                author = (pdf_meta.get("/Author") or "").strip()
                # Empty strings are already excluded by the truthiness check.
                if title and title.lower() not in ["untitled", "unknown"]:
                    metadata["title"] = title
                if author and author.lower() not in ["anonymous", "unknown"]:
                    metadata["authors"] = author
            # 2. Fallback: first-page heuristics (best-effort only).
            if not metadata["title"] or not metadata["authors"]:
                try:
                    first_page = reader.pages[0].extract_text() or ""
                    lines = [line.strip() for line in first_page.split("\n") if line.strip()]
                    # Crude heuristic: first non-empty line = title.
                    if not metadata["title"] and lines:
                        metadata["title"] = lines[0]
                    # Crude heuristic: authors in the line after the title.
                    if not metadata["authors"] and len(lines) > 1:
                        possible_authors = lines[1]
                        if re.search(r"[A-Z][a-z]+(?: [A-Z][a-z]+)*", possible_authors):
                            metadata["authors"] = possible_authors
                    # Crude heuristic: first plausible 4-digit year.
                    year_patterns = [
                        r"\b(19|20)\d{2}\b",  # Basic year
                        r"©\s*(19|20)\d{2}",  # Copyright year
                        r"\((19|20)\d{2}\)",  # Year in parentheses
                        r"(19|20)\d{2}[,.)]",  # Year followed by comma/period
                    ]
                    for pattern in year_patterns:
                        year_match = re.search(pattern, first_page)
                        if year_match:
                            # Strip the surrounding punctuation the pattern matched.
                            year_text = re.search(r"(19|20)\d{2}", year_match.group(0))
                            if year_text:
                                metadata["year"] = year_text.group(0)
                                break
                except Exception:
                    pass  # heuristics must never break metadata extraction
    except Exception as e:
        # Match extract_text_from_pdf's best-effort contract: an unreadable
        # PDF yields default metadata instead of crashing the pipeline.
        logger.error(f"Error reading metadata from {pdf_path}: {e}")
    # Defaults for anything still missing.
    metadata["title"] = metadata["title"] or "Unknown Title"
    metadata["year"] = metadata["year"] or "n.d."
    return metadata
def process_document(self, pdf_path: str) -> List[Dict]:
    """Run the full pipeline for one PDF: metadata -> text -> clean -> chunk.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The list of chunk dicts produced by chunk_text (may be empty if
        extraction failed).

    Raises:
        TypeError: If pdf_path is not a str/os.PathLike.
    """
    try:
        # Fail fast on a non-path argument before any I/O happens.
        # (Path() construction does no filesystem access, so the old
        # "except OSError" branch here was dead code; filesystem errors
        # surface from extract_metadata's stat() instead.)
        Path(pdf_path)
    except TypeError as e:
        logger.error(f"Invalid path type: {pdf_path}: {e}")
        raise
    metadata = self.extract_metadata(pdf_path)
    raw_text = self.extract_text_from_pdf(pdf_path)
    # Renamed from "clean_text" to avoid shadowing the method of the same name.
    cleaned = self.clean_text(raw_text)
    chunks = self.chunk_text(cleaned, metadata)
    logger.info(f"Processed {pdf_path}: {len(chunks)} chunks created")
    return chunks
def process_documents(self, pdf_paths: List[str]) -> List[Dict]:
    """Process several PDFs and return all their chunks as one flat list."""
    return [
        chunk
        for path in pdf_paths
        for chunk in self.process_document(path)
    ]