File size: 6,851 Bytes
90ed798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4f2c271
 
 
 
 
 
 
 
 
 
 
 
 
 
90ed798
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
import fitz
import re
from typing import List, Dict
from pathlib import Path
import logging
import PyPDF2

# Set up root logging at INFO level when this module is imported.
# NOTE(review): this is an import-time side effect on the root logger; per the
# logging docs it does nothing if the root logger already has handlers —
# confirm that configuring logging from a library module is intended.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module, used by DocumentProcessor.
logger = logging.getLogger(__name__)

class DocumentProcessor:
    def __init__(self, chunk_size: int = 1000, chunk_overlap: int = 200):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def extract_text_from_pdf(self, pdf_path: str) -> str:
        """Extract the text of every page of a PDF.

        The text of each page is followed by a marker line of the form
        ``--- Page N ---`` (``N`` is 1-based).

        Args:
            pdf_path: Path of the PDF file to open.

        Returns:
            The concatenated page texts with their page markers, or an empty
            string if opening or reading the document fails. Failures are
            logged and never raised, so callers get best-effort behaviour.
        """
        try:
            doc = fitz.open(pdf_path)
            try:
                # Collect fragments and join once instead of growing a string
                # with += on every page.
                parts = []
                for page in doc:
                    parts.append(page.get_text())
                    # page.number is 0-indexed
                    parts.append(f"\n--- Page {page.number + 1} ---\n")
                text = "".join(parts)

                logger.info(f"Extracted text from {pdf_path}: {len(text)} characters, {len(doc)} pages")
            finally:
                # Release the document handle even when a page fails to read;
                # previously close() was skipped on any error.
                doc.close()
            return text

        except Exception as e:
            logger.error(f"Error extracting text from {pdf_path}: {e}")
            return ""

    def clean_text(self, text: str) -> str:
        """Clean text extracted from a PDF.

        Steps, in order:
          1. Drop the ``--- Page N ---`` marker lines that
             ``extract_text_from_pdf`` appends after every page.
          2. Drop lines that start with ``Page <number>`` (page
             headers/footers). Only whole lines are removed, so a mention
             such as "see Page 3" inside a sentence is kept.
          3. Drop ``[Figure N]`` / ``[Table N]`` references.
          4. Collapse runs of spaces/tabs into one space and runs of
             newlines into one newline.

        Args:
            text: Raw text to clean.

        Returns:
            The cleaned text with leading/trailing whitespace stripped.
        """
        # Remove the whole marker line; matching only from "Page" onward
        # would leave a dangling "--- " glued to the next page's text.
        text = re.sub(r'^[ \t]*--- Page \d+ ---[ \t]*\n?', '', text, flags=re.MULTILINE)

        # Remove page headers/footers (anchored to line start so body text
        # that merely mentions a page number is not truncated).
        text = re.sub(r'^[ \t]*Page \d+[^\n]*\n?', '', text, flags=re.MULTILINE)

        # Remove references to figures/tables
        text = re.sub(r'\[Figure \d+\]|\[Table \d+\]', '', text)

        # Normalise whitespace last so gaps left by the removals above are
        # collapsed as well.
        text = re.sub(r'[ \t]+', ' ', text)  # collapse spaces/tabs
        text = re.sub(r'\n{2,}', '\n', text)  # keep single newlines

        return text.strip()

    def chunk_text(self,text: str, metadata: Dict = None) -> List[Dict]:
        """Split text into chunks with metadata"""
        if not text:
            return []

        sentences = text.split('. ')
        chunks = []
        current_chunk = ""

        for sentence in sentences:
            # If adding this sentence would exceed chunk size
            if len(current_chunk) + len(sentence) > self.chunk_size:
                if current_chunk:
                    chunks.append({
                        "text": current_chunk.strip(),
                        "metadata": metadata or {},
                        "chunk_id": len(chunks)
                    })

                    # Start new chunk with overlap
                    overlap_text = current_chunk[-self.chunk_overlap:] if len(
                        current_chunk) > self.chunk_overlap else current_chunk
                    current_chunk = overlap_text + " " + sentence
                else:
                    current_chunk = sentence
            else:
                current_chunk += ". " + sentence if current_chunk else sentence

            # Add final chunk
        if current_chunk:
            chunks.append({
                "text": current_chunk.strip(),
                "metadata": metadata or {},
                "chunk_id": len(chunks)
            })

        logger.info(f"Created {len(chunks)} chunks")
        return chunks

    def extract_metadata(self, pdf_path: str) -> dict:
        """Extract metadata (title, authors, year, filename, file_size) from a PDF.

        Title and authors come from the embedded document information when it
        holds usable strings; otherwise they are guessed from the first page
        (first non-empty line = title, second = authors). The year is always
        guessed from the first page text, since it has no embedded source
        here.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            A dict with keys ``"filename"``, ``"file_size"``, ``"title"``
            (``"Unknown Title"`` when not found), ``"authors"`` (``None`` when
            not found) and ``"year"`` (a 4-digit string, or ``"n.d."``).

        Raises:
            OSError: If the file cannot be stat'ed or opened (for example
                ``FileNotFoundError``). Errors raised by ``PyPDF2.PdfReader``
                itself also propagate.
        """
        path = Path(pdf_path)
        metadata = {
            "filename": path.name,
            "file_size": path.stat().st_size,
            "title": None,
            "authors": None,
            "year": None,
        }

        def as_text(value) -> str:
            """Return value stripped if it is a string, else an empty string."""
            # NOTE(review): embedded entries are presumably not always plain
            # strings (null / unresolved objects); calling .strip() on those
            # would raise, so anything that is not a str is ignored.
            return value.strip() if isinstance(value, str) else ""

        with open(pdf_path, "rb") as f:
            reader = PyPDF2.PdfReader(f)

            # 1. Try embedded PDF metadata
            pdf_meta = reader.metadata
            if pdf_meta:
                title = as_text(pdf_meta.get("/Title"))
                author = as_text(pdf_meta.get("/Author"))

                if title and title.lower() not in ("untitled", "unknown"):
                    metadata["title"] = title

                if author and author.lower() not in ("anonymous", "unknown"):
                    metadata["authors"] = author

            # 2. Read the first page. This is done unconditionally: the year
            # heuristic below needs it even when title and authors were found
            # in the embedded metadata.
            try:
                first_page = reader.pages[0].extract_text() or ""
            except Exception as e:
                # Best effort: a page that cannot be read only disables the
                # heuristics, but the failure is no longer silent.
                logger.warning("Could not read first page of %s for metadata: %s", pdf_path, e)
                first_page = ""

        lines = [line.strip() for line in first_page.split("\n") if line.strip()]

        # crude heuristic: first line = title
        if not metadata["title"] and lines:
            metadata["title"] = lines[0]

        # crude heuristic: authors in line(s) after title
        if not metadata["authors"] and len(lines) > 1:
            possible_authors = lines[1]
            if re.search(r"[A-Z][a-z]+(?: [A-Z][a-z]+)*", possible_authors):
                metadata["authors"] = possible_authors

        # crude heuristic: find year (e.g., 2023, 2024); patterns are tried
        # in order and the first one that matches wins.
        year_patterns = (
            r"\b(19|20)\d{2}\b",  # Basic year
            r"©\s*(19|20)\d{2}",  # Copyright year
            r"\((19|20)\d{2}\)",  # Year in parentheses
            r"(19|20)\d{2}[,.)]",  # Year followed by comma/period
        )
        for pattern in year_patterns:
            year_match = re.search(pattern, first_page)
            if year_match:
                # Strip the surrounding punctuation, keep the 4 digits only.
                year_text = re.search(r"(19|20)\d{2}", year_match.group(0))
                if year_text:
                    metadata["year"] = year_text.group(0)
                    break

        # Defaults if missing
        metadata["title"] = metadata["title"] or "Unknown Title"
        metadata["authors"] = metadata["authors"] or None
        metadata["year"] = metadata["year"] or "n.d."

        return metadata

    def process_document(self,pdf_path: str) -> List[Dict]:
        """Complete document processing"""
        try:
            file_path = Path(pdf_path)

        except TypeError as e:  # Catches specifically if pdf_path is the wrong type
            logger.error(f"Invalid path type: {pdf_path}: {e}")
            raise
        except OSError as e:  # Catches other filesystem-related errors
            logger.error(f"OS error with path: {pdf_path}: {e}")
            raise

        metadata=self.extract_metadata(pdf_path)

        raw_text = self.extract_text_from_pdf(pdf_path)
        clean_text = self.clean_text(raw_text)
        chunks = self.chunk_text(clean_text, metadata)
        logger.info(f"Processed {pdf_path}: {len(chunks)} chunks created")
        return chunks

    def process_documents(self, pdf_paths: List[str]) -> List[Dict]:
        documents = []
        for path in pdf_paths:
            documents.extend(self.process_document(path))
        return documents