File size: 4,561 Bytes
7791c01
 
 
 
3806245
7791c01
3806245
7791c01
 
 
 
3806245
 
 
 
7791c01
3806245
7791c01
 
 
3806245
 
7791c01
3806245
7791c01
 
 
3806245
7791c01
 
 
 
 
 
 
 
3806245
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7791c01
 
3806245
7791c01
3806245
 
 
7791c01
3806245
 
 
 
 
7791c01
 
 
3806245
 
 
 
 
 
 
7791c01
 
 
 
 
 
 
 
3806245
7791c01
 
 
 
 
 
 
 
 
3806245
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import re
from io import BytesIO
from pathlib import Path
from typing import Dict, List
from urllib.parse import urlparse

import docx
import PyPDF2
import requests

class AdvancedDocumentProcessor:
    """Extract text, lightweight metadata and overlapping chunks from documents.

    PDF files are read with PyPDF2, DOCX files with python-docx, and anything
    else is treated as plain text.  Documents can come from a URL
    (``process_document``) or from a local path (``process_file``).
    """

    # Paragraphs are separated by one or more blank (whitespace-only) lines.
    _PARAGRAPH_BREAK = re.compile(r'\n\s*\n')
    # Whole-word match so that "stable" / "comfortable" do not count as tables.
    _TABLE_WORD = re.compile(r'\btables?\b')
    _SECTION_REF = re.compile(r'\b(section|clause|article)\s+\d+')

    # Checked in order; the first pattern found in the lower-cased text wins.
    # "hr" is matched as a whole word because as a bare substring it occurs in
    # everyday words such as "three" or "through".
    _DOCUMENT_TYPE_PATTERNS = (
        ('insurance_policy', re.compile(r'policy|insurance|premium|coverage')),
        ('legal_contract', re.compile(r'contract|agreement|terms')),
        ('hr_document', re.compile(r'employee|\bhr\b|benefits|salary')),
    )

    def __init__(self, chunk_size: int = 600, chunk_overlap: int = 100):
        """Create a processor.

        Args:
            chunk_size: Maximum number of words per chunk.
            chunk_overlap: Number of words shared by consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not in ``[0, chunk_size)``.
        """
        self.supported_formats = ['.pdf', '.docx', '.txt']
        self.chunk_size = chunk_size   # words per chunk
        self.chunk_overlap = chunk_overlap
        self._validate_chunking()

    # === Public methods ===
    def process_document(self, url: str) -> Dict:
        """Download a document from ``url`` and extract its text and metadata.

        The format is chosen from the URL extension or, failing that, from the
        response's ``Content-Type`` header; both checks are case-insensitive.
        Anything that is not recognised as PDF or DOCX is returned as the
        decoded response text.

        Returns:
            A dict with keys ``text``, ``metadata``, ``document_type``, ``url``.

        Raises:
            Exception: On any download or parsing failure.  The original
                error is attached as ``__cause__``.
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            content_type = response.headers.get('content-type', '').lower()

            if self._url_has_suffix(url, '.pdf') or 'pdf' in content_type:
                text = self._extract_pdf_text(response.content)
            elif self._url_has_suffix(url, '.docx') or 'word' in content_type:
                text = self._extract_docx_text(response.content)
            else:
                text = response.text

            return {
                'text': text,
                'metadata': self._extract_metadata(text),
                'document_type': self._detect_document_type(text),
                'url': url
            }
        except Exception as e:
            # Keep the broad Exception type callers already catch, but chain
            # the cause so the original traceback is not lost.
            raise Exception(f"Error processing document: {str(e)}") from e

    def process_file(self, file_path: str) -> List[Dict]:
        """Process a local file into chunks with metadata.

        PDFs are chunked page by page, so ``page`` is the 1-based PDF page
        number.  DOCX and TXT files are treated as a single page.

        Returns:
            A list of ``{id, text, page, metadata}`` dicts; ``id`` is
            ``chunk-<n>`` with ``n`` counting across the whole file.

        Raises:
            ValueError: If the file extension is not .pdf, .docx or .txt.
        """
        path = Path(file_path)
        suffix = path.suffix.lower()

        if suffix == ".pdf":
            pages = self._pdf_pages_from_path(file_path)
        elif suffix == ".docx":
            pages = [self._extract_docx_text_from_path(file_path)]
        elif suffix == ".txt":
            pages = [path.read_text(encoding="utf-8", errors="ignore")]
        else:
            raise ValueError(f"Unsupported file format: {suffix}")

        # Chunk each page and add page number metadata
        chunks = []
        for pnum, page_text in enumerate(pages, start=1):
            for chunk in self._chunk_text(page_text):
                chunks.append({
                    "id": f"chunk-{len(chunks)}",
                    "text": chunk,
                    "page": pnum,
                    "metadata": self._extract_metadata(chunk)
                })
        return chunks

    # === Internal extractors ===
    @staticmethod
    def _url_has_suffix(url: str, suffix: str) -> bool:
        """Return True if the URL, or just its path part, ends with ``suffix``.

        Checking the path as well lets ``.../report.PDF?download=1`` be
        recognised; checking the full URL keeps the original behaviour.
        """
        suffix = suffix.lower()
        return (url.lower().endswith(suffix)
                or urlparse(url).path.lower().endswith(suffix))

    @staticmethod
    def _pdf_page_texts(source) -> List[str]:
        """Return the extracted text of every page ('' where none is found)."""
        reader = PyPDF2.PdfReader(source)
        return [(page.extract_text() or "") for page in reader.pages]

    @staticmethod
    def _docx_text(source) -> str:
        """Return the document's paragraphs joined by newlines."""
        doc = docx.Document(source)
        return "\n".join(p.text for p in doc.paragraphs)

    def _extract_pdf_text(self, content: bytes) -> str:
        """Extract text from in-memory PDF bytes, one newline after each page."""
        return "".join(
            page_text + "\n" for page_text in self._pdf_page_texts(BytesIO(content))
        )

    def _pdf_pages_from_path(self, file_path: str) -> List[str]:
        """Return a list with the extracted text of each page of a PDF file."""
        return self._pdf_page_texts(file_path)

    def _extract_docx_text(self, content: bytes) -> str:
        """Extract paragraph text from in-memory DOCX bytes."""
        return self._docx_text(BytesIO(content))

    def _extract_docx_text_from_path(self, file_path: str) -> str:
        """Extract paragraph text from a DOCX file on disk."""
        return self._docx_text(file_path)

    # === Metadata & type detection ===
    def _extract_metadata(self, text: str) -> Dict:
        """Compute simple statistics and keyword flags for ``text``.

        Returns:
            A dict with ``word_count``, ``character_count``,
            ``paragraph_count`` (non-empty blocks separated by blank lines,
            0 for empty text), ``has_tables`` (the word "table"/"tables"
            appears) and ``has_sections`` ("section|clause|article <number>"
            appears).
        """
        lowered = text.lower()
        paragraphs = [
            block for block in self._PARAGRAPH_BREAK.split(text) if block.strip()
        ]
        return {
            'word_count': len(text.split()),
            'character_count': len(text),
            'paragraph_count': len(paragraphs),
            'has_tables': bool(self._TABLE_WORD.search(lowered)),
            'has_sections': bool(self._SECTION_REF.search(lowered))
        }

    def _detect_document_type(self, text: str) -> str:
        """Classify ``text`` by keyword.

        Returns one of ``insurance_policy``, ``legal_contract``,
        ``hr_document`` or ``general_document``; categories are tried in that
        order and the first with a keyword hit wins.
        """
        text_lower = text.lower()
        for doc_type, pattern in self._DOCUMENT_TYPE_PATTERNS:
            if pattern.search(text_lower):
                return doc_type
        return 'general_document'

    # === Chunking ===
    def _validate_chunking(self) -> None:
        """Raise ValueError unless chunk_size > 0 and 0 <= chunk_overlap < chunk_size."""
        if self.chunk_size <= 0:
            raise ValueError("chunk_size must be a positive number of words")
        if not 0 <= self.chunk_overlap < self.chunk_size:
            raise ValueError("chunk_overlap must be >= 0 and smaller than chunk_size")

    def _chunk_text(self, text: str) -> List[str]:
        """Split ``text`` into whitespace-normalised chunks of words.

        Each chunk holds at most ``chunk_size`` words and starts
        ``chunk_size - chunk_overlap`` words after the previous one.

        Returns:
            The list of chunks; empty if ``text`` contains no words.

        Raises:
            ValueError: If the current chunk settings would not advance
                (previously this looped forever) or would skip words.
        """
        self._validate_chunking()
        step = self.chunk_size - self.chunk_overlap

        words = text.split()
        chunks = []
        start = 0
        while start < len(words):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            # Once a window reaches the last word, another window would only
            # repeat words already in this chunk, so stop here.
            if end >= len(words):
                break
            start += step
        return chunks