# Spaces: strelizi / my-fast-api
# Runtime error
# File size: 4,561 Bytes

import requests
import PyPDF2
import docx
from io import BytesIO
from typing import Dict, List
import re
from pathlib import Path

class AdvancedDocumentProcessor:
    """Extract text from PDF, DOCX and plain-text documents and chunk it.

    Remote documents are fetched and analysed as a whole by
    ``process_document``; local files are split page by page into
    overlapping word chunks by ``process_file``.

    Attributes:
        supported_formats: File suffixes accepted by ``process_file``.
        chunk_size: Maximum number of words in one chunk.
        chunk_overlap: Number of words shared by two consecutive chunks.
    """

    # Keyword heuristics, checked in order; the first matching rule wins.
    # "hr" is anchored on word boundaries because, as a bare substring, it
    # also matches ordinary words such as "three" or "through".
    _DOCUMENT_TYPE_RULES = (
        ('insurance_policy', re.compile(r'policy|insurance|premium|coverage')),
        ('legal_contract', re.compile(r'contract|agreement|terms')),
        ('hr_document', re.compile(r'employee|\bhr\b|benefits|salary')),
    )

    def __init__(self, chunk_size: int = 600, chunk_overlap: int = 100):
        """Create a processor.

        Args:
            chunk_size: Words per chunk (must be at least 1).
            chunk_overlap: Words repeated at the start of the next chunk
                (must satisfy ``0 <= chunk_overlap < chunk_size``).

        Raises:
            ValueError: If the chunking parameters are inconsistent.
        """
        self.supported_formats = ['.pdf', '.docx', '.txt']
        self.chunk_size = chunk_size   # words per chunk
        self.chunk_overlap = chunk_overlap
        self._check_chunk_config()

    # === Public methods ===
    def process_document(self, url: str) -> Dict:
        """Download a document from ``url`` and extract its text and metadata.

        The format is chosen from the URL suffix or, failing that, from the
        response ``content-type`` header; anything that is neither PDF nor
        Word is treated as plain text.

        Returns:
            A dict with the keys ``text``, ``metadata``, ``document_type``
            and ``url``.

        Raises:
            Exception: Wraps any failure (download, HTTP status, parsing);
                the original error is available as ``__cause__``.
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()

            # Drop the query string / fragment and compare case-insensitively
            # so that e.g. "https://host/Policy.PDF?sig=abc" is recognised.
            url_path = re.split(r'[?#]', url, maxsplit=1)[0].lower()
            content_type = response.headers.get('content-type', '').lower()

            if url_path.endswith('.pdf') or 'pdf' in content_type:
                text = self._extract_pdf_text(response.content)
            elif url_path.endswith('.docx') or 'word' in content_type:
                text = self._extract_docx_text(response.content)
            else:
                text = response.text

            return {
                'text': text,
                'metadata': self._extract_metadata(text),
                'document_type': self._detect_document_type(text),
                'url': url
            }
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Error processing document: {str(e)}") from e

    def process_file(self, file_path: str) -> List[Dict]:
        """Process a local file into chunks with metadata.

        PDF files are chunked page by page; DOCX and TXT files are treated
        as a single page.

        Returns:
            A list of dicts with the keys ``id``, ``text``, ``page``
            (1-based) and ``metadata``.

        Raises:
            ValueError: If the file suffix is not .pdf, .docx or .txt, or if
                the chunking parameters are inconsistent.
        """
        path = Path(file_path)
        suffix = path.suffix.lower()

        if suffix == ".pdf":
            pages = self._pdf_pages_from_path(file_path)
        elif suffix == ".docx":
            pages = [self._extract_docx_text_from_path(file_path)]
        elif suffix == ".txt":
            # Undecodable bytes are dropped rather than aborting the file.
            pages = [path.read_text(encoding="utf-8", errors="ignore")]
        else:
            raise ValueError(f"Unsupported file format: {suffix}")

        # Chunk each page; ids are numbered across the whole document.
        chunks = []
        for pnum, page_text in enumerate(pages, start=1):
            for chunk in self._chunk_text(page_text):
                chunks.append({
                    "id": f"chunk-{len(chunks)}",
                    "text": chunk,
                    "page": pnum,
                    "metadata": self._extract_metadata(chunk)
                })
        return chunks

    # === Internal extractors ===
    def _extract_pdf_text(self, content: bytes) -> str:
        """Return the text of every page of an in-memory PDF.

        Each page's text is followed by a newline; pages for which no text
        is extracted contribute an empty line.
        """
        reader = PyPDF2.PdfReader(BytesIO(content))
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    def _pdf_pages_from_path(self, file_path: str) -> List[str]:
        """Return one text string per page of the PDF at ``file_path``."""
        reader = PyPDF2.PdfReader(file_path)
        return [(p.extract_text() or "") for p in reader.pages]

    def _extract_docx_text(self, content: bytes) -> str:
        """Return the newline-joined paragraph text of an in-memory DOCX."""
        doc = docx.Document(BytesIO(content))
        return "\n".join(p.text for p in doc.paragraphs)

    def _extract_docx_text_from_path(self, file_path: str) -> str:
        """Return the newline-joined paragraph text of the DOCX at ``file_path``."""
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    # === Metadata & type detection ===
    def _extract_metadata(self, text: str) -> Dict:
        """Compute simple statistics and keyword flags for ``text``.

        Returns:
            A dict with ``word_count``, ``character_count``,
            ``paragraph_count`` (non-blank blocks separated by blank lines,
            so empty text has 0), ``has_tables`` (the word "table"/"tables"
            occurs) and ``has_sections`` (e.g. "section 4", "clause 12").
        """
        text_lower = text.lower()
        # Only blocks with visible content count; runs of blank lines and
        # empty input must not inflate the number.
        paragraphs = [p for p in re.split(r'\n\s*\n', text) if p.strip()]
        return {
            'word_count': len(text.split()),
            'character_count': len(text),
            'paragraph_count': len(paragraphs),
            # Whole-word match: a bare substring test also fires on
            # "suitable", "stable", "acceptable", ...
            'has_tables': bool(re.search(r'\btables?\b', text_lower)),
            'has_sections': bool(re.search(r'\b(section|clause|article)\s+\d+', text_lower))
        }

    def _detect_document_type(self, text: str) -> str:
        """Classify ``text`` by keyword.

        Returns:
            ``'insurance_policy'``, ``'legal_contract'``, ``'hr_document'``
            or ``'general_document'``; earlier categories take precedence
            when several match.
        """
        text_lower = text.lower()
        for label, pattern in self._DOCUMENT_TYPE_RULES:
            if pattern.search(text_lower):
                return label
        return 'general_document'

    # === Chunking ===
    def _check_chunk_config(self) -> None:
        """Raise ``ValueError`` unless ``0 <= chunk_overlap < chunk_size``."""
        if self.chunk_size < 1:
            raise ValueError(f"chunk_size must be at least 1, got {self.chunk_size}")
        if not 0 <= self.chunk_overlap < self.chunk_size:
            raise ValueError(
                "chunk_overlap must be in [0, chunk_size), got "
                f"chunk_overlap={self.chunk_overlap}, chunk_size={self.chunk_size}"
            )

    def _chunk_text(self, text: str) -> List[str]:
        """Split ``text`` into overlapping chunks of whitespace-separated words.

        Each chunk holds at most ``chunk_size`` words and starts
        ``chunk_size - chunk_overlap`` words after the previous one. Words
        are re-joined with single spaces. Empty text yields no chunks.

        Raises:
            ValueError: If the chunking parameters are inconsistent (a
                non-positive step would otherwise never terminate).
        """
        # Re-validated here because the attributes are public and mutable.
        self._check_chunk_config()
        words = text.split()
        step = self.chunk_size - self.chunk_overlap
        chunks = []
        for start in range(0, len(words), step):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            if end >= len(words):
                # This window already reached the last word; any further
                # window would only repeat words it contains.
                break
        return chunks