# my-fast-api / backend / document_processor.py
# strelizi's picture
# updated
# 3806245
import re
from io import BytesIO
from pathlib import Path
from typing import Dict, List
from urllib.parse import urlparse

import docx
import PyPDF2
import requests
class AdvancedDocumentProcessor:
    """Extract text from PDF, DOCX and plain-text documents and chunk it.

    Two entry points are offered:

    * ``process_document(url)`` downloads a document and returns its full
      text together with heuristic metadata and a coarse document type.
    * ``process_file(file_path)`` reads a local file and returns a list of
      overlapping word-based chunks, each tagged with its page number.
    """

    # Compiled once; all of these are applied to lower-cased text.
    _SECTION_RE = re.compile(r'\b(section|clause|article)\s+\d+')
    # Whole-word match so "suitable" / "comfortable" do not count as tables.
    _TABLE_RE = re.compile(r'\btables?\b')
    # Whole-word match so "three" / "through" do not count as HR content.
    _HR_WORD_RE = re.compile(r'\bhr\b')
    # A paragraph break is a blank (possibly whitespace-only) line.
    _PARAGRAPH_BREAK_RE = re.compile(r'\n\s*\n')

    # Checked in order; the first group with a keyword present wins.
    _DOCUMENT_TYPE_KEYWORDS = (
        ('insurance_policy', ('policy', 'insurance', 'premium', 'coverage')),
        ('legal_contract', ('contract', 'agreement', 'terms')),
        ('hr_document', ('employee', 'benefits', 'salary')),
    )

    def __init__(self, chunk_size: int = 600, chunk_overlap: int = 100):
        """Configure chunking.

        Args:
            chunk_size: Maximum number of words per chunk.
            chunk_overlap: Number of words shared by consecutive chunks.

        Raises:
            ValueError: If ``chunk_size`` is below 1, or ``chunk_overlap`` is
                negative or not smaller than ``chunk_size`` (the chunk window
                could never advance).
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")
        if not 0 <= chunk_overlap < chunk_size:
            raise ValueError(
                "chunk_overlap must be non-negative and smaller than chunk_size"
            )
        self.supported_formats = ['.pdf', '.docx', '.txt']
        self.chunk_size = chunk_size  # words per chunk
        self.chunk_overlap = chunk_overlap

    # === Public methods ===
    def process_document(self, url: str) -> Dict:
        """Download document from URL and extract text + metadata.

        The format is chosen from the URL extension or the response
        ``content-type`` header; anything that is neither PDF nor Word is
        treated as plain text.

        Returns:
            A dict with keys ``text``, ``metadata``, ``document_type``
            and ``url``.

        Raises:
            Exception: Wraps any download or extraction failure; the original
                error is attached as ``__cause__``.
        """
        try:
            response = requests.get(url, timeout=30)
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').lower()
            if self._url_has_extension(url, '.pdf') or 'pdf' in content_type:
                text = self._extract_pdf_text(response.content)
            elif self._url_has_extension(url, '.docx') or 'word' in content_type:
                text = self._extract_docx_text(response.content)
            else:
                text = response.text
            return {
                'text': text,
                'metadata': self._extract_metadata(text),
                'document_type': self._detect_document_type(text),
                'url': url
            }
        except Exception as e:
            # Chain the cause so the original traceback is not lost.
            raise Exception(f"Error processing document: {str(e)}") from e

    def process_file(self, file_path: str) -> List[Dict]:
        """
        Process a local file path into chunks with metadata.

        PDF files yield one page per PDF page; DOCX and TXT files are treated
        as a single page.

        Returns: list of {id, text, page, metadata}

        Raises:
            ValueError: If the file suffix is not .pdf, .docx or .txt.
        """
        path = Path(file_path)
        suffix = path.suffix.lower()
        if suffix == ".pdf":
            pages = self._pdf_pages_from_path(file_path)
        elif suffix == ".docx":
            pages = [self._extract_docx_text_from_path(file_path)]
        elif suffix == ".txt":
            pages = [path.read_text(encoding="utf-8", errors="ignore")]
        else:
            raise ValueError(f"Unsupported file format: {suffix}")

        # Chunk each page and add page number metadata; ids are numbered
        # continuously across pages.
        chunks = []
        for pnum, page_text in enumerate(pages, start=1):
            for chunk in self._chunk_text(page_text):
                chunks.append({
                    "id": f"chunk-{len(chunks)}",
                    "text": chunk,
                    "page": pnum,
                    "metadata": self._extract_metadata(chunk)
                })
        return chunks

    # === Internal extractors ===
    @staticmethod
    def _url_has_extension(url: str, extension: str) -> bool:
        """Return True if the URL, or its path component, ends with *extension*.

        The comparison is case-insensitive, and checking the parsed path lets
        URLs carrying a query string or fragment still be recognised.
        """
        lowered = url.lower()
        return lowered.endswith(extension) or urlparse(lowered).path.endswith(extension)

    def _extract_pdf_text(self, content: bytes) -> str:
        """Return the text of every page of an in-memory PDF, one page per line group."""
        reader = PyPDF2.PdfReader(BytesIO(content))
        # extract_text() may return None/empty for pages without a text layer.
        return "".join((page.extract_text() or "") + "\n" for page in reader.pages)

    def _pdf_pages_from_path(self, file_path: str) -> List[str]:
        """Return the extracted text of each page of the PDF at *file_path*."""
        reader = PyPDF2.PdfReader(file_path)
        return [(p.extract_text() or "") for p in reader.pages]

    def _extract_docx_text(self, content: bytes) -> str:
        """Return the newline-joined paragraph text of an in-memory DOCX."""
        doc = docx.Document(BytesIO(content))
        return "\n".join(p.text for p in doc.paragraphs)

    def _extract_docx_text_from_path(self, file_path: str) -> str:
        """Return the newline-joined paragraph text of the DOCX at *file_path*."""
        doc = docx.Document(file_path)
        return "\n".join(p.text for p in doc.paragraphs)

    # === Metadata & type detection ===
    def _extract_metadata(self, text: str) -> Dict:
        """Compute simple, heuristic statistics about *text*.

        Returns:
            A dict with ``word_count``, ``character_count``,
            ``paragraph_count`` (non-blank blocks separated by blank lines),
            ``has_tables`` (the word "table"/"tables" occurs) and
            ``has_sections`` (e.g. "section 3", "clause 12", "article 4").
        """
        text_lower = text.lower()
        paragraphs = [
            block for block in self._PARAGRAPH_BREAK_RE.split(text) if block.strip()
        ]
        return {
            'word_count': len(text.split()),
            'character_count': len(text),
            'paragraph_count': len(paragraphs),
            'has_tables': bool(self._TABLE_RE.search(text_lower)),
            'has_sections': bool(self._SECTION_RE.search(text_lower))
        }

    def _detect_document_type(self, text: str) -> str:
        """Classify *text* by keyword presence.

        Returns one of ``'insurance_policy'``, ``'legal_contract'``,
        ``'hr_document'`` or ``'general_document'``; earlier categories take
        precedence when several match.
        """
        text_lower = text.lower()
        for doc_type, keywords in self._DOCUMENT_TYPE_KEYWORDS:
            if any(word in text_lower for word in keywords):
                return doc_type
        # "hr" is too short for substring matching; require a standalone word.
        if self._HR_WORD_RE.search(text_lower):
            return 'hr_document'
        return 'general_document'

    # === Chunking ===
    def _chunk_text(self, text: str) -> List[str]:
        """Split *text* into overlapping chunks of whitespace-separated words.

        Each chunk holds at most ``chunk_size`` words and consecutive chunks
        share ``chunk_overlap`` words. Splitting stops as soon as a chunk
        reaches the last word, so no trailing chunk consisting only of
        already-covered words is produced. Empty text yields an empty list.
        """
        words = text.split()
        # The attributes are public and may have been reassigned after
        # construction; clamp so the window always advances.
        size = max(1, self.chunk_size)
        step = max(1, size - self.chunk_overlap)
        chunks = []
        for start in range(0, len(words), step):
            chunks.append(" ".join(words[start:start + size]))
            if start + size >= len(words):
                break
        return chunks