cryogenic22 committed on
Commit
d178ae1
·
verified ·
1 Parent(s): fb4d9a7

Create utils/document_processor.py

Browse files
Files changed (1) hide show
  1. utils/document_processor.py +104 -0
utils/document_processor.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # utils/document_processor.py
2
+ import pytesseract
3
+ from pdf2image import convert_from_path
4
+ import docx
5
+ import fitz # PyMuPDF
6
+ from PIL import Image
7
+ import io
8
+ from typing import List, Dict
9
+ import spacy
10
+ from transformers import AutoTokenizer, AutoModel
11
+ import torch
12
+ import numpy as np
13
+
14
class DocumentProcessor:
    """Extract text from documents (images, PDFs, DOCX, plain text),
    split it into token-bounded chunks, and embed each chunk with Legal-BERT.
    """

    def __init__(self):
        # spaCy pipeline used only for sentence segmentation in chunk_document.
        self.nlp = spacy.load("en_core_web_sm")
        # Legal-domain BERT used for chunk embeddings and token counting.
        self.tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
        self.model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

    def process_document(self, file_path: str) -> str:
        """Extract text from *file_path*, dispatching on the file extension.

        jpg/jpeg/png -> OCR; pdf -> PyMuPDF (with per-page OCR fallback);
        docx -> python-docx; anything else is read as UTF-8 text.

        Returns "" on any error (the error is printed, not raised) —
        deliberate best-effort semantics preserved from the original.
        """
        file_extension = file_path.split('.')[-1].lower()

        try:
            if file_extension in ['jpg', 'jpeg', 'png']:
                return self._process_image(file_path)
            elif file_extension == 'pdf':
                return self._process_pdf(file_path)
            elif file_extension == 'docx':
                return self._process_docx(file_path)
            else:
                with open(file_path, 'r', encoding='utf-8') as file:
                    return file.read()
        except Exception as e:
            print(f"Error processing document: {str(e)}")
            return ""

    def _process_image(self, file_path: str) -> str:
        """OCR a single image file with Tesseract."""
        image = Image.open(file_path)
        return pytesseract.image_to_string(image)

    def _process_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, falling back to OCR page by page.

        BUGFIX: the original tested the *cumulative* text buffer inside the
        loop, so an image-only page appearing after any text-bearing page was
        never OCR'd. Each page's own text is now tested. The document handle
        is also closed deterministically (the original leaked it).
        """
        doc = fitz.open(file_path)
        try:
            pages = []
            for page_num in range(doc.page_count):
                page = doc[page_num]
                page_text = page.get_text()

                # No embedded text on this page: rasterize it and OCR.
                if not page_text.strip():
                    pix = page.get_pixmap()
                    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                    page_text = pytesseract.image_to_string(img)

                pages.append(page_text)
            return "".join(pages)
        finally:
            doc.close()  # release the underlying file handle even on error

    def _process_docx(self, file_path: str) -> str:
        """Concatenate all paragraph text from a DOCX file, one per line."""
        doc = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    def chunk_document(self, text: str, chunk_size: int = 1000) -> List[Dict]:
        """Split *text* into sentence-aligned chunks of at most ~chunk_size
        BERT tokens each and return one embedded-chunk dict per chunk.

        A sentence is never split across chunks, so a single sentence longer
        than chunk_size still forms its own (oversized) chunk.

        BUGFIX: the original seeded each chunk via `current_chunk += " " + s`
        on an empty string, so every first chunk's text (and its "length"
        metadata) carried a spurious leading space. Sentences are now joined
        without it.
        """
        doc = self.nlp(text)
        chunks: List[Dict] = []
        current_sentences: List[str] = []
        current_tokens = 0

        for sent in doc.sents:
            sentence = sent.text.strip()
            # Token count includes the special tokens encode() adds; this is a
            # conservative (slightly over-counting) budget, as in the original.
            n_tokens = len(self.tokenizer.encode(sentence))

            if current_tokens + n_tokens > chunk_size and current_sentences:
                # Budget exceeded: flush the accumulated chunk, start fresh.
                chunks.append(self._create_chunk(" ".join(current_sentences)))
                current_sentences = [sentence]
                current_tokens = n_tokens
            else:
                current_sentences.append(sentence)
                current_tokens += n_tokens

        # Flush the trailing partial chunk.
        if current_sentences:
            chunks.append(self._create_chunk(" ".join(current_sentences)))

        return chunks

    def _create_chunk(self, text: str) -> Dict:
        """Build one chunk record: {"text", "embeddings", "metadata"}.

        The embedding is the mean of Legal-BERT's last hidden states over the
        (possibly truncated) token sequence, as a 1-D numpy array.
        """
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
        # Mean-pool token embeddings -> one fixed-size vector for the chunk.
        embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

        return {
            "text": text,
            "embeddings": embeddings,
            "metadata": {
                "length": len(text),
                # NOTE: re-encodes the full chunk; may exceed chunk_size since
                # per-sentence counts each included their own special tokens.
                "token_count": len(self.tokenizer.encode(text)),
            },
        }