Create utils/document_processor.py

utils/document_processor.py · ADDED (+104 -0)
# utils/document_processor.py
import docx
import fitz  # PyMuPDF
import pytesseract
import spacy
import torch
from PIL import Image
from transformers import AutoTokenizer, AutoModel
from typing import List, Dict


class DocumentProcessor:
    """Extracts text from common document formats and splits it into
    embedding-backed chunks using Legal-BERT."""

    def __init__(self):
        self.nlp = spacy.load("en_core_web_sm")
        self.tokenizer = AutoTokenizer.from_pretrained("nlpaueb/legal-bert-base-uncased")
        self.model = AutoModel.from_pretrained("nlpaueb/legal-bert-base-uncased")

    def process_document(self, file_path: str) -> str:
        """Dispatch on file extension and return the extracted text."""
        file_extension = file_path.split('.')[-1].lower()

        try:
            if file_extension in ['jpg', 'jpeg', 'png']:
                return self._process_image(file_path)
            elif file_extension == 'pdf':
                return self._process_pdf(file_path)
            elif file_extension == 'docx':
                return self._process_docx(file_path)
            else:
                # Fall back to reading the file as plain UTF-8 text.
                with open(file_path, 'r', encoding='utf-8') as file:
                    return file.read()
        except Exception as e:
            print(f"Error processing document: {e}")
            return ""

    def _process_image(self, file_path: str) -> str:
        """Run OCR on an image file (requires the Tesseract binary)."""
        image = Image.open(file_path)
        return pytesseract.image_to_string(image)

    def _process_pdf(self, file_path: str) -> str:
        """Extract text from a PDF, falling back to OCR for pages that
        have no extractable text layer (e.g. scanned pages)."""
        doc = fitz.open(file_path)
        text = ""

        for page in doc:
            page_text = page.get_text()

            # If this page has no text layer, render it and OCR the image.
            if not page_text.strip():
                pix = page.get_pixmap()
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                page_text = pytesseract.image_to_string(img)

            text += page_text

        doc.close()
        return text

    def _process_docx(self, file_path: str) -> str:
        """Extract text from a DOCX file, one paragraph per line."""
        doc = docx.Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    def chunk_document(self, text: str, chunk_size: int = 1000) -> List[Dict]:
        """Split text into sentence-aligned chunks of at most roughly
        `chunk_size` tokens each."""
        doc = self.nlp(text)
        chunks = []
        current_chunk = ""
        current_tokens = 0

        for sent in doc.sents:
            sentence = sent.text.strip()
            # Count content tokens only; per-sentence [CLS]/[SEP] would
            # inflate the running total.
            tokens = self.tokenizer.encode(sentence, add_special_tokens=False)

            if current_tokens + len(tokens) > chunk_size and current_chunk:
                chunks.append(self._create_chunk(current_chunk))
                current_chunk = sentence
                current_tokens = len(tokens)
            else:
                current_chunk = (current_chunk + " " + sentence).strip()
                current_tokens += len(tokens)

        if current_chunk:
            chunks.append(self._create_chunk(current_chunk))

        return chunks

    def _create_chunk(self, text: str) -> Dict:
        """Wrap a chunk of text with its mean-pooled Legal-BERT embedding."""
        # Truncation caps input at the model's maximum length (512 for BERT).
        inputs = self.tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        with torch.no_grad():
            outputs = self.model(**inputs)
            # Mean-pool the token embeddings into one vector per chunk.
            embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

        return {
            "text": text,
            "embeddings": embeddings,
            "metadata": {
                "length": len(text),
                "token_count": len(self.tokenizer.encode(text))
            }
        }
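
For reference, a minimal usage sketch of the class above. This driver is hypothetical and not part of the commit: the script name and the path "contract.pdf" are placeholders, and it assumes the Tesseract binary and spaCy's en_core_web_sm model are installed locally.

# usage_sketch.py (hypothetical driver, not part of this commit)
from utils.document_processor import DocumentProcessor

processor = DocumentProcessor()

# "contract.pdf" is a placeholder path used for illustration only.
text = processor.process_document("contract.pdf")
chunks = processor.chunk_document(text, chunk_size=512)

for chunk in chunks:
    # Each chunk carries its text, a 768-dim mean-pooled Legal-BERT
    # embedding (BERT-base hidden size), and simple length metadata.
    print(chunk["metadata"]["token_count"], chunk["embeddings"].shape)

The per-chunk embeddings returned this way can then be indexed in a vector store or compared with cosine similarity for retrieval.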