Spaces:

synaptyx
/

SuoMoto.AI

Sleeping

File size: 6,992 Bytes

d1dd20b
d178ae1
 
 
d1dd20b
d178ae1
 
289cd09
d1dd20b
 
 
68613c4
d178ae1
 
 
d1dd20b
 
 
 
 
 
 
 
 
289cd09
68613c4
d178ae1
68613c4
 
d1dd20b
 
 
68613c4
 
 
 
289cd09
68613c4
 
 
 
 
 
 
 
 
 
 
d178ae1
68613c4
d1dd20b
d178ae1
d1dd20b
68613c4
68fead3
68613c4
 
 
 
 
 
 
d1dd20b
68613c4
d1dd20b
 
 
68613c4
d1dd20b
 
68613c4
d1dd20b
68613c4
 
 
d1dd20b
68613c4
d1dd20b
 
68613c4
d1dd20b
68613c4
68fead3
68613c4
d1dd20b
 
 
68613c4
d1dd20b
 
 
 
68613c4
 
d1dd20b
 
 
 
 
68613c4
d1dd20b
68613c4
68fead3
 
d178ae1
68613c4
d1dd20b
 
 
 
68613c4
d1dd20b
68613c4
d1dd20b
d178ae1
d1dd20b
68613c4
 
 
 
d1dd20b
 
 
 
68613c4
d1dd20b
68613c4
d178ae1
d1dd20b
68613c4
 
 
 
d1dd20b
 
 
d178ae1
 
 
d1dd20b
68613c4
68fead3
 
 
68613c4
 
68fead3
 
d1dd20b
d178ae1
d1dd20b
d178ae1
 
68fead3
 
 
68613c4
 
68fead3
 
d178ae1
 
 
d1dd20b

# utils/document_processor.py
import pytesseract
from pdf2image import convert_from_path
import docx
import fitz  # PyMuPDF
from PIL import Image
import io
from typing import List, Dict, Optional, Union, Any
import re
import tempfile
import os
import streamlit as st

class DocumentProcessor:
    """Extract, clean, chunk and describe uploaded documents.

    Text extraction is dispatched on the file extension through
    ``self.supported_formats``. Extraction errors are reported with
    ``st.error`` and result in an empty string instead of an exception.
    """

    def __init__(self):
        # Lower-case extension (without the dot) -> method that takes a file
        # path and returns the raw text extracted from that file.
        self.supported_formats = {
            'pdf': self._process_pdf,
            'docx': self._process_docx,
            'txt': self._process_text,
            'jpg': self._process_image,
            'jpeg': self._process_image,
            'png': self._process_image
        }

    def process_document(self, uploaded_file: Any) -> str:
        """Extract cleaned text from an uploaded file.

        Args:
            uploaded_file: Object exposing ``name`` (file name with extension)
                and ``getbuffer()`` (the file content as bytes).

        Returns:
            The cleaned text, or ``""`` if the format is unsupported or any
            step fails (the error is shown via ``st.error``).
        """
        tmp_path = None
        try:
            file_extension = uploaded_file.name.rsplit('.', 1)[-1].lower()

            if file_extension not in self.supported_formats:
                raise ValueError(f"Unsupported file format: {file_extension}")

            # The extractors work on file paths, so spill the upload to disk.
            # The file is closed before it is processed and removed: an open
            # temporary file cannot be deleted on Windows.
            with tempfile.NamedTemporaryFile(delete=False, suffix=f'.{file_extension}') as tmp_file:
                tmp_path = tmp_file.name
                tmp_file.write(uploaded_file.getbuffer())

            processor = self.supported_formats[file_extension]
            return self._clean_text(processor(tmp_path))

        except Exception as e:
            st.error(f"Error processing document: {str(e)}")
            return ""
        finally:
            # Always remove the temporary copy, including on failure.
            if tmp_path is not None:
                try:
                    os.unlink(tmp_path)
                except OSError:
                    # Best-effort cleanup; a leftover temp file must not mask
                    # the extraction result.
                    pass

    def _process_pdf(self, file_path: str) -> str:
        """Return the concatenated text of every page of a PDF file.

        Returns ``""`` (after reporting via ``st.error``) on failure.
        """
        try:
            with fitz.open(file_path) as doc:
                return "".join(page.get_text() for page in doc)
        except Exception as e:
            st.error(f"Error processing PDF: {str(e)}")
            return ""

    def _process_docx(self, file_path: str) -> str:
        """Return the text of a DOCX file.

        Paragraphs come first, followed by one entry per table row with the
        cell texts joined by ``" | "``. Entries are separated by a blank line.
        Returns ``""`` (after reporting via ``st.error``) on failure.
        """
        try:
            doc = docx.Document(file_path)
            parts = [para.text for para in doc.paragraphs]
            for table in doc.tables:
                parts.extend(
                    " | ".join(cell.text for cell in row.cells)
                    for row in table.rows
                )
            return "\n\n".join(parts)
        except Exception as e:
            st.error(f"Error processing DOCX: {str(e)}")
            return ""

    def _process_text(self, file_path: str) -> str:
        """Read a text file, trying several encodings.

        UTF-8 is tried first, then cp1252, then latin-1. latin-1 maps every
        byte to a character, so it is kept last as the catch-all; placing it
        earlier would make the later candidates unreachable.

        Returns ``""`` (after reporting via ``st.error``) if the file cannot
        be read.
        """
        for encoding in ('utf-8', 'cp1252', 'latin-1'):
            try:
                with open(file_path, 'r', encoding=encoding) as file:
                    return file.read()
            except UnicodeDecodeError:
                # Wrong guess; try the next candidate encoding.
                continue
            except Exception as e:
                st.error(f"Error processing text file: {str(e)}")
                return ""
        return ""

    def _process_image(self, file_path: str) -> str:
        """Run OCR on an image file and return the recognised text.

        The image is converted to RGB first when it is in another mode.
        Returns ``""`` (after reporting via ``st.error``) on failure.
        """
        try:
            # Context manager releases the underlying file handle.
            with Image.open(file_path) as image:
                rgb_image = image if image.mode == 'RGB' else image.convert('RGB')
                return pytesseract.image_to_string(rgb_image)
        except Exception as e:
            st.error(f"Error processing image: {str(e)}")
            return ""

    def _clean_text(self, text: str) -> str:
        """Normalise whitespace and strip special characters.

        Runs of spaces/tabs collapse to one space, every character other than
        word characters, whitespace and ``. , ! ? -`` is removed, and empty
        lines are dropped. Line breaks are kept, and any run of blank lines
        becomes a single blank line so that paragraph boundaries survive for
        ``chunk_document``.

        Returns ``""`` for empty or ``None`` input.
        """
        if not text:
            return ""

        # Unify line endings so only '\n' marks a line break.
        text = text.replace('\r\n', '\n').replace('\r', '\n')
        # Collapse horizontal whitespace only; collapsing '\n' as well would
        # flatten the document into one line and erase paragraph boundaries.
        text = re.sub(r'[^\S\n]+', ' ', text)
        # Remove special characters but keep basic punctuation.
        text = re.sub(r'[^\w\s.,!?-]', '', text)

        paragraphs = []
        for block in re.split(r'\n\s*\n', text):
            lines = [line.strip() for line in block.split('\n') if line.strip()]
            if lines:
                paragraphs.append('\n'.join(lines))
        return '\n\n'.join(paragraphs)

    def chunk_document(self, text: str, chunk_size: int = 2000) -> List[Dict]:
        """Split text into chunks of at most ``chunk_size`` characters.

        Paragraphs (separated by a blank line) are packed greedily into
        chunks, joined by ``"\\n\\n"``. A paragraph longer than ``chunk_size``
        is split on whitespace (or hard-cut when it has none) so that no chunk
        exceeds the limit. A non-positive ``chunk_size`` disables splitting of
        long paragraphs, and each paragraph becomes its own chunk.

        Returns:
            A list of ``{"text": str, "metadata": {"length": int,
            "type": "paragraph"}}`` dicts; ``[]`` for empty input.
        """
        if not text:
            return []

        paragraphs = [p.strip() for p in text.split('\n\n') if p.strip()]

        chunks = []
        current_chunk = ""

        for para in paragraphs:
            for piece in self._split_paragraph(para, chunk_size):
                # Count the 2-character joiner so the limit is a true bound.
                if current_chunk and len(current_chunk) + 2 + len(piece) > chunk_size:
                    chunks.append(self._make_chunk(current_chunk))
                    current_chunk = piece
                elif current_chunk:
                    current_chunk += "\n\n" + piece
                else:
                    current_chunk = piece

        if current_chunk:
            chunks.append(self._make_chunk(current_chunk))

        return chunks

    def _split_paragraph(self, para: str, chunk_size: int) -> List[str]:
        """Break one paragraph into pieces no longer than ``chunk_size``."""
        if chunk_size <= 0 or len(para) <= chunk_size:
            return [para]

        pieces = []
        remaining = para
        while len(remaining) > chunk_size:
            # Prefer the last whitespace that still fits within the limit.
            limit = chunk_size + 1
            cut = max(remaining.rfind(' ', 0, limit), remaining.rfind('\n', 0, limit))
            if cut <= 0:
                # No usable whitespace: cut in the middle of the word.
                cut = chunk_size
            pieces.append(remaining[:cut].rstrip())
            remaining = remaining[cut:].lstrip()
        if remaining:
            pieces.append(remaining)
        return pieces

    def _make_chunk(self, chunk_text: str) -> Dict:
        """Wrap chunk text in the dict structure returned by chunk_document."""
        return {
            "text": chunk_text,
            "metadata": {
                "length": len(chunk_text),
                "type": "paragraph"
            }
        }

    def get_document_metadata(self, file_path: str) -> Dict:
        """Collect file-system and format-specific metadata for a document.

        Returns:
            A dict with ``filename``, ``file_type`` (lower-case extension
            without the dot), ``file_size``, ``created_time`` and
            ``modified_time``. PDFs add ``page_count`` and ``pdf_metadata``;
            DOCX files add ``paragraph_count`` and ``table_count``. On
            failure the error is printed and a dict with only ``filename``
            and ``error`` is returned.
        """
        try:
            # splitext only looks at the final path component, so a dot in a
            # directory name is not mistaken for the file extension.
            file_extension = os.path.splitext(file_path)[1].lstrip('.').lower()

            metadata = {
                "filename": os.path.basename(file_path),
                "file_type": file_extension,
                "file_size": os.path.getsize(file_path),
                "created_time": os.path.getctime(file_path),
                "modified_time": os.path.getmtime(file_path)
            }

            # Add format-specific metadata
            if file_extension == 'pdf':
                # Context manager closes the PDF once its details are read.
                with fitz.open(file_path) as doc:
                    metadata.update({
                        "page_count": doc.page_count,
                        "pdf_metadata": doc.metadata
                    })

            elif file_extension == 'docx':
                doc = docx.Document(file_path)
                metadata.update({
                    "paragraph_count": len(doc.paragraphs),
                    "table_count": len(doc.tables)
                })

            return metadata

        except Exception as e:
            print(f"Error extracting metadata: {str(e)}")
            return {
                "filename": os.path.basename(file_path),
                "error": str(e)
            }