File size: 5,568 Bytes
48c7fed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import os
import re
from typing import Tuple, Optional
from pathlib import Path
from pypdf import PdfReader
from docx import Document

# Matches runs of one or more consecutive code points that fall inside the
# Unicode ranges listed below. Each range is one adjacent string literal of a
# single character class; characters outside these ranges are not matched.
_EMOJI_RE = re.compile(
    "[\U0001F600-\U0001F64F"  # U+1F600-1F64F: Emoticons (faces)
    "\U0001F300-\U0001F5FF"  # U+1F300-1F5FF: Miscellaneous Symbols and Pictographs
    "\U0001F680-\U0001F6FF"  # U+1F680-1F6FF: Transport and Map Symbols
    "\U0001F1E0-\U0001F1FF"  # U+1F1E0-1F1FF: regional indicator letters (flag pairs)
    "\U0001F900-\U0001F9FF"  # U+1F900-1F9FF: Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # U+1FA00-1FA6F: Chess Symbols / extended pictographic
    "\U0001FA70-\U0001FAFF"  # U+1FA70-1FAFF: Symbols and Pictographs Extended-A
    "]+",
    flags=re.UNICODE,
)

class FileParser:
    """
    Parse multiple file formats and extract text.
    Supports: PDF, DOCX, TXT, and raw text input.

    Every parser returns a ``(text, format, error)`` tuple instead of
    raising: on success ``error`` is None; on failure ``text`` is the empty
    string and ``error`` holds the exception that was caught.
    """

    SUPPORTED_FORMATS = {".pdf", ".docx", ".doc", ".txt"}

    @staticmethod
    def parse_file(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Parse a file and extract text.

        The parser is chosen from the lower-cased file extension only; the
        file content is not inspected here.

        Args:
            file_path: Path to the file

        Returns:
            Tuple of (text, format, error)
            - text: Extracted text content
            - format: File format (pdf, docx, txt); empty string when the
              extension is not supported
            - error: Exception if parsing failed, None if successful
        """
        file_extension = Path(file_path).suffix.lower()

        if file_extension not in FileParser.SUPPORTED_FORMATS:
            error = ValueError(f"Unsupported file format: {file_extension}")
            return "", "", error

        if file_extension == ".pdf":
            return FileParser.parse_pdf(file_path)
        # NOTE(review): ".doc" is handed to the DOCX parser; legacy binary
        # .doc files are presumably rejected by python-docx and come back as
        # an error tuple -- confirm this is the intended behavior.
        if file_extension in {".docx", ".doc"}:
            return FileParser.parse_docx(file_path)
        if file_extension == ".txt":
            return FileParser.parse_txt(file_path)

        # Only reachable if SUPPORTED_FORMATS gains an extension that has no
        # matching branch above.
        return "", "", ValueError("Unknown error")

    @staticmethod
    def parse_pdf(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from PDF file.

        The text of every page is joined with newlines and the result is
        stripped of leading/trailing whitespace.

        Args:
            file_path: Path to the PDF file

        Returns:
            (text, "pdf", None) on success, ("", "pdf", exception) on failure.
        """
        try:
            with open(file_path, 'rb') as pdf_file:
                pdf_reader = PdfReader(pdf_file)
                # `or ""` keeps a page that yields no text (None or "") from
                # failing the whole document.
                page_texts = [page.extract_text() or "" for page in pdf_reader.pages]

            return "\n".join(page_texts).strip(), "pdf", None

        except Exception as e:
            return "", "pdf", e

    @staticmethod
    def parse_docx(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from DOCX file.

        Paragraph text comes first, followed by the text of every table cell
        (row by row), each piece on its own line. Table text is therefore
        appended after the body text rather than kept in document order.

        Args:
            file_path: Path to the DOCX file

        Returns:
            (text, "docx", None) on success, ("", "docx", exception) on failure.
        """
        try:
            doc = Document(file_path)

            # Extract text from all paragraphs
            pieces = [paragraph.text for paragraph in doc.paragraphs]

            # Also extract text from tables if present
            for table in doc.tables:
                for row in table.rows:
                    pieces.extend(cell.text for cell in row.cells)

            return "\n".join(pieces).strip(), "docx", None

        except Exception as e:
            return "", "docx", e

    @staticmethod
    def parse_txt(file_path: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Extract text from plain text file.

        The file is decoded as UTF-8 (a leading byte-order mark is dropped);
        if that fails with a decoding error it is re-read as latin-1.

        Args:
            file_path: Path to the text file

        Returns:
            (text, "txt", None) on success, ("", "txt", exception) on failure.
        """
        try:
            # "utf-8-sig" reads plain UTF-8 as well, but also removes a
            # leading BOM, which str.strip() would otherwise leave in place.
            with open(file_path, 'r', encoding='utf-8-sig') as txt_file:
                text = txt_file.read()

        except UnicodeDecodeError:
            # Try with different encoding
            try:
                with open(file_path, 'r', encoding='latin-1') as txt_file:
                    text = txt_file.read()
            except Exception as e:
                return "", "txt", e

        except Exception as e:
            return "", "txt", e

        return text.strip(), "txt", None

    @staticmethod
    def parse_raw_text(text: str) -> Tuple[str, str, Optional[Exception]]:
        """
        Process raw text input.

        Args:
            text: Text supplied directly rather than read from a file

        Returns:
            (stripped text, "raw", None) on success; ("", "raw", ValueError)
            when the text is empty or whitespace-only; ("", "raw", exception)
            if stripping fails (e.g. ``text`` is not a string).
        """
        try:
            cleaned_text = text.strip()
            if not cleaned_text:
                return "", "raw", ValueError("Empty text provided")
            return cleaned_text, "raw", None
        except Exception as e:
            return "", "raw", e

class TextCleaner:
    """Clean and normalize extracted text"""

    @staticmethod
    def clean(text: str) -> str:
        """
        Collapse all whitespace in the text to single spaces.

        Every run of whitespace (spaces, tabs and any kind of line break) is
        replaced by one space, and leading/trailing whitespace is removed.
        The result therefore never contains a line break, so no separate
        line-break normalization step is needed.

        Args:
            text: Text to clean

        Returns:
            The text as a single line with single-space separators.
        """
        return ' '.join(text.split())

    @staticmethod
    def get_text_stats(text: str) -> dict:
        """
        Get statistics about text.

        Words are whitespace-separated tokens. Sentences are the non-blank
        fragments obtained by splitting on '.' only (other terminators such
        as '!' or '?' do not end a sentence here).

        Args:
            text: Text to analyze

        Returns:
            Dict with the keys:
            - character_count: length of the text, whitespace included
            - word_count: number of words
            - sentence_count: number of non-blank sentences
            - average_word_length: mean characters per word (0 if no words)
            - average_sentence_length: mean words per sentence
              (0 if no sentences)
            - emoji_count: number of characters matched by _EMOJI_RE
            - em_dash_count: occurrences of U+2014
            - arrow_count: occurrences of U+2192
        """
        words = text.split()
        # Blank fragments (e.g. the one after a trailing '.') are dropped so
        # the count and the average below are based on the same sentences.
        sentences = [s for s in text.split('.') if s.strip()]

        word_count = len(words)
        sentence_count = len(sentences)

        return {
            "character_count": len(text),
            "word_count": word_count,
            "sentence_count": sentence_count,
            # Sum the word lengths themselves: len(text) would also count the
            # whitespace between words.
            "average_word_length": sum(map(len, words)) / word_count if words else 0,
            "average_sentence_length": word_count / sentence_count if sentences else 0,
            "emoji_count": sum(len(m) for m in _EMOJI_RE.findall(text)),
            "em_dash_count": text.count('\u2014'),
            "arrow_count": text.count('\u2192'),
        }