File size: 4,340 Bytes
d037cdf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
"""HTML parser.

Contains parser for html files.

"""
import re
from pathlib import Path
from typing import Dict, Union
from abc import abstractmethod
from pathlib import Path
from typing import Dict, List, Optional, Union


class BaseParser:
    """Base class for all parsers."""

    def __init__(self, parser_config: Optional[Dict] = None):
        """Init params."""
        self._parser_config = parser_config

    def init_parser(self) -> None:
        """Init parser and store it."""
        parser_config = self._init_parser()
        self._parser_config = parser_config

    @property
    def parser_config_set(self) -> bool:
        """Check if parser config is set."""
        return self._parser_config is not None

    @property
    def parser_config(self) -> Dict:
        """Check if parser config is set."""
        if self._parser_config is None:
            raise ValueError("Parser config not set.")
        return self._parser_config

    @abstractmethod
    def _init_parser(self) -> Dict:
        """Initialize the parser with the config."""

    @abstractmethod
    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, List[str]]:
        """Parse file."""

class HTMLParser(BaseParser):
    """HTML parser."""

    def _init_parser(self) -> Dict:
        """Init parser."""
        return {}

    def parse_file(self, file: Path, errors: str = "ignore") -> Union[str, list[str]]:
        """Parse file.

            Returns:
            Union[str, List[str]]: a string or a List of strings.
        """
        try:
            from unstructured.partition.html import partition_html
            from unstructured.staging.base import convert_to_isd
            from unstructured.cleaners.core import clean
        except ImportError:
            raise ValueError("unstructured package is required to parse HTML files.")

        # Using the unstructured library to convert the html to isd format
        # isd sample : isd = [
        #   {"text": "My Title", "type": "Title"},
        #   {"text": "My Narrative", "type": "NarrativeText"}
        # ]
        with open(file, "r", encoding="utf-8") as fp:
            elements = partition_html(file=fp)
            isd = convert_to_isd(elements)

            # Removing non ascii charactwers from isd_el['text']
        for isd_el in isd:
            isd_el['text'] = isd_el['text'].encode("ascii", "ignore").decode()

        # Removing all the \n characters from isd_el['text'] using regex and replace with single space
        # Removing all the extra spaces  from isd_el['text'] using regex and replace with single space
        for isd_el in isd:
            isd_el['text'] = re.sub(r'\n', ' ', isd_el['text'], flags=re.MULTILINE | re.DOTALL)
            isd_el['text'] = re.sub(r"\s{2,}", " ", isd_el['text'], flags=re.MULTILINE | re.DOTALL)

        # more cleaning: extra_whitespaces, dashes, bullets, trailing_punctuation
        for isd_el in isd:
            clean(isd_el['text'], extra_whitespace=True, dashes=True, bullets=True, trailing_punctuation=True)

        # Creating a list of all the indexes of isd_el['type'] = 'Title'
        title_indexes = [i for i, isd_el in enumerate(isd) if isd_el['type'] == 'Title']

        # Creating 'Chunks' - List of lists of strings 
        # each list starting with with isd_el['type'] = 'Title' and all the data till the next 'Title'
        # Each Chunk can be thought of as an individual set of data, which can be sent to the model
        # Where Each Title is grouped together with the data under it

        Chunks = [[]]
        final_chunks = list(list())

        for i, isd_el in enumerate(isd):
            if i in title_indexes:
                Chunks.append([])
            Chunks[-1].append(isd_el['text'])

        # Removing all the chunks with sum of lenth of all the strings in the chunk < 25
        # TODO: This value can be an user defined variable
        for chunk in Chunks:
            # sum of lenth of all the strings in the chunk
            sum = 0
            sum += len(str(chunk))
            if sum < 25:
                Chunks.remove(chunk)
            else:
                # appending all the approved chunks to final_chunks as a single string       
                final_chunks.append(" ".join([str(item) for item in chunk]))
        return final_chunks