Spaces:
Runtime error
Runtime error
Update scripts/process.py
Browse files- scripts/process.py +10 -19
scripts/process.py
CHANGED
|
@@ -58,39 +58,29 @@ def load_document(
|
|
| 58 |
id_hash_keys=id_hash_keys))
|
| 59 |
|
| 60 |
return documents
|
| 61 |
-
|
| 62 |
-
def preprocessing(document
|
| 63 |
-
split_by: Literal["sentence", "word"] = 'sentence',
|
| 64 |
-
split_length:int = 3):
|
| 65 |
-
|
| 66 |
"""
|
| 67 |
-
takes in haystack document object and splits it into
|
| 68 |
Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
|
| 69 |
list that contains all text joined together.
|
| 70 |
"""
|
| 71 |
-
|
| 72 |
-
split_respect_sentence_boundary = False
|
| 73 |
-
split_overlap=0
|
| 74 |
-
else:
|
| 75 |
-
split_respect_sentence_boundary = True
|
| 76 |
-
split_overlap= 20
|
| 77 |
-
|
| 78 |
preprocessor = PreProcessor(
|
| 79 |
clean_empty_lines=True,
|
| 80 |
clean_whitespace=True,
|
| 81 |
clean_header_footer=True,
|
| 82 |
-
split_by=
|
| 83 |
-
split_length=
|
| 84 |
-
split_respect_sentence_boundary=
|
| 85 |
-
split_overlap=
|
| 86 |
)
|
| 87 |
for i in document:
|
| 88 |
docs_processed = preprocessor.process([i])
|
| 89 |
for item in docs_processed:
|
| 90 |
item.content = basic(item.content)
|
| 91 |
|
| 92 |
-
|
| 93 |
-
# logger.info("document has been splitted to {}".format(len(docs_processed)))
|
| 94 |
|
| 95 |
# create dataframe of text and list of all text
|
| 96 |
#df = pd.DataFrame(docs_processed)
|
|
@@ -98,5 +88,6 @@ def load_document(
|
|
| 98 |
#par_list = df.content.to_list()
|
| 99 |
|
| 100 |
return docs_processed #, df, all_text, par_list
|
|
|
|
| 101 |
|
| 102 |
|
|
|
|
| 58 |
id_hash_keys=id_hash_keys))
|
| 59 |
|
| 60 |
return documents
|
| 61 |
+
|
| 62 |
+
def preprocessing(document):
|
|
|
|
|
|
|
|
|
|
| 63 |
"""
|
| 64 |
+
takes in haystack document object and splits it into paragraphs and applies simple cleaning.
|
| 65 |
Returns cleaned list of haystack document objects. One paragraph per object. Also returns pandas df and
|
| 66 |
list that contains all text joined together.
|
| 67 |
"""
|
| 68 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 69 |
preprocessor = PreProcessor(
|
| 70 |
clean_empty_lines=True,
|
| 71 |
clean_whitespace=True,
|
| 72 |
clean_header_footer=True,
|
| 73 |
+
split_by="sentence",
|
| 74 |
+
split_length=3,
|
| 75 |
+
split_respect_sentence_boundary=False,
|
| 76 |
+
split_overlap=1
|
| 77 |
)
|
| 78 |
for i in document:
|
| 79 |
docs_processed = preprocessor.process([i])
|
| 80 |
for item in docs_processed:
|
| 81 |
item.content = basic(item.content)
|
| 82 |
|
| 83 |
+
st.write("your document has been splitted to", len(docs_processed), "paragraphs")
|
|
|
|
| 84 |
|
| 85 |
# create dataframe of text and list of all text
|
| 86 |
#df = pd.DataFrame(docs_processed)
|
|
|
|
| 88 |
#par_list = df.content.to_list()
|
| 89 |
|
| 90 |
return docs_processed #, df, all_text, par_list
|
| 91 |
+
|
| 92 |
|
| 93 |
|