Spaces:

peter2000
/

policy_test

Runtime error

App Files Files Community

peter2000 committed on Jul 28, 2022

Commit

5a67f08

1 Parent(s): 9423e4e

Delete udfPreprocess/cleaning,py

Browse files

Files changed (1) hide show

udfPreprocess/cleaning,py +0 -144

udfPreprocess/cleaning,py DELETED Viewed

@@ -1,144 +0,0 @@
-import pandas as pd
-import numpy as np
-import string
-import nltk
-import spacy
-import en_core_web_sm
-import re
-import streamlit as st
-from haystack.nodes import PreProcessor
-'''basic cleaning - suitable for transformer models'''
-def basic(s):
-    """
-    Light-touch cleaning that leaves the text close to its original form.
-
-    Only URLs are removed; casing, punctuation, digits and line breaks inside the text are kept.
-
-    :param s: string to be processed
-    :return: the string with every URL replaced by a single space and with leading/trailing
-             whitespace stripped
-    """
-    url_rules = (
-        # a line that starts with a URL: drop it up to the end of the line, together with the
-        # line-break characters that follow it
-        (r'^https?:\/\/.*[\r\n]*', re.MULTILINE),
-        # any URL left over in the middle of running text
-        (r"http\S+", 0),
-    )
-    for pattern, flags in url_rules:
-        s = re.sub(pattern, ' ', s, flags=flags)
-    return s.strip()
-def preprocessingForSDG(document):
-    """
-    Split haystack documents into word-based paragraphs and apply simple cleaning.
-
-    Every input document is run through a haystack PreProcessor configured with split_by="word",
-    split_length=100, split_overlap=4 and split_respect_sentence_boundary=True. The chunks of ALL
-    input documents are collected (not only those of the last one) and each chunk's content is
-    cleaned with basic().
-
-    :param document: iterable of haystack document objects
-    :return: tuple (docs_processed, df, all_text, par_list) where
-             docs_processed is the list of cleaned chunk objects, one paragraph per object,
-             df is a pandas DataFrame built from docs_processed,
-             all_text is the content of all paragraphs joined by single spaces,
-             par_list is the list of paragraph contents
-    """
-    preprocessor = PreProcessor(
-        clean_empty_lines=True,
-        clean_whitespace=True,
-        clean_header_footer=True,
-        split_by="word",
-        split_length=100,
-        split_respect_sentence_boundary=True,
-        split_overlap=4
-    )
-    # accumulate the chunks of every document; rebinding the list inside the loop would keep only
-    # the last document's chunks and leave the name undefined for an empty input
-    docs_processed = []
-    for doc in document:
-        docs_processed.extend(preprocessor.process([doc]))
-    for item in docs_processed:
-        item.content = basic(item.content)
-    st.write("your document has been splitted to", len(docs_processed), "paragraphs")
-    # create dataframe of text and list of all text
-    df = pd.DataFrame(docs_processed)
-    # read the text from the objects themselves so an empty result (a frame without a
-    # "content" column) does not raise
-    par_list = [item.content for item in docs_processed]
-    all_text = " ".join(par_list)
-    return docs_processed, df, all_text, par_list
-def preprocessing(document):
-    """
-    Split haystack documents into sentence-based paragraphs and apply simple cleaning.
-
-    Every input document is run through a haystack PreProcessor configured with
-    split_by="sentence", split_length=3, split_overlap=1 and split_respect_sentence_boundary=False.
-    The chunks of ALL input documents are collected (not only those of the last one) and each
-    chunk's content is cleaned with basic().
-
-    :param document: iterable of haystack document objects
-    :return: tuple (docs_processed, df, all_text, par_list) where
-             docs_processed is the list of cleaned chunk objects, one paragraph per object,
-             df is a pandas DataFrame built from docs_processed,
-             all_text is the content of all paragraphs joined by single spaces,
-             par_list is the list of paragraph contents
-    """
-    preprocessor = PreProcessor(
-        clean_empty_lines=True,
-        clean_whitespace=True,
-        clean_header_footer=True,
-        split_by="sentence",
-        split_length=3,
-        split_respect_sentence_boundary=False,
-        split_overlap=1
-    )
-    # accumulate the chunks of every document; rebinding the list inside the loop would keep only
-    # the last document's chunks and leave the name undefined for an empty input
-    docs_processed = []
-    for doc in document:
-        docs_processed.extend(preprocessor.process([doc]))
-    for item in docs_processed:
-        item.content = basic(item.content)
-    st.write("your document has been splitted to", len(docs_processed), "paragraphs")
-    # create dataframe of text and list of all text
-    df = pd.DataFrame(docs_processed)
-    # read the text from the objects themselves so an empty result (a frame without a
-    # "content" column) does not raise
-    par_list = [item.content for item in docs_processed]
-    all_text = " ".join(par_list)
-    return docs_processed, df, all_text, par_list
-'''processing with spacy - suitable for models such as tf-idf, word2vec'''
-def spacy_clean(alpha:str, use_nlp:bool = True) -> str:
-    """
-    Clean and tokenise a string using Spacy. Keeps only alphabetic tokens, removes stopwords and
-    filters out all but proper nouns, nouns, verbs and adjectives.
-    Parameters
-    ----------
-    alpha : str
-            The input string. When use_nlp is False this must instead be an already processed
-            sequence of Spacy tokens (e.g. a Doc), because the tokens are read from it directly.
-    use_nlp : bool, default True
-            Indicates whether Spacy needs to run its pipeline on alpha. Enable this when using this
-            function on its own. Should be set to False if used inside nlp.pipeline
-    Returns
-    -------
-    str : the lemmas of the kept tokens joined by single spaces and lowercased
-    Notes
-    -----
-    Fails if alpha is an NA value. Performance decreases as len(alpha) gets large.
-    Use together with nlp.pipeline for batch processing.
-    The Spacy model is only loaded when use_nlp is True, and only on the first such call.
-    """
-    tokens = _get_spacy_pipeline()(alpha) if use_nlp else alpha
-    kept_pos = ('PROPN', 'NOUN', 'VERB', 'ADJ')
-    lemmas = [
-        tok.lemma_
-        for tok in tokens
-        if tok.is_alpha and not tok.is_stop and tok.pos_ in kept_pos
-    ]
-    return ' '.join(lemmas).lower()
-
-
-def _get_spacy_pipeline():
-    """Load the en_core_web_sm pipeline on first use and return the same object afterwards."""
-    # loading a Spacy model is slow, so the loaded pipeline is kept on the function object
-    # instead of being re-read for every string that gets cleaned
-    if not hasattr(_get_spacy_pipeline, "_nlp"):
-        _get_spacy_pipeline._nlp = spacy.load("en_core_web_sm", disable=["parser", "ner", "textcat"])
-    return _get_spacy_pipeline._nlp