Spaces:

ThanaphonJoe
/

hug101

Runtime error

ThanaphonJoe committed on Feb 23, 2024

Commit

43452d4

verified ·

1 Parent(s): 6200f99

test

Files changed (1) hide show

app.py CHANGED Viewed

@@ -7,6 +7,9 @@ from sklearn.model_selection import train_test_split
 from sklearn.metrics import confusion_matrix
 import matplotlib.pyplot as plt
 import re
 def deEmojify(text):
@@ -34,12 +37,32 @@ def deEmojify(text):
 # Pre-change side of the diff for clean_me: adds progressively cleaned text
 # columns to `data` in place and returns the same object.
 def clean_me(data):
   # Strip HTML-like tags (any '<...>' span) into 'clean_text'.
   # NOTE(review): this line calls `.str` on `data` itself, while the lines below
   # index `data` by column name and call `data.apply(..., axis=1)` as for a
   # DataFrame -- presumably why this line is marked for removal; confirm.
-  data['clean_text'] = data.str.replace(r'<[^<>]*>', '', regex=True)
   # Trim, lower-case, then replace '\r+', '\n+' and '\t+' with a single space each.
   # NOTE(review): no regex= flag is passed to these three replace calls, so whether
   # the '+' acts as a quantifier depends on the installed pandas default -- confirm.
   data['clean2_text']= data['clean_text'].str.strip().str.lower().str.replace('\r+', ' ').str.replace('\n+',' ').str.replace('\t+',' ')
   # Pass each row's 'clean2_text' through deEmojify.
   data['clean3_text'] = data.apply(lambda row: deEmojify(row['clean2_text']), axis=1)
   return(data)
 def combine(a, b):
     """Return ``a`` followed by a single space and then ``b``."""
     separator = " "
     return a + separator + b
@@ -79,5 +102,7 @@ with gr.Blocks() as demo:
         cache_examples=True,
     )
 # Launch the Gradio Blocks app bound to `demo` only when this file is run
 # directly, not when it is imported.
 if __name__ == "__main__":
     demo.launch()

 from sklearn.metrics import confusion_matrix
 import matplotlib.pyplot as plt
 import re
+from pythainlp.util import normalize
+from pythainlp.corpus import thai_stopwords
+from pythainlp.tokenize import word_tokenize
 def deEmojify(text):
 def clean_me(data):
     """Clean and word-segment the text held in ``data['text']``.

     The intermediate columns are added to ``data`` in place:

     * ``clean_text`` - ``text`` with HTML-like ``<...>`` tags removed
     * ``clean2_text`` - trimmed, lower-cased, with each run of CR, LF or TAB
       characters replaced by one space
     * ``clean3_text`` - ``clean2_text`` passed through ``deEmojify``
     * ``clean4_text`` - ``clean3_text`` passed through pythainlp ``normalize``
     * ``wordseged_text`` - token list from ``word_tokenize`` using the
       ``newmm-safe`` engine
     * ``wordseged_space_text`` - those tokens joined with single spaces

     Args:
         data: pandas DataFrame with a string column named ``text``.

     Returns:
         The same ``data`` object with the columns listed above added.
     """
     # NOTE(review): the stop-word list previously assembled here on every call
     # was never applied to any column, so it is dropped. No stop-word
     # filtering is performed by this function.
     data['clean_text'] = data['text'].str.replace(r'<[^<>]*>', '', regex=True)
     # regex=True must be explicit: since pandas 2.0 str.replace treats the
     # pattern as a literal string by default, so '\r+' would never match.
     data['clean2_text'] = (
         data['clean_text']
         .str.strip()
         .str.lower()
         .str.replace(r'\r+', ' ', regex=True)
         .str.replace(r'\n+', ' ', regex=True)
         .str.replace(r'\t+', ' ', regex=True)
     )
     # Column-wise apply avoids building a row object per record and also
     # yields an empty Series (not a DataFrame) when ``data`` has no rows.
     data['clean3_text'] = data['clean2_text'].apply(deEmojify)
     # Normalize text
     data['clean4_text'] = data['clean3_text'].apply(normalize)
     # Word segmentation: this is the slow step on large inputs.
     data['wordseged_text'] = data['clean4_text'].apply(
         lambda text: word_tokenize(text, engine="newmm-safe")
     )
     # Join the segmented tokens with spaces.
     data['wordseged_space_text'] = data['wordseged_text'].apply(" ".join)
     return data
 def combine(a, b):
     """Run ``a`` through ``clean_me`` and join the result to ``b``.

     ``a`` is wrapped in a one-row DataFrame under the ``text`` column and
     passed to ``clean_me``. The first ``wordseged_space_text`` value, with the
     literal ``'123'`` appended, is then concatenated with a space and ``b``.
     """
     frame = clean_me(pd.DataFrame({'text': [a]}))
     # NOTE(review): the '123' suffix looks like a debugging marker -- confirm
     # it is meant to appear in the returned text.
     segmented = frame['wordseged_space_text'][0] + '123'
     return segmented + " " + b
         cache_examples=True,
     )
 # Script entry point: start the `demo` app only when the module is executed
 # directly rather than imported.
 if __name__ == "__main__":
     demo.launch()