Spaces:

rapacious
/

NLTK

Paused

App Files Files Community

rapacious committed on Feb 24, 2025

Commit

6d4f6ab

verified ·

1 Parent(s): 3b6e119

Update app.py

Browse files

Files changed (1) hide show

app.py +106 -59

app.py CHANGED Viewed

@@ -24,13 +24,10 @@ stemmer = PorterStemmer()
 lemmatizer = WordNetLemmatizer()
 stop_words = set(stopwords.words('english'))
-# Hàm huấn luyện classifier sửa lại
 def train_classifier():
-    # Lấy danh sách file từ thư mục pos và neg
-    pos_files = movie_reviews.fileids('pos')[:50]  # Giới hạn 50 file để nhanh hơn
     neg_files = movie_reviews.fileids('neg')[:50]
-    # Tạo tập huấn luyện
     pos_reviews = [({word: True for word in movie_reviews.words(fileid)}, 'positive') for fileid in pos_files]
     neg_reviews = [({word: True for word in movie_reviews.words(fileid)}, 'negative') for fileid in neg_files]
     train_set = pos_reviews + neg_reviews
@@ -39,55 +36,45 @@ def train_classifier():
 classifier = train_classifier()
-# Hàm chính xử lý các chức năng
-def nlp_tool(input_text, function):
     if not input_text:
         return "Vui lòng nhập văn bản!"
     if function == "Sentence Tokenization":
         return "\n".join(sent_tokenize(input_text))
     elif function == "Word Tokenization":
         return "\n".join(word_tokenize(input_text))
     elif function == "Part-of-Speech Tagging":
         words = word_tokenize(input_text)
         return "\n".join([f"{word}: {tag}" for word, tag in pos_tag(words)])
     elif function == "Stemming":
         words = word_tokenize(input_text)
         return "\n".join([stemmer.stem(word) for word in words])
     elif function == "Lemmatization":
         words = word_tokenize(input_text)
         return "\n".join([lemmatizer.lemmatize(word) for word in words])
     elif function == "Remove Stop Words":
         words = word_tokenize(input_text)
         return "\n".join([word for word in words if word.lower() not in stop_words])
     elif function == "Named Entity Recognition":
         words = word_tokenize(input_text)
         pos_tags = pos_tag(words)
         entities = ne_chunk(pos_tags)
         return str(entities)
     elif function == "Text Classification":
         words = word_tokenize(input_text)
         result = classifier.classify({word: True for word in words})
         return f"Sentiment: {result}"
     elif function == "N-grams (Bigrams)":
         words = word_tokenize(input_text)
         bigrams = list(ngrams(words, 2))
         return "\n".join([f"{w1} - {w2}" for w1, w2 in bigrams])
     elif function == "Collocations":
         words = word_tokenize(input_text)
         finder = BigramCollocationFinder.from_words(words)
         collocations = finder.nbest(nltk.collocations.BigramAssocMeasures().pmi, 5)
         return "\n".join([f"{w1} - {w2}" for w1, w2 in collocations])
     elif function == "WordNet Synsets":
         words = word_tokenize(input_text)
         first_word = words[0] if words else ""
@@ -95,61 +82,121 @@ def nlp_tool(input_text, function):
         if synsets:
             return f"Definition: {synsets[0].definition()}\nExamples: {synsets[0].examples()}"
         return "Không tìm thấy từ trong WordNet!"
     elif function == "Sample from Brown Corpus":
         return " ".join(brown.words()[:50])
     return "Chức năng chưa được triển khai!"
-# Tạo giao diện Gradio
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # Công cụ xử lý ngôn ngữ tự nhiên với NLTK
-        Nhập văn bản và chọn chức năng để khám phá các khả năng của NLTK!
         """
     )
-    with gr.Row():
-        with gr.Column(scale=1):
-            input_text = gr.Textbox(
-                label="Nhập văn bản",
-                placeholder="Ví dụ: I love coding in Python.",
-                lines=5
             )
-            function_dropdown = gr.Dropdown(
-                label="Chọn chức năng",
-                choices=[
-                    "Sentence Tokenization",
-                    "Word Tokenization",
-                    "Part-of-Speech Tagging",
-                    "Stemming",
-                    "Lemmatization",
-                    "Remove Stop Words",
-                    "Named Entity Recognition",
-                    "Text Classification",
-                    "N-grams (Bigrams)",
-                    "Collocations",
-                    "WordNet Synsets",
-                    "Sample from Brown Corpus"
-                ],
-                value="Sentence Tokenization"
             )
-            submit_btn = gr.Button("Xử lý", variant="primary")
-        with gr.Column(scale=2):
-            output_text = gr.Textbox(
-                label="Kết quả",
-                lines=10,
-                interactive=False
             )
-    # Kết nối nút bấm với hàm xử lý
-    submit_btn.click(
-        fn=nlp_tool,
-        inputs=[input_text, function_dropdown],
-        outputs=output_text
-    )
 # Chạy giao diện
 demo.launch()

 lemmatizer = WordNetLemmatizer()
 stop_words = set(stopwords.words('english'))
+# Hàm huấn luyện classifier
 def train_classifier():
+    pos_files = movie_reviews.fileids('pos')[:50]
     neg_files = movie_reviews.fileids('neg')[:50]
     pos_reviews = [({word: True for word in movie_reviews.words(fileid)}, 'positive') for fileid in pos_files]
     neg_reviews = [({word: True for word in movie_reviews.words(fileid)}, 'negative') for fileid in neg_files]
     train_set = pos_reviews + neg_reviews
 classifier = train_classifier()
+# Hàm xử lý cho từng chức năng
+def process_text(input_text, function):
     if not input_text:
         return "Vui lòng nhập văn bản!"
     if function == "Sentence Tokenization":
         return "\n".join(sent_tokenize(input_text))
     elif function == "Word Tokenization":
         return "\n".join(word_tokenize(input_text))
     elif function == "Part-of-Speech Tagging":
         words = word_tokenize(input_text)
         return "\n".join([f"{word}: {tag}" for word, tag in pos_tag(words)])
     elif function == "Stemming":
         words = word_tokenize(input_text)
         return "\n".join([stemmer.stem(word) for word in words])
     elif function == "Lemmatization":
         words = word_tokenize(input_text)
         return "\n".join([lemmatizer.lemmatize(word) for word in words])
     elif function == "Remove Stop Words":
         words = word_tokenize(input_text)
         return "\n".join([word for word in words if word.lower() not in stop_words])
     elif function == "Named Entity Recognition":
         words = word_tokenize(input_text)
         pos_tags = pos_tag(words)
         entities = ne_chunk(pos_tags)
         return str(entities)
     elif function == "Text Classification":
         words = word_tokenize(input_text)
         result = classifier.classify({word: True for word in words})
         return f"Sentiment: {result}"
     elif function == "N-grams (Bigrams)":
         words = word_tokenize(input_text)
         bigrams = list(ngrams(words, 2))
         return "\n".join([f"{w1} - {w2}" for w1, w2 in bigrams])
     elif function == "Collocations":
         words = word_tokenize(input_text)
         finder = BigramCollocationFinder.from_words(words)
         collocations = finder.nbest(nltk.collocations.BigramAssocMeasures().pmi, 5)
         return "\n".join([f"{w1} - {w2}" for w1, w2 in collocations])
     elif function == "WordNet Synsets":
         words = word_tokenize(input_text)
         first_word = words[0] if words else ""
         if synsets:
             return f"Definition: {synsets[0].definition()}\nExamples: {synsets[0].examples()}"
         return "Không tìm thấy từ trong WordNet!"
     elif function == "Sample from Brown Corpus":
         return " ".join(brown.words()[:50])
     return "Chức năng chưa được triển khai!"
+# Tạo giao diện Gradio với các tab
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
     gr.Markdown(
         """
         # Công cụ xử lý ngôn ngữ tự nhiên với NLTK
+        Chọn tab và nhập văn bản để khám phá các tính năng!
         """
     )
+    with gr.Tabs():
+        # Tab 1: Tokenization
+        with gr.TabItem("Tokenization"):
+            with gr.Row():
+                token_input = gr.Textbox(label="Nhập văn bản", placeholder="Ví dụ: I love coding.", lines=5)
+                token_dropdown = gr.Dropdown(
+                    label="Chọn chức năng",
+                    choices=["Sentence Tokenization", "Word Tokenization"],
+                    value="Sentence Tokenization"
+                )
+            token_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
+            token_btn = gr.Button("Xử lý", variant="primary")
+            gr.Markdown(
+                """
+                ### Hướng dẫn:
+                - **Sentence Tokenization:** Tách văn bản thành các câu riêng biệt.
+                - **Word Tokenization:** Tách văn bản thành các từ riêng lẻ.
+                """
             )
+            token_btn.click(fn=process_text, inputs=[token_input, token_dropdown], outputs=token_output)
+        # Tab 2: Morphology
+        with gr.TabItem("Morphology"):
+            with gr.Row():
+                morph_input = gr.Textbox(label="Nhập văn bản", placeholder="Ví dụ: Running is fun.", lines=5)
+                morph_dropdown = gr.Dropdown(
+                    label="Chọn chức năng",
+                    choices=["Stemming", "Lemmatization", "Remove Stop Words"],
+                    value="Stemming"
+                )
+            morph_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
+            morph_btn = gr.Button("Xử lý", variant="primary")
+            gr.Markdown(
+                """
+                ### Hướng dẫn:
+                - **Stemming:** Rút gọn từ về dạng gốc thô (VD: 'running' → 'run').
+                - **Lemmatization:** Rút gọn từ về dạng gốc có nghĩa (VD: 'better' → 'good').
+                - **Remove Stop Words:** Loại bỏ từ dừng như 'the', 'is' (chỉ hỗ trợ tiếng Anh).
+                """
             )
+            morph_btn.click(fn=process_text, inputs=[morph_input, morph_dropdown], outputs=morph_output)
+        # Tab 3: Syntax & Semantics
+        with gr.TabItem("Syntax & Semantics"):
+            with gr.Row():
+                syntax_input = gr.Textbox(label="Nhập văn bản", placeholder="Ví dụ: Apple is in California.", lines=5)
+                syntax_dropdown = gr.Dropdown(
+                    label="Chọn chức năng",
+                    choices=["Part-of-Speech Tagging", "Named Entity Recognition", "WordNet Synsets"],
+                    value="Part-of-Speech Tagging"
+                )
+            syntax_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
+            syntax_btn = gr.Button("Xử lý", variant="primary")
+            gr.Markdown(
+                """
+                ### Hướng dẫn:
+                - **Part-of-Speech Tagging:** Gắn nhãn từ loại (VD: danh từ, động từ).
+                - **Named Entity Recognition:** Nhận diện thực thể (VD: tên người, địa điểm).
+                - **WordNet Synsets:** Tra cứu định nghĩa và ví dụ từ WordNet (cho từ đầu tiên).
+                """
             )
+            syntax_btn.click(fn=process_text, inputs=[syntax_input, syntax_dropdown], outputs=syntax_output)
+        # Tab 4: Text Analysis
+        with gr.TabItem("Text Analysis"):
+            with gr.Row():
+                analysis_input = gr.Textbox(label="Nhập văn bản", placeholder="Ví dụ: I love this movie!", lines=5)
+                analysis_dropdown = gr.Dropdown(
+                    label="Chọn chức năng",
+                    choices=["Text Classification", "N-grams (Bigrams)", "Collocations"],
+                    value="Text Classification"
+                )
+            analysis_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
+            analysis_btn = gr.Button("Xử lý", variant="primary")
+            gr.Markdown(
+                """
+                ### Hướng dẫn:
+                - **Text Classification:** Phân loại cảm xúc (tích cực/tiêu cực).
+                - **N-grams (Bigrams):** Tạo các cặp từ liên tiếp.
+                - **Collocations:** Tìm các cặp từ thường xuất hiện cùng nhau.
+                """
+            )
+            analysis_btn.click(fn=process_text, inputs=[analysis_input, analysis_dropdown], outputs=analysis_output)
+        # Tab 5: Corpus
+        with gr.TabItem("Corpus"):
+            with gr.Row():
+                corpus_input = gr.Textbox(label="Nhập văn bản (không cần thiết)", placeholder="Để trống cũng được", lines=5)
+                corpus_dropdown = gr.Dropdown(
+                    label="Chọn chức năng",
+                    choices=["Sample from Brown Corpus"],
+                    value="Sample from Brown Corpus"
+                )
+            corpus_output = gr.Textbox(label="Kết quả", lines=10, interactive=False)
+            corpus_btn = gr.Button("Xử lý", variant="primary")
+            gr.Markdown(
+                """
+                ### Hướng dẫn:
+                - **Sample from Brown Corpus:** Lấy mẫu 50 từ từ Brown Corpus bất kể văn bản nhập vào.
+                """
+            )
+            corpus_btn.click(fn=process_text, inputs=[corpus_input, corpus_dropdown], outputs=corpus_output)
 # Chạy giao diện
 demo.launch()