Spaces:
Running
Running
| import gradio as gr | |
| from hazm import Normalizer, word_tokenize, Lemmatizer, POSTagger, Chunker | |
# Shared Hazm components, built once at import time so that process_text
# can reuse them instead of re-creating them on every call.
lemmatizer = Lemmatizer()

# The tagger and the chunker are each constructed from a model file given
# as a relative path under "resources/".
pos_tagger = POSTagger(
    model="resources/pos_tagger.model",
)
chunker = Chunker(
    model="resources/chunker.model",
)
def process_text(text, operation, correct_spacing, remove_diacritics, remove_specials_chars, decrease_repeated_chars, persian_style, persian_numbers, unicodes_replacement, seperate_mi):
    """Apply one Hazm operation to ``text`` and return the result as a string.

    Args:
        text: The raw input text.
        operation: One of "normalize", "tokenize", "lemmatize", "chunk"
            or "pos_tag".
        correct_spacing, remove_diacritics, remove_specials_chars,
        decrease_repeated_chars, persian_style, persian_numbers,
        unicodes_replacement, seperate_mi: Flags forwarded unchanged to the
            Hazm ``Normalizer`` constructor. They only matter when
            ``operation`` is "normalize".

    Returns:
        - "normalize": the normalized text.
        - "tokenize": the tokens joined by single spaces.
        - "lemmatize": the lemma of each token, joined by single spaces.
        - "chunk": ``str()`` of whatever the chunker's ``parse`` returns.
        - "pos_tag": space-separated ``token/tag`` pairs.
        - An empty string for empty/``None`` input, for input that yields
          no tokens, or for an unrecognised operation.
    """
    # Nothing to process: avoid handing None/"" to the Hazm components.
    if not text:
        return ""

    if operation == "normalize":
        # The normalizer is the only consumer of the eight flags, so it is
        # built here rather than on every call regardless of operation.
        normalizer = Normalizer(
            correct_spacing=correct_spacing,
            remove_diacritics=remove_diacritics,
            remove_specials_chars=remove_specials_chars,
            decrease_repeated_chars=decrease_repeated_chars,
            persian_style=persian_style,
            persian_numbers=persian_numbers,
            unicodes_replacement=unicodes_replacement,
            seperate_mi=seperate_mi,
        )
        return normalizer.normalize(text)

    # Unknown operations keep producing an empty result, without doing any
    # tokenization work first.
    if operation not in ("tokenize", "lemmatize", "chunk", "pos_tag"):
        return ""

    # Every remaining operation starts from the same token list.
    tokens = word_tokenize(text)
    if not tokens:
        # E.g. whitespace-only input: do not call the tagger/chunker with
        # an empty sentence.
        return ""

    if operation == "tokenize":
        return " ".join(tokens)

    if operation == "lemmatize":
        return " ".join(lemmatizer.lemmatize(token) for token in tokens)

    # "chunk" and "pos_tag" both need the POS-tagged tokens.
    pos_tags = pos_tagger.tag(tokens)
    if operation == "chunk":
        return str(chunker.parse(pos_tags))

    # operation == "pos_tag"; format: token/POS
    return " ".join(f"{token}/{tag}" for token, tag in pos_tags)
def toggle_normalization_options(operation):
    """Return the visibility update for the normalization options container.

    Args:
        operation: The currently selected operation name.

    Returns:
        A single ``gr.update`` whose ``visible`` flag is true only when
        ``operation`` is "normalize".
    """
    # The change listener registers exactly one output (the column that wraps
    # all normalization checkboxes), so exactly one update is returned here;
    # returning a list of eight updates does not match that single output.
    return gr.update(visible=(operation == "normalize"))
# Gradio UI definition.
# NOTE(review): the nesting below is reconstructed; confirm the layout
# against the running app.
with gr.Blocks() as demo:
    # Page heading and short usage hint.
    gr.Markdown("# Persian Text Processor with Hazm")
    gr.Markdown("Select an operation and, if applicable, adjust normalization parameters to process the input text using Hazm.")

    # Text to be processed.
    with gr.Row():
        input_text = gr.Textbox(
            label="Input Text",
            lines=10,
            placeholder="Enter Persian text here...",
        )

    # Operation selector; "normalize" is preselected.
    with gr.Row():
        operation = gr.Radio(
            label="Select Operation",
            choices=["normalize", "tokenize", "lemmatize", "chunk", "pos_tag"],
            value="normalize",
            info="Choose the type of text processing operation to perform.",
        )

    # One checkbox per Normalizer flag, grouped in a single column so the
    # whole group can be shown or hidden together.
    with gr.Column(visible=True) as normalization_options:
        correct_spacing = gr.Checkbox(
            label="Correct Spacing",
            value=True,
            info="Adjusts spaces between words for proper formatting.",
        )
        remove_diacritics = gr.Checkbox(
            label="Remove Diacritics",
            value=True,
            info="Eliminates diacritical marks from the text.",
        )
        remove_specials_chars = gr.Checkbox(
            label="Remove Special Characters",
            value=True,
            info="Removes non-alphanumeric characters.",
        )
        decrease_repeated_chars = gr.Checkbox(
            label="Decrease Repeated Characters",
            value=True,
            info="Reduces sequences of repeated characters to a single character.",
        )
        persian_style = gr.Checkbox(
            label="Persian Style",
            value=True,
            info="Applies standard Persian typography rules.",
        )
        persian_numbers = gr.Checkbox(
            label="Persian Numbers",
            value=True,
            info="Converts Arabic numbers to Persian numbers.",
        )
        unicodes_replacement = gr.Checkbox(
            label="Unicodes Replacement",
            value=True,
            info="Replaces characters with their standard Unicode equivalents.",
        )
        seperate_mi = gr.Checkbox(
            label="Separate 'می'",
            value=True,
            info="Separates the Persian prefix 'می' from verbs.",
        )

    # Re-evaluate the visibility of the options column whenever the selected
    # operation changes.
    operation.change(
        fn=toggle_normalization_options,
        inputs=operation,
        outputs=normalization_options,
    )

    # Read-only result box with a copy button.
    output_text = gr.Textbox(
        label="Processed Text",
        lines=10,
        interactive=False,
        show_copy_button=True,
    )

    submit_button = gr.Button("Process Text")
    # The inputs are passed positionally, so their order must match the
    # parameter order of process_text.
    submit_button.click(
        fn=process_text,
        inputs=[
            input_text,
            operation,
            correct_spacing,
            remove_diacritics,
            remove_specials_chars,
            decrease_repeated_chars,
            persian_style,
            persian_numbers,
            unicodes_replacement,
            seperate_mi,
        ],
        outputs=output_text,
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()