# Source: NLPHub / stages/data_preprocessing.py
# Hosting-page header (not part of the program): NeonSamurai,
# "Update stages/data_preprocessing.py", commit 9d517d8 (verified).
import streamlit as st
def main():
    """Render the "Data Pre-processing" page of the Streamlit app.

    The page shows an introduction, a description of the six cleaning steps,
    a text area pre-filled with a sample sentence, the text before and after
    cleaning (whenever the text area is non-empty), and a closing summary.
    """

    def preprocess_text(text: str) -> str:
        """Return *text* after applying the cleaning steps shown on the page.

        Steps, in order:

        1. Lowercase the text.
        2. Remove HTML/XML tags (any ``<...>`` span that fits on one line).
        3. Remove URLs (any run of non-whitespace starting with ``http``
           or ``www``).
        4. Remove ``@mentions`` and ``#hashtags``.
        5. Remove emojis.
        6. Remove every character that is not an ASCII letter or whitespace.

        Whitespace left behind by removed tokens is not collapsed.
        """
        # Imports stay function-local, as in the original code; they now
        # follow the docstring so that the docstring is actually attached
        # to the function.
        import re
        import emoji

        text = text.lower()
        text = re.sub(r'<.*?>', '', text)
        # `http\S+` already covers every `https...` token, so the separate
        # `https\S+` alternative was redundant and has been dropped.
        text = re.sub(r'http\S+|www\S+', '', text)
        text = re.sub(r'@\w+|#\w+', '', text)
        text = emoji.replace_emoji(text, replace='')
        # The text is already lowercase; `A-Z` is kept so the pattern stays
        # correct even if the lowercasing step is ever reordered.
        text = re.sub(r'[^a-zA-Z\s]', '', text)
        return text

    st.title("**Step 4: Data Pre-processing**")

    st.markdown("""
### **Welcome to Text Pre-processing**
In this step, we clean and prepare raw text for further analysis or model training. This is essential for ensuring that the models understand the core content of the text.
Here’s a breakdown of the pre-processing steps we’ll perform:
""")

    st.markdown("""
1. **Lowercasing**: Converts all text to lowercase to avoid distinguishing between the same word in different cases (e.g., 'Apple' and 'apple').
2. **Removing HTML/XML Tags**: Strips out HTML/XML tags (e.g., `<p>`, `<div>`) that don't contribute to the meaning.
3. **Removing URLs**: Eliminates URLs (e.g., `https://example.com`) which are irrelevant to most NLP tasks.
4. **Removing Mentions/Hashtags**: Removes social media mentions (e.g., `@username`) and hashtags (e.g., `#hashtag`), to focus on the text itself.
5. **Handling Emojis**: Emojis are removed or replaced with a placeholder (e.g., :blush: becomes 'blush').
6. **Handling Numbers and Special Characters**: Removes numbers and special characters unless they carry specific meaning.
""")

    st.markdown("""
### **Try it yourself!**:sparkles:
Here's an example for you to preprocess. Try pasting it into the text box below, and see how it gets cleaned. Feel free to modify it or enter your own text with URLs, mentions, emojis, etc.
""")

    user_input = st.text_area(
        "**Enter text to preprocess**",
        "Add some text with URLs, mentions, emojis, etc. here! For example: 'Check this out: https://example.com @user #fun'",
    )

    # Only show the before/after comparison when there is something to clean.
    if user_input:
        preprocessed_text = preprocess_text(user_input)
        st.subheader("**Pre-processed Text**")
        st.write("**Before Pre-processing**: \n")
        st.markdown(user_input)
        st.write("**After Pre-processing**: \n")
        st.markdown(preprocessed_text)

    # NOTE(review): assumed to be shown regardless of whether the text area
    # is empty (i.e. outside the `if` above) - confirm against the intended
    # page layout.
    st.markdown("""
---
### **Why is Pre-processing Important?**
- It removes noise from text, such as irrelevant links or social media mentions.
- It standardizes the text, making it easier for models to learn patterns.
- It improves the performance of NLP models by simplifying the input.
""")

    st.divider()
# Render the page as soon as this module runs. There is no `__main__` guard,
# so importing the module also calls main().
main()