File size: 12,533 Bytes
6c26805 0d400f0 6c26805 0d400f0 b8713da 0d400f0 419554f 7b6ca16 0d400f0 7b6ca16 0d400f0 1b03fe7 0d400f0 2454fb2 1b03fe7 1d0bfe1 1b03fe7 1d0bfe1 338b21c 1d0bfe1 338b21c 2a35973 338b21c 2a35973 338b21c cee93d5 338b21c 1d0bfe1 338b21c d8b57db 0391467 d8b57db 0391467 d8b57db 0391467 d8b57db 0391467 d8b57db 0391467 d8b57db 0391467 d8b57db 0391467 d8b57db 0391467 d8b57db 0391467 d8b57db ae16ad4 eb14a89 f5b1f4a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 |
import streamlit as st
# Inject the page-wide stylesheet.
# The string is kept flush-left on purpose: the <style> tag must start its line
# so the Markdown renderer treats the whole literal as one raw HTML block.
# Besides the original rules, the sheet now also styles the classes that the
# markup further down this file references (subtitle, header-title, sub-header,
# info-box, highlight); they previously had no rule at all and rendered unstyled.
st.markdown(
    """
<style>
body {
    background-color: #f9f9f9; /* Light gray background */
    font-family: 'Arial', sans-serif;
}
@keyframes fadeIn {
    0% { opacity: 0; transform: translateY(-20px); }
    100% { opacity: 1; transform: translateY(0); }
}
.title {
    text-align: center;
    color: #2c3e50; /* Deep gray-blue */
    font-size: 3rem;
    font-weight: bold;
    animation: fadeIn 1s ease-in-out;
}
.caption {
    text-align: center;
    font-style: italic;
    font-size: 1.2rem;
    color: #7f8c8d; /* Soft gray */
    animation: fadeIn 1.5s ease-in-out;
}
.subtitle {
    text-align: center;
    font-size: 1.2rem;
    color: #7f8c8d; /* Soft gray, same as .caption */
    animation: fadeIn 1.5s ease-in-out;
}
.header-title {
    text-align: center;
    color: #2c3e50; /* Deep gray-blue, same as .title */
    font-size: 2.2rem;
    font-weight: bold;
    margin: 20px 0 10px;
    animation: fadeIn 1s ease-in-out;
}
.sub-header {
    color: #2c3e50; /* Deep gray-blue */
    font-size: 1.6rem;
    font-weight: bold;
    margin: 16px 0 8px;
}
.section {
    font-size: 1.1rem;
    text-align: justify;
    line-height: 1.8;
    color: #34495e; /* Muted gray */
    background: #ffffff; /* White card-style background */
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    animation: fadeIn 2s ease-in-out;
    margin: 10px 0;
}
.info-box {
    font-size: 1.1rem;
    line-height: 1.8;
    color: #34495e; /* Muted gray, same as .section */
    background: #ffffff; /* White card-style background */
    padding: 20px;
    border-radius: 10px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    margin: 10px 0;
}
.highlight {
    color: #2c3e50; /* Deep gray-blue */
    font-weight: bold;
}
.image-container {
    text-align: center;
    margin: 20px 0;
    animation: fadeIn 2.5s ease-in-out;
}
.image-container img {
    border-radius: 15px;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.2);
    transition: transform 0.3s ease-in-out;
}
.image-container img:hover {
    transform: scale(1.05); /* Subtle zoom effect */
}
</style>
""",
    unsafe_allow_html=True,
)
# Page header and the first "Transforming Raw Text" card.
# NOTE(review): several non-ASCII glyphs in the literals below look like emoji or
# dashes that were decoded with the wrong charset. They are left untouched because
# the original code points cannot be recovered reliably; restore them from the
# author's copy.
st.header(":blue[β¨ Pre-processing of Text πΊοΈ]")

# NOTE(review): the opening and closing <div> are emitted by separate st.markdown
# calls. Streamlit appears to render every call as its own element, so this <div>
# probably does not wrap the widgets between the two calls. Confirm in the running
# app before relying on the card styling.
st.markdown("<div class='section'>", unsafe_allow_html=True)
st.markdown("<h2 class='title'>π Transforming Raw Text</h2>", unsafe_allow_html=True)
st.markdown("<p class='subtitle'>Convert unstructured text into a clean and structured format</p>", unsafe_allow_html=True)

# The bullet glyph had been mis-decoded into "β" plus a line break, which split
# this literal over four lines and left it unterminated. It is written as the
# escape \u2705 (check mark; presumed original, confirm) so the literal is valid
# again and no longer depends on the file's encoding.
st.info(
    "π **We preprocess text in three key ways:**\n\n"
    "\u2705 Cleaning - Problem-specific\n\n"
    "\u2705 Simple Pre-processing\n\n"
    "\u2705 Advanced Pre-processing"
)
st.markdown("</div>", unsafe_allow_html=True)
# Checklist of essential pre-processing techniques, one Markdown element per item.
# Each item's leading bullet had been mis-decoded into "β" plus a line break,
# which split every literal in two and left it unterminated. The bullet is written
# as the escape \u2705 (check mark; presumed original, confirm) so each call is a
# valid single-line literal again.
# NOTE(review): the remaining odd glyphs (heading emoji, the dash after each bold
# title) are also mis-decoded but are left as found; restore from the author's copy.
st.markdown("### β¨ **Essential Preprocessing Techniques:**")
st.markdown("\u2705 **Convert Text Case** β Convert all words to **uppercase** or **lowercase** to maintain consistency and reduce dimensions.")
st.markdown("\u2705 **Handle URLs and Tags** β Based on problem statement, either remove or preserve them.")
st.markdown("\u2705 **Mentions, Digits, Emails** β Generally removed unless required by the analysis.")
st.markdown("\u2705 **Preserve Emojis** β Emojis carry sentiment and play a crucial role in NLP tasks.")
st.markdown("\u2705 **Grammar Preservation** β If grammar is needed, avoid removing punctuation.")
st.success("π Well-structured and clean text significantly boosts ML model performance!")
# Header of the "NLP Data Preprocessing" card plus its benefits call-out.
# NOTE(review): this <div> is opened here and closed by a later st.markdown call.
# Streamlit appears to render each call as its own element, so the <div> probably
# does not wrap what follows. Confirm in the running app.
st.markdown("<div class='section'>", unsafe_allow_html=True)
st.markdown("<h2 class='title'>π NLP Data Preprocessing</h2>", unsafe_allow_html=True)
st.markdown("<p class='subtitle'>Transforming raw text into structured data for better ML performance</p>", unsafe_allow_html=True)

# The bullet glyph had been mis-decoded into "β" plus a line break, splitting this
# literal over four lines and leaving it unterminated. It is written as the escape
# \u2705 (check mark; presumed original, confirm) so the literal is valid again.
st.success(
    "π **Benefits of Preprocessing:**\n\n"
    "\u2705 Reduces dimensionality\n\n"
    "\u2705 Improves ML performance\n\n"
    "\u2705 Converts raw text into problem-specific structured data"
)
st.markdown("### β¨ **Essential Preprocessing Steps:**")
# Illustration shown under the "Essential Preprocessing Steps" heading.
# The tag used to read `src="...",width=400`: the comma glued itself to the next
# attribute name, so no `width` attribute existed and the size was never applied.
# Attributes are now whitespace-separated and the value is quoted.
st.markdown(
    """
<div class='image-container'>
<img src="https://cdn-uploads.huggingface.co/production/uploads/66bde9bf3c885d04498227a0/HtdtNm-UJdfN057BeKSgV.png" width="400">
</div>
""",
    unsafe_allow_html=True,
)
# Checklist of pre-processing steps, one Markdown element per item, followed by
# the closing tag of the "section" <div> opened earlier in the script.
# Each item's leading bullet had been mis-decoded into "β" plus a line break,
# which split every literal in two and left it unterminated. The bullet is written
# as the escape \u2705 (check mark; presumed original, confirm) so each call is a
# valid single-line literal again.
# NOTE(review): the dash after each bold title is also mis-decoded; left as found.
st.markdown("\u2705 **Converting Text Case** β Reduces dimensionality; case conversion depends on problem statement.")
st.markdown("\u2705 **Removing URLs, Tags, and Mentions** β Retain only if required by the problem statement.")
st.markdown("\u2705 **Handling Emojis** β Preserve or convert emoji data based on context.")
st.markdown("\u2705 **Expanding Contractions & Acronyms** β Convert abbreviations into standard text.")
st.markdown("\u2705 **Stop Words Removal** β Optional, useful for text simplification.")
st.markdown("\u2705 **Stemming & Lemmatization** β Perform only if grammar is **not** crucial for analysis.")
st.markdown("</div>", unsafe_allow_html=True)
# "Stemming & Lemmatization" introduction: a heading followed by an info box that
# explains prefix/word/suffix, root words, inflected words and the two reduction
# techniques. Both fragments are raw HTML and are rendered in this order.
_inflection_heading = "<h1 class='header-title'>π Stemming & Lemmatization π¬</h1>"
_inflection_overview = """
<div class='info-box'>
<p>π In English, words are often made up of three components:</p>
<ul>
<li>πΉ <span class='highlight'>Prefix</span> + <span class='highlight'>Word</span> + <span class='highlight'>Suffix</span></li>
</ul>
<p>β
Words without a suffix are called <span class='highlight'>Root Words</span>.</p>
<p>β
If a suffix is added to a root word, the resulting word is an <span class='highlight'>Inflected Word</span>:</p>
<ul>
<li>π οΈ <span class='highlight'>Root Word</span> + <span class='highlight'>Suffix</span> = Inflected Word</li>
</ul>
<p>π¬ The process of removing the suffix from inflected words to get the root word is known as:</p>
<ul>
<li>βοΈ <span class='highlight'>Stemming</span></li>
<li>π§ <span class='highlight'>Lemmatization</span></li>
</ul>
</div>
"""
for _fragment in (_inflection_heading, _inflection_overview):
    st.markdown(_fragment, unsafe_allow_html=True)
# "Stemming" heading and its summary card.
# The card is one raw HTML block, and Markdown emphasis is not interpreted inside
# such a block, so the former **...** markers showed up as literal asterisks.
# They are now <strong> elements, which the HTML renderer does honour.
# NOTE(review): the odd leading glyphs are mis-decoded emoji, left as found.
st.markdown("<h1 class='header-title'>πΏ Stemming π</h1>", unsafe_allow_html=True)
st.markdown(
    """
<div class='info-box'>
<p>π <span class='highlight'>Stemming</span> is the process of reducing an <strong>inflected word</strong> to its root form, known as the <span class='highlight'>stem</span>.</p>
<ul>
<li>πΉ <span class='highlight'>Inflected word β Root word (Stem)</span></li>
<li>β‘ The <strong>stem may not always be a valid English word</strong>.</li>
<li>π <span class='highlight'>Performance is faster</span> compared to lemmatization.</li>
<li>β‘ It is used only for <strong>Removal</strong>.</li>
<li>πΉ Whenever we need <strong>Retrieval system</strong> we use stemming</li>
</ul>
</div>
""",
    unsafe_allow_html=True,
)
# "Types of Stemming" heading followed by a Markdown bullet list naming the three
# stemmers covered below. The Snowball entry used to read "Rule-base"; it now
# reads "Rule-based", matching the wording of the Porter entry above it.
# NOTE(review): the odd glyphs are mis-decoded emoji, left as found.
st.markdown("<h2 class='sub-header'>π Types of Stemming</h2>", unsafe_allow_html=True)
st.markdown(
    """
- There are **three** major types of stemming techniques:
- πΉ **Porter Stemmer** ποΈ (Rule-based, works in 5 stages)
- πΉ **Snowball Stemmer** βοΈ (Rule-based, Language adaptable)
- πΉ **Lancaster Stemmer** π (Iterative, aggressive removal)
"""
)
# One card per stemmer (Porter, Snowball, Lancaster). Each entry pairs the
# sub-header HTML with the info-box HTML that follows it; the loop renders them in
# the listed order, heading first, both as raw HTML.
_stemmer_cards = (
    (
        "<h2 class='sub-header'>ποΈ Porter Stemmer</h2>",
        """
<div class='info-box'>
<ul>
<li>πΉ A Rule-based Algorithm for stemming.</li>
<li>πΉ It takes a particular word which have some rule.</li>
<li>πΉ For a particular rule it'll going on removing suffix till it reaches 5th stage until the inflection is removed.</li>
<li>πΉ Works only for the English language.</li>
</ul>
</div>
""",
    ),
    (
        "<h2 class='sub-header'>βοΈ Snowball Stemmer</h2>",
        """
<div class='info-box'>
<ul>
<li>πΉ An advanced version of the Porter Stemmer.</li>
<li>πΉ Can be applied to multiple languages.</li>
</ul>
</div>
""",
    ),
    (
        "<h2 class='sub-header'>π Lancaster Stemmer</h2>",
        """
<div class='info-box'>
<ul>
<li>πΉ An Iterative Algorithm for stemming.</li>
<li>πΉ Removes suffixes in multiple iterations.</li>
<li>β οΈ More aggressive removal, which might result in non-English words.</li>
</ul>
</div>
""",
    ),
)
for _card_heading, _card_details in _stemmer_cards:
    st.markdown(_card_heading, unsafe_allow_html=True)
    st.markdown(_card_details, unsafe_allow_html=True)
# "Lemmatization" heading followed by its summary card (definition, lemma vs.
# inflected word, speed relative to stemming, when to use it). Both pieces are raw
# HTML, rendered heading first.
_lemmatization_title = "<h1 class='header-title'>π Lemmatization π</h1>"
_lemmatization_summary = """
<div class='info-box'>
<p>π <span class='highlight'>Lemmatization</span> is the process of reducing an inflected word to its root form, known as the <span class='highlight'>lemma</span>.</p>
<ul>
<li>πΉ <span class='highlight'>Inflected word β Root word (Lemma)</span></li>
<li>β
The lemma is always an actual English word.</li>
<li>π’ <span class='highlight'>Performance is slower</span> than stemming.</li>
<li>π Both removal & dictionary-based checking are performed.</li>
<li>π Used when we need to preserve grammar in text.</li>
</ul>
</div>
"""
st.markdown(_lemmatization_title, unsafe_allow_html=True)
st.markdown(_lemmatization_summary, unsafe_allow_html=True)
# "WordNet Lemmatizer" heading and its card.
# The two-item sub-list used to sit directly inside the outer <ul>, which is not
# valid list markup (a <ul> may only contain <li> children). It now lives inside
# the "Iteratively removes suffixes & checks:" item it elaborates on.
# NOTE(review): the odd leading glyphs are mis-decoded emoji, left as found.
st.markdown("<h2 class='sub-header'>π WordNet Lemmatizer</h2>", unsafe_allow_html=True)
st.markdown(
    """
<div class='info-box'>
<ul>
<li>πΉ Takes an inflected word as input.</li>
<li>ποΈ Searches in a huge dictionary (WordNet) containing millions of English words.</li>
<li>π Iteratively removes suffixes & checks:
<ul>
<li>βοΈ If it's an actual English word, it continues removing more suffixes.</li>
<li>β If it's not an English word, the last valid root word is returned as the lemma.</li>
</ul>
</li>
</ul>
</div>
""",
    unsafe_allow_html=True,
)
# Reference implementation shown to the reader as a code listing; it is displayed,
# not executed, by this page.
# Fixes relative to the previous listing:
#   * The outer literal is now raw (r'''...'''). Before, Python consumed the
#     backslashes in the punctuation pattern, so the displayed regex was broken.
#   * The snippet imports `re`, `string`, `emoji` and `contractions`, which it uses.
#   * Indentation is restored so the listing is valid Python.
#   * The "digits" step is gated on `digits` (it was gated on `mentions`).
#   * Dates are removed before bare digits, and the date patterns are no longer
#     anchored to the whole string, so dates embedded in a sentence are matched.
#   * URLs match http and https, case-insensitively (case conversion runs first).
#   * Regex patterns are raw strings; the punctuation class is built from
#     `string.punctuation`, which is the same character set as before.
# NOTE(review): `stopwordss`, `inflection` and `stemmer` are accepted and the
# stop-word list / stemmers / lemmatizer are created, but the listing never
# applies them. Complete or drop that part.
st.code(r'''
import re
import string

import contractions
import emoji
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize


def pre_process(data, col, case="lower", tags=True, url=True, mail=True,
                mentions=True, digits=True, dates=True, emojis=True,
                contraction=True, stopwordss=True, inflection="stem",
                stemmer="porter", punc=True):
    """Clean the text column `col` of DataFrame `data` in place and return `data`."""
    stp = stopwords.words("english")
    stp.remove("not")
    ps = PorterStemmer()
    ls = LancasterStemmer()
    sb = SnowballStemmer(language="english")
    wl = WordNetLemmatizer()
    ## emoji
    if emojis:
        data[col] = data[col].apply(lambda x: emoji.demojize(x, delimiters=("", "")))
    ## case
    if case == "lower":
        data[col] = data[col].str.lower()
    elif case == "upper":
        data[col] = data[col].str.upper()
    ## tags
    if tags:
        data[col] = data[col].apply(lambda x: re.sub(r"<.*?>", " ", x))
    ## urls
    if url:
        data[col] = data[col].apply(lambda x: re.sub(r"https?://\S+", " ", x, flags=re.IGNORECASE))
    ## mails
    if mail:
        data[col] = data[col].apply(lambda x: re.sub(r"\S+@\S+", " ", x))
    ## mentions
    if mentions:
        data[col] = data[col].apply(lambda x: re.sub(r"\B[@#]\S+", " ", x))
    ## dates (must run before digits are stripped)
    if dates:
        data[col] = data[col].apply(lambda x: re.sub(r"\b[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}\b", " ", x))
        data[col] = data[col].apply(lambda x: re.sub(r"\b[0-9]{4}/[0-9]{1,2}/[0-9]{1,2}\b", " ", x))
    ## digits
    if digits:
        data[col] = data[col].apply(lambda x: re.sub(r"\d", " ", x))
    ## contractions
    if contraction:
        data[col] = data[col].apply(lambda x: contractions.fix(x))
    ## punctuations
    if punc:
        data[col] = data[col].apply(lambda x: re.sub(f"[{re.escape(string.punctuation)}]", " ", x))
    return data
''')
# Closing remarks under the code listing, rendered as a two-item Markdown list.
# The joined text starts and ends with a newline, like a triple-quoted literal.
_takeaways = (
    "- It'll give the pre-processed text data",
    "- We'll get the clean processed data on which we can perform feature engineering",
)
st.markdown("\n" + "\n".join(_takeaways) + "\n")
|