| import streamlit as st |
|
|
# Inject global CSS once at startup: gradient app background, left-aligned
# content container, and Arial typography with custom colors for the main
# header (h1), subheaders (h2-h4), and paragraph text.
# NOTE(review): unsafe_allow_html=True is required for <style> injection;
# the CSS below is static, so this is safe (no user input interpolated).
st.markdown(
    """
    <style>
    /* App Background */
    .stApp {
        background: linear-gradient(to right , #00BFFF, #00DED1 ,#DAA520); /* Gradient dark professional background */
        color: #ffffff;
        padding: 20px;
    }
    /* Align content to the left */
    .block-container {
        text-align: left; /* Left align for content */
        padding: 2rem; /* Padding for aesthetics */
    }

    /* Header and Subheader Text */
    h1 {
        color: #FF69B4 !important; /* Custom styling for the main header */
        font-family: 'Arial', sans-serif !important;
        font-weight: bold !important;
        text-align: center;
    }
    h2, h3, h4 {
        color: #4BDD82 !important; /* Custom styling for subheaders */
        font-family: 'Arial', sans-serif !important;
        font-weight: bold !important;
    }
    /* Paragraph Text */
    p {
        color: #2F4F4F !important; /* Custom styling for paragraphs */
        font-family: 'Arial', sans-serif !important;
        line-height: 1.6;
    }
    </style>
    """,
    unsafe_allow_html=True
)
|
|
# First-run default: land on the "home" page until the user navigates away.
# st.session_state supports dict-style setdefault, which is a no-op on reruns.
st.session_state.setdefault("page", "home")
|
|
| |
if st.session_state.page == "home":
    # Landing page: overview of the three NLP steps plus navigation buttons.
    st.markdown(
        """
        <h1><center>Important Steps in NLP Project</center></h1>
        <p>We have learnt about first two steps of NLP project i.e Problem Statement and Data Collection in our <b>ZERO TO HERO IN ML</b> app.So in this page we will learn about three main steps of NLP Project.They are..</p>
        <ul>
        <li>Simple EDA of Text</li>
        <li>Pre-Processing of Text</li>
        <li>Feature Engineering of Text</li>
        </ul>
        <p><b>NOTE:</b>These three steps are explained in view of NLP project which is mainly related to Text and only applicable for Text data.So don't get confused between roadmap of ML project and NLP project.</p>
        """,
        unsafe_allow_html=True
    )

    # Navigation. At most one button can return True per script run, so
    # independent `if`s are equivalent to the original elif chain. The
    # st.rerun() after updating the page state makes the target page render
    # immediately; without it the home content and the target page were both
    # drawn on the click's run, and the switch only looked complete on the
    # *next* interaction.
    if st.button("Simple EDA"):
        st.session_state.page = "simple_eda"
        st.rerun()
    if st.button("Pre-Processing"):
        st.session_state.page = "pre_processing"
        st.rerun()
    if st.button("Feature Engineering"):
        st.session_state.page = "feature_engineering"
        st.rerun()
|
|
| |
if st.session_state.page == "simple_eda":
    # Page: Simple EDA of text — motivation plus nine quick diagnostic checks.
    # All st.code bodies that contain backslashes are raw strings (r"""),
    # otherwise sequences like \B, \d, \S are invalid escapes (SyntaxWarning
    # on Python >= 3.12) and the *displayed* snippet is silently corrupted.
    st.markdown(
        """
        <h2>Simple EDA of Text Data</h2>
        <p>This step helps assess the quality and structure of the collected text data.The collected data is often raw, so simple EDA helps identify and address inconsistencies or irrelevant elements.</p>
        <h3>Introduction to Simple EDA (Exploratory Data Analysis) for Text Data</h3>
        <p>EDA (Exploratory Data Analysis) is the process of examining, visualizing, and summarizing a dataset to understand its structure, patterns, and anomalies. When it comes to text data, EDA becomes even more crucial because raw text often contains inconsistencies, hidden patterns, and noise that can significantly impact the success of downstream tasks, such as machine learning or natural language processing (NLP) models.Unlike structured numerical data, text data is unstructured and highly varied. Therefore, EDA helps uncover the nuances in text, providing insights that guide preprocessing, feature engineering, and modeling decisions.</p>
        <h3>Why is Simple EDA Important for Text Data?</h3>
        <p><b>1.Understanding the Data's Quality:</b>Text data is often messy. Simple EDA helps detect issues like inconsistent casing, presence of HTML tags, special characters, URLs, or emojis that could affect preprocessing steps.</p>
        <p><b>2.Unveiling Patterns and Characteristics:</b>EDA reveals the structure of the data, such as word distributions, sentence lengths, and frequent terms.</p>
        <p><b>3.Identifying Outliers and Noise:</b>Outliers in text could be irrelevant rows, such as boilerplate messages, or entries in foreign languages.</p>
        <p><b>4.Detecting Language or Context-Specific Features:</b>EDA helps identify elements that are unique to the dataset's language or domain, such as slang, technical terms, or idioms.</p>
        <p><b>5.Saves Time and Improves Model Accuracy:</b>Proper EDA reduces the risk of overlooking important details in the text, ensuring preprocessing and modeling are efficient and effective</p>
        <h4>Some important steps in Simple EDA of text data</h4>
        <h5>Step-1:Case format (lower/upper/mixed)</h5>
        <p>Check whether the text data is in lower or upper cases or combination of lower/upper cases</p>
        """,
        unsafe_allow_html=True)
    st.code(
        """
        data = pd.DataFrame({"Review":["I enjoy programming with Python","I dislike debugging errors","Learning AI is exciting"]})
        data["Review"].apply(lambda x: True if((x.islower()) or (x.isupper())) else False) # False means combination of upper and lower
        """, language="python")
    st.markdown("""
        <h5>Step-2:Presence of HTML or XML tags.</h5>
        <p>Check whether the text data contains any tags.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        """
        data = pd.DataFrame({"Review":["I <b>enjoy</b> programming","Debugging <i>errors</i> is tedious","Learning <a href='AI'>AI</a> is exciting"]})
        data["Review"].apply(lambda x : True if re.search("<.*?>",x) else False)
        """, language="python")
    st.markdown("""
        <h5>Step-3:Presence of Mentions (@, #).</h5>
        <p>Check whether the text data contains any mentions.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        data = pd.DataFrame({"Review":["@John loves Python","Debugging #errors is tedious","Learning AI is exciting"]})
        data["Review"].apply(lambda x : True if re.search(r"\B[@#]\S+",x) else False)
        """, language="python")
    st.markdown("""
        <h5>Step-4:Presence of Numeric data (digits).</h5>
        <p>Check whether the text data contains any Numeric data (digits).</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        data = pd.DataFrame({"Review":["Python 3.10 is amazing","AI has grown exponentially in 2023","Learn programming in 5 steps"]})
        data["Review"].apply(lambda x : True if re.search(r"\d+",x) else False)
        """, language="python")
    st.markdown("""
        <h5>Step-5:Presence of URLs (e.g., https://).</h5>
        <p>Check whether the text data contains any URLs (e.g., https://).</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        data = pd.DataFrame({"Review":["Check out <a href='https://python.org'>Python</a>","Debugging is tedious","Visit https://www.github.com/ for repositories"]})
        data["Review"].apply(lambda x: True if re.search(r"https?://\S+",x) else False)
        """, language="python")
    st.markdown("""
        <h5>Step-6:Presence of Punctuation and special characters.</h5>
        <p>Check whether the text data contains any Punctuation and special characters.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        data = pd.DataFrame({"Review":["Coding is fun!","Why is AI so popular?","Debugging #errors is tedious"]})
        data["Review"].apply(lambda x : True if re.search('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]',x) else False)
        """, language="python")
    st.markdown("""
        <h5>Step-7:Presence of Emojis.</h5>
        <p>Check whether the text data contains any Emojis.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        """
        import emoji
        data = pd.DataFrame({"Review":["I ❤️ programming","Debugging is tedious 😓","Learning AI is fun!"]})
        data["Review"].apply(lambda x : True if emoji.emoji_count(x) else False)
        """, language="python")
    st.markdown("""
        <h5>Step-8:Presence of Date and time information.</h5>
        <p>Check whether the text data contains any Date and time information.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        data1 = pd.DataFrame({"Review":["Coding started on 2025-01-21","Learn AI in 12/05/2023","No dates here"]})
        data1["Review"].apply(lambda x : True if (re.search(r"\b[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{4}",x)) or (re.search(r"\b[0-9]{4}\/[0-9]{1,2}\/[0-9]{1,2}",x)) else False)
        """, language="python")
    st.markdown("""
        <h5>Step-9:Presence of Emails.</h5>
        <p>Check whether the text data contains any emails.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        data = pd.DataFrame({"Review":["Contact us at support@python.org","Send queries to info@ai.com","I love coding"]})
        data["Review"].apply(lambda x : True if re.search(r"\S+@\S+",x) else False)
        """, language="python")

    st.markdown("""
        <p>Simple EDA for text data is the foundation of any NLP project. It not only provides a deep understanding of the dataset but also helps design a tailored pipeline for preprocessing and feature extraction. Skipping EDA can result in wasted time, poor model performance, and missed insights, making it an indispensable step in text analytics.</p>
        """, unsafe_allow_html=True)

    # st.rerun() so the home page renders immediately on click instead of
    # after the next interaction.
    if st.button("Back"):
        st.session_state.page = "home"
        st.rerun()
|
|
| |
if st.session_state.page == "pre_processing":
    # Page: text pre-processing — motivation plus seven cleaning steps.
    # st.code bodies that contain backslashes are raw strings (r"""),
    # otherwise sequences like \S, \d, \b are invalid/interpreted escapes
    # (e.g. \b becomes a backspace character) and the displayed snippets
    # are silently corrupted.
    st.markdown(
        """
        <h2>Pre-Processing of Text Data</h2>
        <p>Pre-processing ensures data is clean and ready for modeling.</p>
        <h3>Introduction to Preprocessing for Text Data</h3>
        <p>Preprocessing in Natural Language Processing (NLP) involves transforming raw, unstructured text data into a clean and structured format suitable for analysis and modeling. Raw text data often contains noise, inconsistencies, and elements irrelevant to the task at hand. Preprocessing helps standardize this data, remove unwanted information, and extract meaningful patterns that models can effectively utilize.</p>
        <p>Text data is highly unstructured and complex because it contains varying formats, encodings, and linguistic elements like slang, abbreviations, and emojis. Without preprocessing, downstream tasks such as classification, sentiment analysis, or text generation can produce inaccurate results.</p>
        <h3>Why is Preprocessing Important?</h3>
        <p><b>1.Enhances Model Accuracy and Performance:</b>Cleaned and standardized data allows machine learning and NLP models to focus on meaningful patterns.</p>
        <p><b>2.Reduces Noise in Data:</b>Preprocessing eliminates unnecessary elements like special characters, URLs, and stopwords that can obscure the real signal in the text.</p>
        <p><b>3.Standardizes Text Representation:</b>Text preprocessing converts data into a uniform format (e.g., lowercase conversion, removing extra spaces), enabling consistent feature extraction.</p>
        <p><b>4.Improves Efficiency:</b>By reducing dimensionality (e.g., removing rare words or emojis), preprocessing makes computations faster and more memory-efficient.</p>
        <p><b>5.Removes Irrelevant Information:</b>Preprocessing filters out domain-specific noise (e.g., HTML tags in scraped web data) that isn’t relevant to the task.</p>
        <p><b>6.Facilitates Linguistic Normalization:</b>Techniques like stemming and lemmatization reduce words to their base forms, improving generalization in downstream models.</p>
        <p><b>7.Handles Non-Textual Features in Text:</b>Text data often contains dates, numbers, emojis, and special symbols that need appropriate handling.</p>
        <p><b>8.Improves Interpretability of the Data:</b>Preprocessed data is easier to visualize and understand, aiding exploratory data analysis and feature engineering.</p>
        <h4>Common Steps in Text Preprocessing</h4>
        <h5>Step-1:Case Normalization:</h5>
        <p>Convert all text to lowercase to ensure consistency (e.g., "Data" and "data" are treated equally).</p>
        """,
        unsafe_allow_html=True)
    st.code(
        """
        data = pd.DataFrame({"Review":["I love Hyderabad Biryani","I hate other places Biryani","I like Cooking process"]})
        data["Review"].str.lower()
        data["Review"].str.upper()
        """, language="python")
    st.markdown("""
        <h5>Step-2:Removing Noise</h5>
        <p>Eliminate special characters, HTML/XML tags, and unwanted symbols.Remove URLs, email addresses, hashtags, and mentions.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        # Removing tags
        data1 = pd.DataFrame({"Review":["I love Hyderabad<p> Biryani","I hate other places<br>Biryani","I like Cooking process"]})
        data1["Review"].apply(lambda x:re.sub("<.*?>"," ",x))
        # Removing URLs
        data2 = pd.DataFrame({"Review":["I <p>love</p> Hyderabad Biryani","I <h1>hate other places Biryani</h1>","https://www.google.com/","I like Cooking process"]})
        data2["Review"].apply(lambda x:re.sub(r"https?://\S+"," ",x))
        # Removing Emails
        data3 = pd.DataFrame({"Review":["I love Hyderabad Biryani","saniya@gmail.com dfghj","I hate other places Biryani","htyuiokj manny@outlook.com","I like Cooking process"]})
        data3["Review"].apply(lambda x : re.sub(r"\S+@\S+"," ",x))
        # Removing Hashtags and Mentions
        data4 = pd.DataFrame({"Review":["I love @Hyderabad Biryani","I hate other #places @Biryani","I like Cooking process"]})
        data4["Review"].apply(lambda x :re.sub(r"\B[@#]\S+"," ",x))
        """, language="python")
    st.markdown("""
        <h5>Step-3:Emoji Handling</h5>
        <p>Convert emojis to descriptive text or remove them based on the context.
        Example: "😊" can be replaced with "happy" or ignored if irrelevant.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        """
        data5 = pd.DataFrame({"Review":["I ❤️ Hyderabad Biryani","I 😒 other places Biryani","I like Cooking process"]})
        data5["Review"].apply(lambda x:emoji.demojize(x,delimiters=(" "," ")))
        """, language="python")
    st.markdown("""
        <h5>Step-4:Removing Stopwords</h5>
        <p>Stopwords are common words like "and," "the," or "is" that don’t add value in many NLP tasks.
        Example: Removing "the" ensures the focus remains on content words like "machine learning.".</p>
        """,
        unsafe_allow_html=True)
    st.code(
        """
        from nltk.corpus import stopwords
        stp = stopwords.words("english")
        from nltk.tokenize import sent_tokenize,word_tokenize
        dataa = pd.DataFrame({"Review":["I love Hyderabad Biryani","I hate other places Biryani","I like Cooking process"]})

        if stopwordss == True:
            l2 = []
            for doc in data[column]:
                l1 = []
                for word in word_tokenize(doc):
                    if word not in stp:
                        l1.append(word)
                    else :
                        pass
                l2.append([" ".join(l1)])
            data[column] = pd.DataFrame(l2)
        else :
            pass
        if inflac == "stem":
            l3 = []
            for doc in data[column]:
                l4 = []
                for word in word_tokenize(doc):
                    if stemm == "porter":
                        l4.append(ps.stem(word))
                    elif stemm == "snow":
                        l4.append(ss.stem(word))
                    elif stemm == "lanc":
                        l4.append(ls.stem(word))
                l3.append([" ".join(l4)])
            data[column] = pd.DataFrame(l3)

        elif inflac == "lemma":
            l5 = []
            for doc in data[column]:
                l6 = []
                for word in word_tokenize(doc):
                    l6.append(wl.lemmatize(word))
                l5.append([" ".join(l6)])
            data[column] = pd.DataFrame(l5)
        """, language="python")
    st.markdown("""
        <h5>Step-5:Removing Punctuation and Digits</h5>
        <p>Strips symbols and numbers if they aren’t meaningful for the task.
        Example: Removing "123!" ensures models don’t overfit on numbers or punctuation marks.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        # Removing Punctuations
        data = pd.DataFrame({"Review":["I love Hyderabadi @Biryani","I hate other pla#ces Biryani","I like Co.oking process"]})
        if punc == True:
            data[column] = data[column].apply(lambda x:re.sub('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]'," ",x))
        else :
            pass
        # Removing Digits
        data = pd.DataFrame({"Review":["I 143 Hyderabadi Biryani","I hate other places Biryani","I like 567Cooking process"]})
        if digits == True:
            data[column] = data[column].apply(lambda x :re.sub(r"\d","",x))
        else :
            pass
        """, language="python")
    st.markdown("""
        <h5>Step-6:Fixing Contractions</h5>
        <p>Fixing contractions for neat and clear understanding for machine</p>
        """,
        unsafe_allow_html=True)
    st.code(
        """
        import contractions
        df = pd.DataFrame({"Review":["I love Hyderabadi Biryani","You're a bad person","I can't learn process"]})
        df["Review"].apply(lambda x:contractions.fix(x))
        """, language="python")

    st.markdown("""
        <h5>Step-7:Handling Dates and Times</h5>
        <p>Extract or format dates into standard representations.</p>
        """,
        unsafe_allow_html=True)
    st.code(
        r"""
        data = pd.DataFrame({"Review":["I love Hyderabad Biryani","I hate other places 15/08/2002 Biryani","I like Cooking 2002/08/15 process"]})
        if dates == True:
            data[column] = data[column].apply(lambda x : re.sub(r"\b[0-9]{1,2}\/[0-9]{1,2}\/[0-9]{4}\b"," ",x))
            data[column] = data[column].apply(lambda x : re.sub(r"\b[0-9]{4}\/[0-9]{1,2}\/[0-9]{1,2}\b"," ",x))
        else :
            pass
        """, language="python")

    # st.rerun() so the home page renders immediately on click instead of
    # after the next interaction.
    if st.button("Back"):
        st.session_state.page = "home"
        st.rerun()
|
|
| |
if st.session_state.page == "feature_engineering":
    # Page: Feature Engineering — placeholder content for now.
    st.markdown(
        """
        <h2>Feature Engineering of Text Data</h2>
        <p>Content for Feature Engineering step goes here.</p>
        """,
        unsafe_allow_html=True
    )
    # st.rerun() so the home page renders immediately on click instead of
    # after the next interaction.
    if st.button("Back"):
        st.session_state.page = "home"
        st.rerun()