Spaces:
Sleeping
Sleeping
Update pages/steps_of_NLP_project.py
Browse files- pages/steps_of_NLP_project.py +159 -0
pages/steps_of_NLP_project.py
CHANGED
|
@@ -0,0 +1,159 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import re
|
| 4 |
+
|
| 5 |
+
# Custom CSS for a professional UI.
# The stylesheet lives in a named constant so it can be read on its own,
# separately from the call that injects it into the page.
_PAGE_STYLE = """
<style>
/* App Background */
.stApp {
background: linear-gradient(to right, #1e3c72, #2a5298); /* Subtle gradient with cool tones */
color: #f0f0f0;
padding: 20px;
}
/* Align content to the left */
.block-container {
text-align: left;
padding: 2rem;
}

/* Header and Subheader Text */
h1 {
background: linear-gradient(to right, #ff7f50, #ffd700); /* Orange to yellow gradient */
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-family: 'Arial', sans-serif !important;
font-weight: bold !important;
text-align: center;
}
h2, h3, h4, h5, h6 {
background: linear-gradient(to right, #ff7f50, #ffd700); /* Orange to yellow gradient */
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
font-family: 'Arial', sans-serif !important;
font-weight: bold !important;
}
/* Paragraph Text */
p {
color: #f0f0f0 !important; /* Light gray for readability */
font-family: 'Roboto', sans-serif !important;
line-height: 1.6;
font-size: 1.1rem;
}

/* List Styling */
ul li {
color: #f0f0f0;
font-family: 'Roboto', sans-serif;
font-size: 1.1rem;
margin-bottom: 0.5rem;
}
</style>
"""

# The <style> element is raw HTML, so the markdown call must opt in to it.
st.markdown(_PAGE_STYLE, unsafe_allow_html=True)
|
| 56 |
+
|
| 57 |
+
# Navigation state: the key "page" in session state names the view to render.
# Seed it only when absent so an existing selection is left untouched.
if "page" not in st.session_state:
    st.session_state["page"] = "home"  # Default to home
|
| 60 |
+
|
| 61 |
+
# Main page navigation: landing view listing the three sections.
if st.session_state.page == "home":
    st.markdown(
        """
<h1><center>Important Steps in NLP Project</center></h1>
<p>In this section, we will learn about three critical steps in an NLP project:</p>
<ul>
<li>Simple EDA of Text</li>
<li>Pre-Processing of Text</li>
<li>Feature Engineering of Text</li>
</ul>
<p><b>NOTE:</b> These steps are specifically for NLP projects focused on text data, not general ML projects.</p>
""",
        unsafe_allow_html=True,
    )

    # Navigation buttons.
    # Each button is evaluated independently (no if/elif chain), so all three
    # are always drawn. After recording the chosen page we rerun the script
    # immediately; without the rerun the home content above would stay on
    # screen and the selected page would be appended beneath it.
    for button_label, target_page in (
        ("Simple EDA", "simple_eda"),
        ("Pre-Processing", "pre_processing"),
        ("Feature Engineering", "feature_engineering"),
    ):
        if st.button(button_label):
            st.session_state.page = target_page
            st.rerun()
|
| 84 |
+
|
| 85 |
+
# Simple EDA page: explanatory text followed by a code sample for Step-1.
if st.session_state.page == "simple_eda":
    st.markdown(
        """
<h2>Simple EDA of Text Data</h2>
<p>This step helps assess the quality and structure of the collected text data. The data is often raw, so simple EDA helps identify and address inconsistencies or irrelevant elements.</p>
<h3>Introduction to Simple EDA (Exploratory Data Analysis) for Text Data</h3>
<p>EDA is the process of examining, visualizing, and summarizing a dataset to understand its structure, patterns, and anomalies. For text data, EDA is crucial as raw text often contains inconsistencies, hidden patterns, and noise that can impact downstream tasks like NLP models.</p>
<h3>Why is Simple EDA Important for Text Data?</h3>
<p><b>1. Understanding the Data's Quality:</b> Detect issues like inconsistent casing, HTML tags, special characters, URLs, or emojis.</p>
<p><b>2. Unveiling Patterns and Characteristics:</b> EDA reveals the structure of data, such as word distributions, sentence lengths, and frequent terms.</p>
<p><b>3. Identifying Outliers and Noise:</b> Outliers could be irrelevant rows or entries in foreign languages.</p>
<p><b>4. Detecting Language or Context-Specific Features:</b> Identifying elements unique to the dataset's domain (slang, technical terms, idioms).</p>
<p><b>5. Saves Time and Improves Model Accuracy:</b> Proper EDA ensures efficient preprocessing and effective modeling.</p>

<h4>Some Important Steps in Simple EDA of Text Data</h4>
<h5>Step-1: Case Format (Lower/Upper/Mixed)</h5>
<p>Check whether the text data is in lowercase, uppercase, or a combination.</p>
""",
        unsafe_allow_html=True,
    )
    # The sample below is displayed to the reader only; it is not executed.
    st.code(
        """
data = pd.DataFrame({"Review":["I love Hyderabad Biryani","I hate other places Biryani","I like Cooking process"]})
data["Review"].apply(lambda x: True if((x.islower()) or (x.isupper())) else False) # False means combination of upper and lower
""",
        language="python",
    )

    # Other steps (Step-2 to Step-9) here...

    # Return to the landing view. The home branch is evaluated earlier in the
    # script, so the state change only becomes visible after a rerun; trigger
    # it here instead of waiting for a second click. The explicit key keeps
    # this widget distinct from the other pages' identically labelled button.
    if st.button("Back", key="back_from_simple_eda"):
        st.session_state.page = "home"
        st.rerun()
|
| 117 |
+
|
| 118 |
+
# Pre-Processing page: explanatory text followed by a code sample for Step-1.
if st.session_state.page == "pre_processing":
    st.markdown(
        """
<h2>Pre-Processing of Text Data</h2>
<p>Pre-processing ensures the data is clean and ready for modeling.</p>
<h3>Introduction to Preprocessing for Text Data</h3>
<p>Preprocessing in NLP involves transforming raw, unstructured text data into a structured format suitable for analysis and modeling. Raw text data often contains noise and inconsistencies that must be addressed to improve model performance.</p>

<h3>Why is Preprocessing Important?</h3>
<p><b>1. Enhances Model Accuracy:</b> Cleaned and standardized data helps models focus on meaningful patterns.</p>
<p><b>2. Reduces Noise:</b> Preprocessing eliminates unwanted characters, URLs, and stopwords.</p>
<p><b>3. Standardizes Text Representation:</b> Converts text to a uniform format (e.g., lowercase conversion, removing extra spaces).</p>
<p><b>4. Improves Efficiency:</b> Reduces dimensionality and speeds up computations.</p>
<p><b>5. Removes Irrelevant Information:</b> Filters out domain-specific noise (e.g., HTML tags in web-scraped data).</p>

<h4>Common Steps in Text Preprocessing</h4>
<h5>Step-1: Case Normalization</h5>
<p>Convert all text to lowercase for consistency.</p>
""",
        unsafe_allow_html=True,
    )
    # The sample below is displayed to the reader only; it is not executed.
    st.code(
        """
data = pd.DataFrame({"Review":["I love Hyderabad Biryani","I hate other places Biryani","I like Cooking process"]})
data["Review"].str.lower()
data["Review"].str.upper()
""",
        language="python",
    )

    # Other steps (Step-2 to Step-6) here...

    # Return to the landing view. The home branch is evaluated earlier in the
    # script, so the state change only becomes visible after a rerun; trigger
    # it here instead of waiting for a second click. The explicit key keeps
    # this widget distinct from the other pages' identically labelled button.
    if st.button("Back", key="back_from_pre_processing"):
        st.session_state.page = "home"
        st.rerun()
|
| 152 |
+
|
| 153 |
+
# Feature Engineering page (To be implemented): placeholder heading and text.
if st.session_state.page == "feature_engineering":
    st.markdown("<h2>Feature Engineering of Text Data</h2>", unsafe_allow_html=True)
    st.markdown("<p>Details on Feature Engineering will be here.</p>", unsafe_allow_html=True)

    # Return to the landing view. The home branch is evaluated earlier in the
    # script, so the state change only becomes visible after a rerun; trigger
    # it here instead of waiting for a second click. The explicit key keeps
    # this widget distinct from the other pages' identically labelled button.
    if st.button("Back", key="back_from_feature_engineering"):
        st.session_state.page = "home"
        st.rerun()
|