Update pages/7_Advance_vectorization_techniques.py
Browse files
pages/7_Advance_vectorization_techniques.py
CHANGED
|
@@ -278,29 +278,94 @@ if file_type == "Word2Vec":
|
|
| 278 |
|
| 279 |
st.markdown(
|
| 280 |
"""
|
| 281 |
-
<div class='formula'>
|
| 282 |
<strong>Final Weighted Representation:</strong>
|
| 283 |
<pre style="background-color:#F7F7F7; padding: 10px; border-radius: 5px;">
|
| 284 |
v_final = (TF-IDF(w1) * v1 + TF-IDF(w2) * v2 + TF-IDF(w3) * v3)
|
| 285 |
/ (TF-IDF(w1) + TF-IDF(w2) + TF-IDF(w3))
|
| 286 |
</pre>
|
| 287 |
-
</div>
|
| 288 |
""",
|
| 289 |
unsafe_allow_html=True,
|
| 290 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
|
|
|
|
| 292 |
st.markdown(
|
| 293 |
"""
|
| 294 |
<div class='box'>
|
| 295 |
-
<h3 style='color: #6A0572;'>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 296 |
<ul>
|
| 297 |
-
<li><
|
| 298 |
-
<li
|
| 299 |
-
<li>Improves the <strong>semantic representation</strong> of text</li>
|
| 300 |
</ul>
|
| 301 |
-
|
| 302 |
""",
|
| 303 |
unsafe_allow_html=True,
|
| 304 |
-
)
|
| 305 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 306 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 278 |
|
| 279 |
st.markdown(
|
| 280 |
"""
|
|
|
|
| 281 |
<strong>Final Weighted Representation:</strong>
|
| 282 |
<pre style="background-color:#F7F7F7; padding: 10px; border-radius: 5px;">
|
| 283 |
v_final = (TF-IDF(w1) * v1 + TF-IDF(w2) * v2 + TF-IDF(w3) * v3)
|
| 284 |
/ (TF-IDF(w1) + TF-IDF(w2) + TF-IDF(w3))
|
| 285 |
</pre>
|
|
|
|
| 286 |
""",
|
| 287 |
unsafe_allow_html=True,
|
| 288 |
)
|
| 289 |
+
st.subheader("How to train our own W2V model")
|
| 290 |
+
st.markdown('''
|
| 291 |
+
- At training time, the corpus is passed to the W2V algorithm, which can be implemented using 2 techniques
|
| 292 |
+
- They are:
|
| 293 |
+
- Skip-gram
|
| 294 |
+
- CBOW
|
| 295 |
+
''')
|
| 296 |
|
| 297 |
+
st.subheader(":red[CBOW]")
|
| 298 |
st.markdown(
|
| 299 |
"""
|
| 300 |
<div class='box'>
|
| 301 |
+
<h3 style='color: #6A0572;'>What is CBOW?</h3>
|
| 302 |
+
<p><strong>CBOW (Continuous Bag of Words)</strong> is a technique where we use surrounding words (context) to predict the target word (focus word).</p>
|
| 303 |
+
</div>
|
| 304 |
+
""",
|
| 305 |
+
unsafe_allow_html=True,
|
| 306 |
+
)
|
| 307 |
+
st.markdown(
|
| 308 |
+
"""
|
| 309 |
+
<h3 style='color: #6A0572;'>Example Corpus</h3>
|
| 310 |
<ul>
|
| 311 |
+
<li><strong>d1:</strong> w1, w2, w3, w4, w5, w4</li>
|
| 312 |
+
<li><strong>d2:</strong> w3, w4, w5, w2, w1, w2, w3, w4</li>
|
|
|
|
| 313 |
</ul>
|
| 314 |
+
<p>We first preprocess the data to extract meaningful relationships.</p>
|
| 315 |
""",
|
| 316 |
unsafe_allow_html=True,
|
| 317 |
+
)
|
| 318 |
+
|
| 319 |
+
st.markdown(
|
| 320 |
+
"""
|
| 321 |
+
<h3 style='color: #6A0572;'>Steps to Process the Data</h3>
|
| 322 |
+
<ul>
|
| 323 |
+
<li>Create a <span class='highlight'>vocabulary</span> from the entire corpus: <pre style="background-color:#F7F7F7; padding: 10px; border-radius: 5px;">{w1, w2, w3, w4, w5}</pre></li>
|
| 324 |
+
<li>Generate a <strong>tabular dataset</strong> with:
|
| 325 |
+
<ul>
|
| 326 |
+
<li><strong>Feature variables (Context Words)</strong></li>
|
| 327 |
+
<li><strong>Class variables (Target Words)</strong></li>
|
| 328 |
+
</ul>
|
| 329 |
+
</li>
|
| 330 |
+
<li>Apply a <span class='highlight'>window size</span> of 2 (how many neighbors we consider).</li>
|
| 331 |
+
<li>Slide the window over the text with <span class='highlight'>stride = 1</span>.</li>
|
| 332 |
+
</ul>
|
| 333 |
+
""",
|
| 334 |
+
unsafe_allow_html=True,
|
| 335 |
+
)
|
| 336 |
|
| 337 |
+
st.markdown(
|
| 338 |
+
"""
|
| 339 |
+
<h3 style='color: #6A0572;'> Handling Variable Context Length</h3>
|
| 340 |
+
<ul>
|
| 341 |
+
<li>To ensure a consistent feature length, we use <strong>zero-padding</strong> when needed.</li>
|
| 342 |
+
<li>The model tries to understand relationships based on the surrounding <span class='highlight'>context words</span>.</li>
|
| 343 |
+
</ul>
|
| 344 |
+
""",
|
| 345 |
+
unsafe_allow_html=True,
|
| 346 |
+
)
|
| 347 |
+
st.markdown(
|
| 348 |
+
"""
|
| 349 |
+
<strong>Mathematical Representation:</strong>
|
| 350 |
+
<pre style="background-color:#F7F7F7; padding: 10px; border-radius: 5px;">
|
| 351 |
+
y = f(xi)
|
| 352 |
+
where,
|
| 353 |
+
y = Focus Word (Target)
|
| 354 |
+
xi = Context Words (Neighbors)
|
| 355 |
+
</pre>
|
| 356 |
+
""",
|
| 357 |
+
unsafe_allow_html=True,
|
| 358 |
+
)
|
| 359 |
+
|
| 360 |
+
st.markdown(
|
| 361 |
+
"""
|
| 362 |
+
<h3 style='color: #6A0572;'> Training with Artificial Neural Networks</h3>
|
| 363 |
+
<p>The tabular data is passed to an <strong>Artificial Neural Network (ANN)</strong> which learns:</p>
|
| 364 |
+
<ul>
|
| 365 |
+
<li>How <span class='highlight'>context words</span> are related to <span class='highlight'>focus words</span>.</li>
|
| 366 |
+
</ul>
|
| 367 |
+
""",
|
| 368 |
+
unsafe_allow_html=True,
|
| 369 |
+
)
|
| 370 |
+
|
| 371 |
+
|