From_Zero_to_ML_Hero / pages /6_semi_structured_data.py
DOMMETI's picture
Update pages/6_semi_structured_data.py
3cef663 verified
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import requests
# Page-wide stylesheet injected through st.markdown. The CSS text is kept
# verbatim (unindented) apart from the checkmark bullet, whose emoji had been
# corrupted by a wrong-encoding round trip and is restored to U+2714 + VS16.
PAGE_CSS = """
<style>
/* Set a soft background color */
body {
background-color: #eef2f7;
}
/* Style for main title */
h1 {
color: #00FFFF;
font-family: 'Roboto', sans-serif;
font-weight: 700;
text-align: center;
margin-bottom: 25px;
}
/* Style for headers */
h2 {
color: #FFFACD;
font-family: 'Roboto', sans-serif;
font-weight: 600;
margin-top: 30px;
}
/* Style for subheaders */
h3 {
color: #ba95b0;
font-family: 'Roboto', sans-serif;
font-weight: 500;
margin-top: 20px;
}
.custom-subheader {
color: #00FFFF;
font-family: 'Roboto', sans-serif;
font-weight: 600;
margin-bottom: 15px;
}
/* Paragraph styling */
p {
font-family: 'Georgia', serif;
line-height: 1.8;
color: #FFFFFF;
margin-bottom: 20px;
}
/* List styling with checkmark bullets */
.icon-bullet {
list-style-type: none;
padding-left: 20px;
}
.icon-bullet li {
font-family: 'Georgia', serif;
font-size: 1.1em;
margin-bottom: 10px;
color: #FFFFF0;
}
.icon-bullet li::before {
content: "✔️";
padding-right: 10px;
color: #b3b3ff;
}
/* Sidebar styling */
.sidebar .sidebar-content {
background-color: #ffffff;
border-radius: 10px;
padding: 15px;
}
.sidebar h2 {
color: #495057;
}
/* Custom button style */
.streamlit-button {
background-color: #00FFFF;
color: #000000;
font-weight: bold;
}
</style>
"""

# unsafe_allow_html is required so the <style> element is emitted as HTML
# instead of being escaped and shown as text.
st.markdown(PAGE_CSS, unsafe_allow_html=True)
# Page heading and a short definition of semi-structured data, followed by a
# button that reveals a link to the companion notebook. Emoji and the
# apostrophe in "doesn’t" are restored from their mis-decoded forms.
st.title("Semi-Structured Data 📊📜")

# The markdown body stays at column 0 on purpose: indenting it by four or
# more spaces would make Markdown render the HTML list as a code block.
st.markdown("""
Semi-structured data is a type of data that doesn’t follow strict rules like a table but still has some structure, like labels or tags, to organize it. Examples include:
<ul class="icon-bullet">
<li>CSV 📋</li>
<li>XML 🏷️</li>
<li>JSON 🌐</li>
<li>HTML 🖥️</li>
</ul>
""", unsafe_allow_html=True)

# The link is only rendered on the rerun triggered by the button click.
if st.button("🚀 Open Jupyter Notebook for entire semi-structured data"):
    notebook_url = "https://github.com/raj2216/my-code/blob/main/DATA%20HANDLING%20(1).ipynb"
    st.markdown(f"[Click here to go to the Jupyter notebook]({notebook_url})", unsafe_allow_html=True)
# Sidebar navigation: the selected label is stored in `file_type` and decides
# which of the format walkthroughs below is rendered.
st.sidebar.title("Navigation 🧭")
_FILE_TYPE_CHOICES = ("CSV", "XML", "JSON", "HTML")
file_type = st.sidebar.radio(
    label="Choose a file type to learn more:",
    options=_FILE_TYPE_CHOICES,
)
if file_type == "CSV":
    # CSV walkthrough: format overview, reading with pandas, the three common
    # problems (parse errors, encoding, memory) and writing back to disk.
    # NOTE: every triple-quoted body below is deliberately left at column 0;
    # indented Markdown would render as a code block, and the st.code samples
    # are displayed verbatim.
    st.title("CSV : (Comma-Separated Values) 📋")
    st.markdown("""A CSV file stores data in plain text format, where each line represents a row and values are separated by commas. 🗂️""", unsafe_allow_html=True)
    st.code("Name,Age,Location\nAlice,25,New York\nBob,30,Los Angeles", language="csv")
    code = '''
pd.read_csv(r"/Users/rajbunny/Downloads/phones_.csv")
'''
    st.code(code, language="python")

    st.subheader("IMPORTANT NOTE 💡")
    st.markdown("""
In CSV files, the separator can be:
- **Comma (`,`)**
- **Semicolon (`;`)**
- **Tab (`\\t`)**
- **Pipe (`|`)**
- **Space (` `)** 🛠️
""", unsafe_allow_html=True)

    st.header("Major Issues When Handling CSV Files ⚠️")
    st.markdown("""
<ul class="icon-bullet">
<li>Parse error 🛑</li>
<li>Encoding issue 🌐</li>
<li>Memory issue 🖥️</li>
</ul>
""", unsafe_allow_html=True)

    # --- Parse errors -----------------------------------------------------
    st.subheader("Parse Error 📉")
    st.markdown("""A parsing error occurs when the number of values in a row doesn't match the number of columns in the header. This can cause issues during data processing.""", unsafe_allow_html=True)
    st.code("Name,Age,Location\nAlice,25,New York\nBob,30,Los Angeles\nCharlie,22", language="csv")
    st.markdown("""
To avoid parse errors, you can use `on_bad_lines`:
- **`skip`**: Ignores the bad rows 🗑️.
- **`warn`**: Skips the rows but provides a warning ⚠️.
""")
    code = '''
data = pd.read_csv(r'/Users/rajbunny/Downloads/text.csv', on_bad_lines="skip")
data = pd.read_csv(r'/Users/rajbunny/Downloads/text.csv', on_bad_lines="warn")
'''
    st.code(code, language="python")

    # --- Encoding issues --------------------------------------------------
    st.subheader("Encoding Issue 🌐")
    st.markdown("""
Encoding issues occur when the characters in a CSV file don't match the expected format.
Common encodings include:
- **UTF-8** (default for most files)
- **ISO-8859-1 (Latin-1)**
- **Windows-1252**
""", unsafe_allow_html=True)
    # The accented names are the point of this sample, so they must be shown
    # correctly rather than as mis-decoded bytes.
    st.code("""Name,Age,Location\nAlice,30,New York\nBob,25,Los Angeles\nMário,28,São""", language='csv')
    st.markdown("""
To resolve encoding issues, specify the correct encoding while reading the file.
""", unsafe_allow_html=True)
    code = '''
pd.read_csv(r"/Users/rajbunny/Downloads/spam.csv", encoding="utf-8", on_bad_lines="skip")
'''
    st.code(code, language="python")

    # --- Memory issues ----------------------------------------------------
    st.subheader("Memory Issue 🖥️")
    st.markdown("""
For large CSV files causing memory issues, you can load them in chunks:
""", unsafe_allow_html=True)
    # The loop body is indented inside the sample so the displayed snippet is
    # valid Python that a reader can copy and run.
    code = '''
for chunk in pd.read_csv(r"/Users/rajbunny/Downloads/spam.csv", chunksize=1000):
    print(chunk.shape)
'''
    st.code(code, language="python")

    # --- Writing back -----------------------------------------------------
    st.subheader("Save Data Back to CSV 💾")
    code = '''
data.to_csv(r"output.csv")
'''
    st.code(code, language="python")
    st.subheader("📝 To view the coding part of the Jupyter notebook:")
elif file_type == "XML":
st.title("XML : (eXtensible Markup Language) 🏷️")
st.markdown("""An XML file stores data in a tree-like structure using custom tags to organize information. 🌳""", unsafe_allow_html=True)
st.code('''
<data>
<person>
<name>Alice</name>
<age>30</age>
<location>New York</location>
</person>
<person>
<name>Bob</name>
<age>25</age>
<location>Los Angeles</location>
</person>
</data>
''', language='xml')
code = '''
pd.read_xml(r"/Users/rajbunny/Downloads/sample1.xml")
'''
st.code(code, language="python")
st.subheader("XPath 🧭")
st.markdown("""
XPath is used to navigate and pick specific parts of an XML file, like selecting nodes or attributes.
""", unsafe_allow_html=True)
code = '''
data = pd.read_xml(r"/Users/rajbunny/Downloads/sample3.xml", xpath="person")
'''
st.code(code, language="python")
st.subheader("Save Back to XML πŸ’Ύ")
code = '''
data.to_xml(r"output.xml")
'''
st.code(code, language="python")
elif file_type == "JSON":
st.title("JSON : (JavaScript Object Notation) 🌐")
st.markdown("""JSON stores data as key-value pairs, making it easy to convert into a table or dictionary-like format.All the api data will be in the form of JSON and its of two types given below..
<ul class="icon-bullet">
<li>Structured</li>
<li>Un Structured</li>
</ul>""",unsafe_allow_html=True)
st.header("Structured JSON Format")
code='''
d1='{"name":["p1","p2"],"age":[21,22]}'
'''
st.code(code,language="python")
st.markdown("""it's in the form of dictonary given inside a string""",unsafe_allow_html=True)
st.header("How to read a Structured json file?")
code='''
data=pd.read_json(d1)
'''
st.code(code,language="python")
st.header("Data Frame to json")
st.subheader("Orient as index")
code='''
jason_with_index=data.to_json(orient="index")
output='{"0":{"name":"p1","age":21},"1":{"name":"p2","age":22}}'
'''
st.code(code,language="python")
st.markdown("""While converting when we give orient as index then keys will index and rest will be values..""",unsafe_allow_html=True)
st.subheader("To Convert Back to Data Frame")
code='''
pd.read_json(jason_with_index,orient="index")
'''
st.code(code,language="python")
st.subheader("Orient as column")
code='''
jason_with_index=data.to_json(orient="column")
output='{"name":{"0":"p1","1":"p2"},"age":{"0":21,"1":22}}'
'''
st.code(code,language="python")
st.markdown("""While converting when we give orient as column then keys will column and rest will be values..""",unsafe_allow_html=True)
st.subheader("To Convert Back to Data Frame")
code='''
pd.read_json(jason_with_index,orient="column")
'''
st.code(code,language="python")
st.subheader("Orient as values")
code='''
jason_with_index=data.to_json(orient="values")
output='[["p1",21],["p2",22]]'
'''
st.code(code,language="python")
st.markdown("""While converting when we give orient as values then we will be getting a nested list""",unsafe_allow_html=True)
st.subheader("To Convert Back to Data Frame")
code='''
pd.read_json(jason_with_index,orient="values")
'''
st.code(code,language="python")
st.subheader("Orient as split")
code='''
jason_with_index=data.to_json(orient="split")
output='{"columns":["name","age"],"index":[0,1],"data":[["p1",21],["p2",22]]}'
'''
st.code(code,language="python")
st.markdown("""While converting when we give orient as split then we will be getting index as seperate key,columns as seperate key,and finally data as seperate key...""",unsafe_allow_html=True)
st.subheader("To Convert Back to Data Frame")
code='''
pd.read_json(jason_with_index,orient="split")
'''
st.code(code,language="python")
st.subheader("First Un Structured JSON Format")
code='''
d2={"name":["p1","p2"],"marks":{"sem1":{"maths":[11,12],"hindi":[11,12]},"sem2":{"maths":[11,11],"hindi":[12,12]}}}
'''
st.code(code,"python")
st.markdown("""A json format can be said as unstructured json format when have dictonary inside a dictonary....""")
st.subheader("How to read Un Structured JSON Format?")
code='''
pd.json_normalize(d2,max_level=1)
pd.json_normalize(d2)
'''
st.code(code,"python")
st.markdown("""When we have a json file having dictonary inside a dictonary we havr use **json_normalize** if we jst pass the file as it is default i will check all the levels but when we pass max_level value it will check till that level only..""",unsafe_allow_html=True)
st.subheader("Second Un Structured JSON Format")
code='''
x=[{"name":"p1","age":21,"marks":[{"maths":11,"hindi":11}]},{"name":"p1","age":21,"marks":[{"maths":11,"hindi":11}]}]
'''
st.code(code,language="python")
st.markdown("""A json formal is also said as unstructured json format when we have dictonary inside a list.""",unsafe_allow_html=True)
st.subheader("How to read Un Structured JSON Format?")
code='''
pd.json_normalize(x,record_path="marks",meta=["name","age"])
'''
st.code(code,language="python")
st.markdown("""When we have a dictonary inside a list for that column we will be useing **record_path** and to say also to include remaing columns we use **meta** and pass those columns also..""")
elif file_type == "HTML":
st.title("HTML : (HyperText Markup Language) πŸ–₯️")
st.markdown("""HTML is a Semi Structured data by this html we can retrive only the tables present inside the particular table..""")
st.subheader("How to read and get the Tabular data from the url.?")
code='''
data=pd.read_html("https://en.wikipedia.org/wiki/Indian_Premier_League")
data=pd.read_html("https://en.wikipedia.org/wiki/Indian_Premier_League",match="Texas Super Kings")
'''
st.code(code,language="python")
st.subheader("Note")
st.markdown("""First one will give all the tables and Second one will give only the matched word tabels only..""")