import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import requests
# ---- Page header, notebook link and sidebar navigation ----
# NOTE(review): several emoji / punctuation characters in the strings below
# look mis-encoded (e.g. "doesnβt"); confirm the file is stored as UTF-8.

# Markdown element whose body is just a newline (renders nothing visible).
st.markdown("\n", unsafe_allow_html=True)

st.title("Semi-Structured Data ππ")

intro_markdown = """
Semi-structured data is a type of data that doesnβt follow strict rules like a table but still has some structure, like labels or tags, to organize it. Examples include:
- CSV π
- XML π·οΈ
- JSON π
- HTML π₯οΈ
"""
st.markdown(intro_markdown, unsafe_allow_html=True)

# The link is only rendered on the rerun triggered by pressing the button.
if st.button("π Open Jupyter Notebook for entire semi-structured data"):
    notebook_url = "https://github.com/raj2216/my-code/blob/main/DATA%20HANDLING%20(1).ipynb"
    st.markdown(
        "[Click here to go to the Jupyter notebook]({})".format(notebook_url),
        unsafe_allow_html=True,
    )

# Sidebar selector: `file_type` decides which tutorial section is drawn below.
st.sidebar.title("Navigation π§")
file_type = st.sidebar.radio(
    label="Choose a file type to learn more:",
    options=("CSV", "XML", "JSON", "HTML"),
)
if file_type == "CSV":
    # ---- CSV tutorial page ----
    # Every `code` string below is only *displayed* with st.code; none of the
    # pandas calls inside the snippets are executed by this app.
    # NOTE(review): some emoji in these strings look mis-encoded; confirm the
    # file is stored as UTF-8.
    st.title("CSV : (Comma-Separated Values) π")
    st.markdown("""A CSV file stores data in plain text format, where each line represents a row and values are separated by commas. ποΈ""", unsafe_allow_html=True)
    st.code("Name,Age,Location\nAlice,25,New York\nBob,30,Los Angeles", language="csv")
    code = '''
pd.read_csv(r"/Users/rajbunny/Downloads/phones_.csv")
'''
    st.code(code, language="python")

    # Separator cheat sheet.
    st.subheader("IMPORTANT NOTE π‘")
    st.markdown("""
In CSV files, the separator can be:
- **Comma (`,`)**
- **Semicolon (`;`)**
- **Tab (`\\t`)**
- **Pipe (`|`)**
- **Space (` `)** π οΈ
""", unsafe_allow_html=True)

    # Overview of the three problem areas covered below.
    st.header("Major Issues When Handling CSV Files β οΈ")
    st.markdown("""
- Parse error π
- Encoding issue π
- Memory issue π₯οΈ
""", unsafe_allow_html=True)

    # --- Parse errors: a row with fewer/more fields than the header ---
    st.subheader("Parse Error π")
    st.markdown("""A parsing error occurs when the number of values in a row doesn't match the number of columns in the header. This can cause issues during data processing.""", unsafe_allow_html=True)
    st.code("Name,Age,Location\nAlice,25,New York\nBob,30,Los Angeles\nCharlie,22", language="csv")
    st.markdown("""
To avoid parse errors, you can use `on_bad_lines`:
- **`skip`**: Ignores the bad rows ποΈ.
- **`warn`**: Skips the rows but provides a warning β οΈ.
""")
    code = '''
data = pd.read_csv(r'/Users/rajbunny/Downloads/text.csv', on_bad_lines="skip")
data = pd.read_csv(r'/Users/rajbunny/Downloads/text.csv', on_bad_lines="warn")
'''
    st.code(code, language="python")

    # --- Encoding issues ---
    st.subheader("Encoding Issue π")
    st.markdown("""
Encoding issues occur when the characters in a CSV file don't match the expected format.
Common encodings include:
- **UTF-8** (default for most files)
- **ISO-8859-1 (Latin-1)**
- **Windows-1252**
""", unsafe_allow_html=True)
    st.code("""Name,Age,Location\nAlice,30,New York\nBob,25,Los Angeles\nMΓ‘rio,28,SΓ£o""", language='csv')
    st.markdown("""
To resolve encoding issues, specify the correct encoding while reading the file.
""", unsafe_allow_html=True)
    code = '''
pd.read_csv(r"/Users/rajbunny/Downloads/spam.csv", encoding="utf-8", on_bad_lines="skip")
'''
    st.code(code, language="python")

    # --- Memory issues: chunked reading ---
    st.subheader("Memory Issue π₯οΈ")
    st.markdown("""
For large CSV files causing memory issues, you can load them in chunks:
""", unsafe_allow_html=True)
    # Fix: the loop body is indented so the displayed snippet is valid Python
    # (previously `print(chunk.shape)` was shown at the same level as `for`).
    code = '''
for chunk in pd.read_csv(r"/Users/rajbunny/Downloads/spam.csv", chunksize=1000):
    print(chunk.shape)
'''
    st.code(code, language="python")

    # --- Writing a DataFrame back out ---
    st.subheader("Save Data Back to CSV πΎ")
    code = '''
data.to_csv(r"output.csv")
'''
    st.code(code, language="python")
    # NOTE(review): this heading has no content after it in this branch;
    # presumably a notebook link was meant to follow - confirm.
    st.subheader("π To view the coding part of the Jupyter notebook:")
# Written as an independent `if`: `file_type` holds exactly one radio value,
# so the section branches remain mutually exclusive.
if file_type == "XML":
    # ---- XML tutorial page ----
    # The snippets are only displayed with st.code; nothing here parses XML.
    st.title("XML : (eXtensible Markup Language) π·οΈ")
    st.markdown("""An XML file stores data in a tree-like structure using custom tags to organize information. π³""", unsafe_allow_html=True)
    # Fix: the sample previously showed only bare values (no tags), which is
    # not XML. It is now a small well-formed document whose repeated <person>
    # element matches the xpath="person" example further down.
    # NOTE(review): tag names are illustrative - adjust to the real sample file.
    st.code('''
<people>
    <person>
        <name>Alice</name>
        <age>30</age>
        <city>New York</city>
    </person>
    <person>
        <name>Bob</name>
        <age>25</age>
        <city>Los Angeles</city>
    </person>
</people>
''', language='xml')
    code = '''
pd.read_xml(r"/Users/rajbunny/Downloads/sample1.xml")
'''
    st.code(code, language="python")

    # --- Selecting nodes with XPath ---
    st.subheader("XPath π§")
    st.markdown("""
XPath is used to navigate and pick specific parts of an XML file, like selecting nodes or attributes.
""", unsafe_allow_html=True)
    code = '''
data = pd.read_xml(r"/Users/rajbunny/Downloads/sample3.xml", xpath="person")
'''
    st.code(code, language="python")

    # --- Writing a DataFrame back out ---
    st.subheader("Save Back to XML πΎ")
    code = '''
data.to_xml(r"output.xml")
'''
    st.code(code, language="python")
# Written as an independent `if`: `file_type` holds exactly one radio value,
# so the section branches remain mutually exclusive.
if file_type == "JSON":
    # ---- JSON tutorial page ----
    # Every `code` string is only displayed with st.code; none of the pandas
    # calls in the snippets are executed by this app.
    st.title("JSON : (JavaScript Object Notation) π")
    st.markdown("""JSON stores data as key-value pairs, making it easy to convert into a table or dictionary-like format.All the api data will be in the form of JSON and its of two types given below..
""", unsafe_allow_html=True)

    # --- Structured JSON: a flat dict serialized as a string ---
    st.header("Structured JSON Format")
    code = '''
d1='{"name":["p1","p2"],"age":[21,22]}'
'''
    st.code(code, language="python")
    st.markdown("""it's in the form of dictonary given inside a string""", unsafe_allow_html=True)
    st.header("How to read a Structured json file?")
    # NOTE(review): recent pandas releases deprecate passing a literal JSON
    # string to read_json (wrap it in io.StringIO) - consider updating the
    # snippets that call pd.read_json on a string.
    code = '''
data=pd.read_json(d1)
'''
    st.code(code, language="python")

    # --- DataFrame -> JSON with the different `orient` layouts ---
    st.header("Data Frame to json")
    st.subheader("Orient as index")
    code = '''
jason_with_index=data.to_json(orient="index")
output='{"0":{"name":"p1","age":21},"1":{"name":"p2","age":22}}'
'''
    st.code(code, language="python")
    st.markdown("""While converting when we give orient as index then keys will index and rest will be values..""", unsafe_allow_html=True)
    st.subheader("To Convert Back to Data Frame")
    code = '''
pd.read_json(jason_with_index,orient="index")
'''
    st.code(code, language="python")

    st.subheader("Orient as column")
    # Fix: pandas only accepts orient="columns" (plural); "column" raises a
    # ValueError, so the displayed snippets now use the valid spelling.
    code = '''
jason_with_index=data.to_json(orient="columns")
output='{"name":{"0":"p1","1":"p2"},"age":{"0":21,"1":22}}'
'''
    st.code(code, language="python")
    st.markdown("""While converting when we give orient as column then keys will column and rest will be values..""", unsafe_allow_html=True)
    st.subheader("To Convert Back to Data Frame")
    code = '''
pd.read_json(jason_with_index,orient="columns")
'''
    st.code(code, language="python")

    st.subheader("Orient as values")
    code = '''
jason_with_index=data.to_json(orient="values")
output='[["p1",21],["p2",22]]'
'''
    st.code(code, language="python")
    st.markdown("""While converting when we give orient as values then we will be getting a nested list""", unsafe_allow_html=True)
    st.subheader("To Convert Back to Data Frame")
    code = '''
pd.read_json(jason_with_index,orient="values")
'''
    st.code(code, language="python")

    st.subheader("Orient as split")
    code = '''
jason_with_index=data.to_json(orient="split")
output='{"columns":["name","age"],"index":[0,1],"data":[["p1",21],["p2",22]]}'
'''
    st.code(code, language="python")
    st.markdown("""While converting when we give orient as split then we will be getting index as seperate key,columns as seperate key,and finally data as seperate key...""", unsafe_allow_html=True)
    st.subheader("To Convert Back to Data Frame")
    code = '''
pd.read_json(jason_with_index,orient="split")
'''
    st.code(code, language="python")

    # --- Nested JSON, case 1: dict inside a dict ---
    st.subheader("First Un Structured JSON Format")
    code = '''
d2={"name":["p1","p2"],"marks":{"sem1":{"maths":[11,12],"hindi":[11,12]},"sem2":{"maths":[11,11],"hindi":[12,12]}}}
'''
    st.code(code, language="python")
    st.markdown("""A json format can be said as unstructured json format when have dictonary inside a dictonary....""")
    st.subheader("How to read Un Structured JSON Format?")
    code = '''
pd.json_normalize(d2,max_level=1)
pd.json_normalize(d2)
'''
    st.code(code, language="python")
    st.markdown("""When we have a json file having dictonary inside a dictonary we havr use **json_normalize** if we jst pass the file as it is default i will check all the levels but when we pass max_level value it will check till that level only..""", unsafe_allow_html=True)

    # --- Nested JSON, case 2: list of dicts inside each record ---
    st.subheader("Second Un Structured JSON Format")
    code = '''
x=[{"name":"p1","age":21,"marks":[{"maths":11,"hindi":11}]},{"name":"p1","age":21,"marks":[{"maths":11,"hindi":11}]}]
'''
    st.code(code, language="python")
    st.markdown("""A json formal is also said as unstructured json format when we have dictonary inside a list.""", unsafe_allow_html=True)
    st.subheader("How to read Un Structured JSON Format?")
    code = '''
pd.json_normalize(x,record_path="marks",meta=["name","age"])
'''
    st.code(code, language="python")
    st.markdown("""When we have a dictonary inside a list for that column we will be useing **record_path** and to say also to include remaing columns we use **meta** and pass those columns also..""")
# Written as an independent `if`: `file_type` holds exactly one radio value,
# so the section branches remain mutually exclusive.
if file_type == "HTML":
    # ---- HTML tutorial page ----
    # The read_html calls are shown as text only; this app never fetches the URL.
    st.title("HTML : (HyperText Markup Language) π₯οΈ")
    st.markdown(
        """HTML is a Semi Structured data by this html we can retrive only the tables present inside the particular table.."""
    )

    st.subheader("How to read and get the Tabular data from the url.?")
    st.code(
        '''
data=pd.read_html("https://en.wikipedia.org/wiki/Indian_Premier_League")
data=pd.read_html("https://en.wikipedia.org/wiki/Indian_Premier_League",match="Texas Super Kings")
''',
        language="python",
    )

    # Explains the difference between the two calls displayed above.
    st.subheader("Note")
    st.markdown(
        """First one will give all the tables and Second one will give only the matched word tabels only.."""
    )