Spaces:

DOMMETI
/

From_Zero_to_ML_Hero

Sleeping

App Files Files Community

From_Zero_to_ML_Hero / pages /6_semi_structured_data.py

DOMMETI

Update pages/6_semi_structured_data.py

3cef663 verified about 1 year ago

raw

history blame contribute delete

11.8 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import random
	import requests

	# Page-wide stylesheet: headings, paragraphs, "icon-bullet" lists, sidebar
	# and button rules. It is passed to st.markdown as raw HTML below.
	page_css = """
	<style>
	/* Set a soft background color */
	body {
	background-color: #eef2f7;
	}
	/* Style for main title */
	h1 {
	color: #00FFFF;
	font-family: 'Roboto', sans-serif;
	font-weight: 700;
	text-align: center;
	margin-bottom: 25px;
	}
	/* Style for headers */
	h2 {
	color: #FFFACD;
	font-family: 'Roboto', sans-serif;
	font-weight: 600;
	margin-top: 30px;
	}

	/* Style for subheaders */
	h3 {
	color: #ba95b0;
	font-family: 'Roboto', sans-serif;
	font-weight: 500;
	margin-top: 20px;
	}
	.custom-subheader {
	color: #00FFFF;
	font-family: 'Roboto', sans-serif;
	font-weight: 600;
	margin-bottom: 15px;
	}
	/* Paragraph styling */
	p {
	font-family: 'Georgia', serif;
	line-height: 1.8;
	color: #FFFFFF;
	margin-bottom: 20px;
	}
	/* List styling with checkmark bullets */
	.icon-bullet {
	list-style-type: none;
	padding-left: 20px;
	}
	.icon-bullet li {
	font-family: 'Georgia', serif;
	font-size: 1.1em;
	margin-bottom: 10px;
	color: #FFFFF0;
	}
	.icon-bullet li::before {
	content: "✔️";
	padding-right: 10px;
	color: #b3b3ff;
	}
	/* Sidebar styling */
	.sidebar .sidebar-content {
	background-color: #ffffff;
	border-radius: 10px;
	padding: 15px;
	}
	.sidebar h2 {
	color: #495057;
	}
	/* Custom button style */
	.streamlit-button {
	background-color: #00FFFF;
	color: #000000;
	font-weight: bold;
	}
	</style>
	"""
	st.markdown(page_css, unsafe_allow_html=True)

	# Page heading and a short definition of semi-structured data, with the
	# four covered formats rendered as an "icon-bullet" HTML list.
	st.title("Semi-Structured Data 📊📜")
	intro_html = """
	Semi-structured data is a type of data that doesn’t follow strict rules like a table but still has some structure, like labels or tags, to organize it. Examples include:
	<ul class="icon-bullet">
	<li>CSV 📋</li>
	<li>XML 🏷️</li>
	<li>JSON 🌐</li>
	<li>HTML 🖥️</li>
	</ul>
	"""
	st.markdown(intro_html, unsafe_allow_html=True)

	# The notebook link is rendered only on a run where the button reports a click.
	notebook_requested = st.button("🚀 Open Jupyter Notebook for entire semi-structured data")
	if notebook_requested:
	    notebook_url = "https://github.com/raj2216/my-code/blob/main/DATA%20HANDLING%20(1).ipynb"
	    st.markdown(
	        f"[Click here to go to the Jupyter notebook]({notebook_url})",
	        unsafe_allow_html=True,
	    )


	# Sidebar navigation: the selected label is stored in `file_type`, which
	# the if/elif chain below compares against to pick the section to render.
	st.sidebar.title("Navigation 🧭")
	format_choices = ("CSV", "XML", "JSON", "HTML")
	file_type = st.sidebar.radio("Choose a file type to learn more:", format_choices)

	if file_type == "CSV":
	    # CSV section: format overview, possible separators, the three common
	    # read problems (parse errors, encoding, memory) and writing back out.
	    st.title("CSV : (Comma-Separated Values) 📋")
	    st.markdown("""A CSV file stores data in plain text format, where each line represents a row and values are separated by commas. 🗂️""", unsafe_allow_html=True)
	    st.code("Name,Age,Location\nAlice,25,New York\nBob,30,Los Angeles", language="csv")
	    code = '''
	pd.read_csv(r"/Users/rajbunny/Downloads/phones_.csv")
	'''
	    st.code(code, language="python")
	    st.subheader("IMPORTANT NOTE 💡")
	    # Backslashes are doubled so the string holds a literal backslash; a bare
	    # "\|" is an invalid escape sequence and triggers a SyntaxWarning on
	    # recent Python versions. The rendered text is unchanged.
	    st.markdown("""
	In CSV files, the separator can be:
	- Comma (`,`)
	- Semicolon (`;`)
	- Tab (`\\t`)
	- Pipe (`\\|`)
	- Space (` `) 🛠️
	""", unsafe_allow_html=True)
	    st.header("Major Issues When Handling CSV Files ⚠️")
	    st.markdown("""
	<ul class="icon-bullet">
	<li>Parse error 🛑</li>
	<li>Encoding issue 🌐</li>
	<li>Memory issue 🖥️</li>
	</ul>
	""", unsafe_allow_html=True)

	    # --- Parse errors ---
	    st.subheader("Parse Error 📉")
	    st.markdown("""A parsing error occurs when the number of values in a row doesn't match the number of columns in the header. This can cause issues during data processing.""", unsafe_allow_html=True)
	    st.code("Name,Age,Location\nAlice,25,New York\nBob,30,Los Angeles\nCharlie,22", language="csv")
	    st.markdown("""
	To avoid parse errors, you can use `on_bad_lines`:
	- `skip`: Ignores the bad rows 🗑️.
	- `warn`: Skips the rows but provides a warning ⚠️.
	""")
	    code = '''
	data = pd.read_csv(r'/Users/rajbunny/Downloads/text.csv', on_bad_lines="skip")
	data = pd.read_csv(r'/Users/rajbunny/Downloads/text.csv', on_bad_lines="warn")
	'''
	    st.code(code, language="python")

	    # --- Encoding problems ---
	    st.subheader("Encoding Issue 🌐")
	    st.markdown("""
	Encoding issues occur when the characters in a CSV file don't match the expected format.
	Common encodings include:
	- UTF-8 (default for most files)
	- ISO-8859-1 (Latin-1)
	- Windows-1252
	""", unsafe_allow_html=True)
	    st.code("""Name,Age,Location\nAlice,30,New York\nBob,25,Los Angeles\nMário,28,São""", language='csv')
	    st.markdown("""
	To resolve encoding issues, specify the correct encoding while reading the file.
	""", unsafe_allow_html=True)
	    code = '''
	pd.read_csv(r"/Users/rajbunny/Downloads/spam.csv", encoding="utf-8", on_bad_lines="skip")
	'''
	    st.code(code, language="python")

	    # --- Memory problems ---
	    st.subheader("Memory Issue 🖥️")
	    st.markdown("""
	For large CSV files causing memory issues, you can load them in chunks:
	""", unsafe_allow_html=True)
	    # The loop body is indented so the displayed example is valid Python.
	    code = '''
	for chunk in pd.read_csv(r"/Users/rajbunny/Downloads/spam.csv", chunksize=1000):
	    print(chunk.shape)
	'''
	    st.code(code, language="python")

	    # --- Writing back ---
	    st.subheader("Save Data Back to CSV 💾")
	    code = '''
	data.to_csv(r"output.csv")
	'''
	    st.code(code, language="python")
	    st.subheader("📝 To view the coding part of the Jupyter notebook:")

	elif file_type == "XML":
	st.title("XML : (eXtensible Markup Language) 🏷️")
	st.markdown("""An XML file stores data in a tree-like structure using custom tags to organize information. 🌳""", unsafe_allow_html=True)
	st.code('''
	<data>
	<person>
	<name>Alice</name>
	<age>30</age>
	<location>New York</location>
	</person>
	<person>
	<name>Bob</name>
	<age>25</age>
	<location>Los Angeles</location>
	</person>
	</data>
	''', language='xml')
	code = '''
	pd.read_xml(r"/Users/rajbunny/Downloads/sample1.xml")
	'''
	st.code(code, language="python")
	st.subheader("XPath 🧭")
	st.markdown("""
	XPath is used to navigate and pick specific parts of an XML file, like selecting nodes or attributes.
	""", unsafe_allow_html=True)
	code = '''
	data = pd.read_xml(r"/Users/rajbunny/Downloads/sample3.xml", xpath="person")
	'''
	st.code(code, language="python")
	st.subheader("Save Back to XML 💾")
	code = '''
	data.to_xml(r"output.xml")
	'''
	st.code(code, language="python")

	elif file_type == "JSON":
	st.title("JSON : (JavaScript Object Notation) 🌐")
	st.markdown("""JSON stores data as key-value pairs, making it easy to convert into a table or dictionary-like format.All the api data will be in the form of JSON and its of two types given below..
	<ul class="icon-bullet">
	<li>Structured</li>
	<li>Un Structured</li>
	</ul>""",unsafe_allow_html=True)
	st.header("Structured JSON Format")
	code='''
	d1='{"name":["p1","p2"],"age":[21,22]}'
	'''
	st.code(code,language="python")
	st.markdown("""it's in the form of dictonary given inside a string""",unsafe_allow_html=True)
	st.header("How to read a Structured json file?")
	code='''
	data=pd.read_json(d1)
	'''
	st.code(code,language="python")
	st.header("Data Frame to json")
	st.subheader("Orient as index")
	code='''
	jason_with_index=data.to_json(orient="index")
	output='{"0":{"name":"p1","age":21},"1":{"name":"p2","age":22}}'
	'''
	st.code(code,language="python")
	st.markdown("""While converting when we give orient as index then keys will index and rest will be values..""",unsafe_allow_html=True)
	st.subheader("To Convert Back to Data Frame")
	code='''
	pd.read_json(jason_with_index,orient="index")
	'''
	st.code(code,language="python")
	st.subheader("Orient as column")
	code='''
	jason_with_index=data.to_json(orient="column")
	output='{"name":{"0":"p1","1":"p2"},"age":{"0":21,"1":22}}'
	'''
	st.code(code,language="python")
	st.markdown("""While converting when we give orient as column then keys will column and rest will be values..""",unsafe_allow_html=True)
	st.subheader("To Convert Back to Data Frame")
	code='''
	pd.read_json(jason_with_index,orient="column")
	'''
	st.code(code,language="python")
	st.subheader("Orient as values")
	code='''
	jason_with_index=data.to_json(orient="values")
	output='[["p1",21],["p2",22]]'
	'''
	st.code(code,language="python")
	st.markdown("""While converting when we give orient as values then we will be getting a nested list""",unsafe_allow_html=True)
	st.subheader("To Convert Back to Data Frame")
	code='''
	pd.read_json(jason_with_index,orient="values")
	'''
	st.code(code,language="python")
	st.subheader("Orient as split")
	code='''
	jason_with_index=data.to_json(orient="split")
	output='{"columns":["name","age"],"index":[0,1],"data":[["p1",21],["p2",22]]}'
	'''
	st.code(code,language="python")
	st.markdown("""While converting when we give orient as split then we will be getting index as seperate key,columns as seperate key,and finally data as seperate key...""",unsafe_allow_html=True)
	st.subheader("To Convert Back to Data Frame")
	code='''
	pd.read_json(jason_with_index,orient="split")
	'''
	st.code(code,language="python")
	st.subheader("First Un Structured JSON Format")
	code='''
	d2={"name":["p1","p2"],"marks":{"sem1":{"maths":[11,12],"hindi":[11,12]},"sem2":{"maths":[11,11],"hindi":[12,12]}}}
	'''
	st.code(code,"python")
	st.markdown("""A json format can be said as unstructured json format when have dictonary inside a dictonary....""")
	st.subheader("How to read Un Structured JSON Format?")
	code='''
	pd.json_normalize(d2,max_level=1)
	pd.json_normalize(d2)
	'''
	st.code(code,"python")
	st.markdown("""When we have a json file having dictonary inside a dictonary we havr use json_normalize if we jst pass the file as it is default i will check all the levels but when we pass max_level value it will check till that level only..""",unsafe_allow_html=True)
	st.subheader("Second Un Structured JSON Format")
	code='''
	x=[{"name":"p1","age":21,"marks":[{"maths":11,"hindi":11}]},{"name":"p1","age":21,"marks":[{"maths":11,"hindi":11}]}]
	'''
	st.code(code,language="python")
	st.markdown("""A json formal is also said as unstructured json format when we have dictonary inside a list.""",unsafe_allow_html=True)
	st.subheader("How to read Un Structured JSON Format?")
	code='''
	pd.json_normalize(x,record_path="marks",meta=["name","age"])
	'''
	st.code(code,language="python")
	st.markdown("""When we have a dictonary inside a list for that column we will be useing record_path and to say also to include remaing columns we use meta and pass those columns also..""")

	elif file_type == "HTML":
	st.title("HTML : (HyperText Markup Language) 🖥️")
	st.markdown("""HTML is a Semi Structured data by this html we can retrive only the tables present inside the particular table..""")
	st.subheader("How to read and get the Tabular data from the url.?")
	code='''
	data=pd.read_html("https://en.wikipedia.org/wiki/Indian_Premier_League")
	data=pd.read_html("https://en.wikipedia.org/wiki/Indian_Premier_League",match="Texas Super Kings")
	'''
	st.code(code,language="python")
	st.subheader("Note")
	st.markdown("""First one will give all the tables and Second one will give only the matched word tabels only..""")