"""Streamlit tutorial page on semi-structured data (CSV, XML, JSON, HTML).

The page shows an introduction, a sidebar radio to pick a file format, and
for the chosen format a series of explanations with pandas code samples.
The samples are only displayed with ``st.code``; none of them is executed.
"""

import random
import textwrap

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import streamlit as st

# Single source for the companion notebook link used in several places.
NOTEBOOK_URL = "https://github.com/raj2216/my-code/blob/main/DATA%20HANDLING%20(1).ipynb"

# (orient, JSON text produced by ``to_json``, explanatory note) for the
# "Data Frame to json" walkthrough.  The second entry uses ``"columns"``:
# pandas does not accept the singular ``"column"`` as an orient value.
_ORIENT_EXAMPLES = (
    (
        "index",
        '{"0":{"name":"p1","age":21},"1":{"name":"p2","age":22}}',
        "While converting when we give orient as index then keys will index "
        "and rest will be values..",
    ),
    (
        "columns",
        '{"name":{"0":"p1","1":"p2"},"age":{"0":21,"1":22}}',
        "While converting when we give orient as columns then keys will column "
        "and rest will be values..",
    ),
    (
        "values",
        '[["p1",21],["p2",22]]',
        "While converting when we give orient as values then we will be getting "
        "a nested list",
    ),
    (
        "split",
        '{"columns":["name","age"],"index":[0,1],"data":[["p1",21],["p2",22]]}',
        "While converting when we give orient as split then \n"
        "we will be getting index as seperate key,columns as seperate key,"
        "and finally data as seperate key...",
    ),
)


def _clean(text: str) -> str:
    """Return *text* without source-code indentation or surrounding blank lines.

    Markdown treats lines indented by four or more spaces as a code block, so
    triple-quoted literals written inside functions must be dedented first.
    """
    return textwrap.dedent(text).strip()


def _markdown(text: str) -> None:
    """Render *text* as Markdown.

    All page content is plain Markdown, so raw-HTML rendering stays disabled.
    """
    st.markdown(_clean(text))


def _show_code(source: str, language: str = "python") -> None:
    """Display *source* as a syntax-highlighted, non-executed code sample."""
    st.code(_clean(source), language=language)


def _notebook_link() -> None:
    """Render the Markdown link to the companion Jupyter notebook."""
    st.markdown(f"[Click here to go to the Jupyter notebook]({NOTEBOOK_URL})")


def _render_intro() -> None:
    """Render the page title, the overview text and the notebook button."""
    # NOTE(review): this call renders only whitespace; presumably a placeholder
    # for custom HTML/CSS -- confirm before removing.
    st.markdown(""" """, unsafe_allow_html=True)

    st.title("Semi-Structured Data 📊📜")
    _markdown(
        """
        Semi-structured data is a type of data that doesn’t follow strict rules like a table but still has some structure, like labels or tags, to organize it. Examples include:

        - CSV 📋
        - XML 🏷️
        - JSON 🌐
        - HTML 🖥️
        """
    )

    if st.button("🚀 Open Jupyter Notebook for entire semi-structured data"):
        _notebook_link()


def _render_csv() -> None:
    """Render the CSV section: format, separators, common issues and saving."""
    st.title("CSV : (Comma-Separated Values) 📋")
    _markdown(
        """A CSV file stores data in plain text format, where each line represents a row and values are separated by commas. 🗂️"""
    )
    st.code(
        "Name,Age,Location\nAlice,25,New York\nBob,30,Los Angeles",
        language="csv",
    )
    _show_code(
        '''
        pd.read_csv(r"/Users/rajbunny/Downloads/phones_.csv")
        '''
    )

    st.subheader("IMPORTANT NOTE 💡")
    _markdown(
        """
        In CSV files, the separator can be:

        - **Comma (`,`)**
        - **Semicolon (`;`)**
        - **Tab (`\\t`)**
        - **Pipe (`|`)**
        - **Space (` `)** 🛠️
        """
    )

    st.header("Major Issues When Handling CSV Files ⚠️")
    _markdown(
        """
        - Parse error 🛑
        - Encoding issue 🌐
        - Memory issue 🖥️
        """
    )

    st.subheader("Parse Error 📉")
    _markdown(
        """A parsing error occurs when the number of values in a row doesn't match the number of columns in the header. This can cause issues during data processing."""
    )
    st.code(
        "Name,Age,Location\nAlice,25,New York\nBob,30,Los Angeles\nCharlie,22",
        language="csv",
    )
    _markdown(
        """
        To avoid parse errors, you can use `on_bad_lines`:

        - **`skip`**: Ignores the bad rows 🗑️.
        - **`warn`**: Skips the rows but provides a warning ⚠️.
        """
    )
    _show_code(
        '''
        data = pd.read_csv(r'/Users/rajbunny/Downloads/text.csv', on_bad_lines="skip")
        data = pd.read_csv(r'/Users/rajbunny/Downloads/text.csv', on_bad_lines="warn")
        '''
    )

    st.subheader("Encoding Issue 🌐")
    _markdown(
        """
        Encoding issues occur when the characters in a CSV file don't match the expected format. Common encodings include:

        - **UTF-8** (default for most files)
        - **ISO-8859-1 (Latin-1)**
        - **Windows-1252**
        """
    )
    st.code(
        """Name,Age,Location\nAlice,30,New York\nBob,25,Los Angeles\nMário,28,São""",
        language="csv",
    )
    _markdown(
        """
        To resolve encoding issues, specify the correct encoding while reading the file.
        """
    )
    _show_code(
        '''
        pd.read_csv(r"/Users/rajbunny/Downloads/spam.csv", encoding="utf-8", on_bad_lines="skip")
        '''
    )

    st.subheader("Memory Issue 🖥️")
    _markdown(
        """
        For large CSV files causing memory issues, you can load them in chunks:
        """
    )
    _show_code(
        '''
        for chunk in pd.read_csv(r"/Users/rajbunny/Downloads/spam.csv", chunksize=1000):
            print(chunk.shape)
        '''
    )

    st.subheader("Save Data Back to CSV 💾")
    _show_code(
        '''
        data.to_csv(r"output.csv")
        '''
    )

    st.subheader("📝 To view the coding part of the Jupyter notebook:")
    # The heading announces a link, so show it instead of leaving it dangling.
    _notebook_link()


def _render_xml() -> None:
    """Render the XML section: format, reading, XPath selection and saving."""
    st.title("XML : (eXtensible Markup Language) 🏷️")
    _markdown(
        """An XML file stores data in a tree-like structure using custom tags to organize information. 🌳"""
    )
    # The sample must contain real markup; <person> records match the
    # xpath="person" example shown further down.
    _show_code(
        '''
        <people>
            <person>
                <name>Alice</name>
                <age>30</age>
                <city>New York</city>
            </person>
            <person>
                <name>Bob</name>
                <age>25</age>
                <city>Los Angeles</city>
            </person>
        </people>
        ''',
        language="xml",
    )
    _show_code(
        '''
        pd.read_xml(r"/Users/rajbunny/Downloads/sample1.xml")
        '''
    )

    st.subheader("XPath 🧭")
    _markdown(
        """
        XPath is used to navigate and pick specific parts of an XML file, like selecting nodes or attributes.
        """
    )
    _show_code(
        '''
        data = pd.read_xml(r"/Users/rajbunny/Downloads/sample3.xml", xpath="person")
        '''
    )

    st.subheader("Save Back to XML 💾")
    _show_code(
        '''
        data.to_xml(r"output.xml")
        '''
    )


def _render_orient_examples() -> None:
    """Render one to_json / read_json round-trip sample per orient value."""
    for orient, output, note in _ORIENT_EXAMPLES:
        st.subheader(f"Orient as {orient}")
        st.code(
            f'jason_with_index=data.to_json(orient="{orient}")\n'
            f"output='{output}'",
            language="python",
        )
        st.markdown(note)

        st.subheader("To Convert Back to Data Frame")
        st.code(
            f'pd.read_json(jason_with_index,orient="{orient}")',
            language="python",
        )


def _render_json() -> None:
    """Render the JSON section: structured and nested ("unstructured") JSON."""
    st.title("JSON : (JavaScript Object Notation) 🌐")
    _markdown(
        """
        JSON stores data as key-value pairs, making it easy to convert into a table or dictionary-like format.All the api data will be in the form of JSON and its of two types given below..

        - Structured
        - Un Structured
        """
    )

    st.header("Structured JSON Format")
    _show_code(
        '''
        d1='{"name":["p1","p2"],"age":[21,22]}'
        '''
    )
    _markdown("""it's in the form of dictonary given inside a string""")

    st.header("How to read a Structured json file?")
    _show_code(
        '''
        data=pd.read_json(d1)
        '''
    )

    st.header("Data Frame to json")
    _render_orient_examples()

    st.subheader("First Un Structured JSON Format")
    _show_code(
        '''
        d2={"name":["p1","p2"],"marks":{"sem1":{"maths":[11,12],"hindi":[11,12]},"sem2":{"maths":[11,11],"hindi":[12,12]}}}
        '''
    )
    _markdown(
        """A json format can be said as unstructured json format when have dictonary inside a dictonary...."""
    )

    st.subheader("How to read Un Structured JSON Format?")
    _show_code(
        '''
        pd.json_normalize(d2,max_level=1)
        pd.json_normalize(d2)
        '''
    )
    _markdown(
        """When we have a json file having dictonary inside a dictonary we havr use **json_normalize** if we jst pass the file as it is default i will check all the levels but when we pass max_level value it will check till that level only.."""
    )

    st.subheader("Second Un Structured JSON Format")
    _show_code(
        '''
        x=[{"name":"p1","age":21,"marks":[{"maths":11,"hindi":11}]},{"name":"p1","age":21,"marks":[{"maths":11,"hindi":11}]}]
        '''
    )
    _markdown(
        """A json formal is also said as unstructured json format when we have dictonary inside a list."""
    )

    st.subheader("How to read Un Structured JSON Format?")
    _show_code(
        '''
        pd.json_normalize(x,record_path="marks",meta=["name","age"])
        '''
    )
    _markdown(
        """When we have a dictonary inside a list for that column we will be useing **record_path** and to say also to include remaing columns we use **meta** and pass those columns also.."""
    )


def _render_html() -> None:
    """Render the HTML section: extracting tables from a web page."""
    st.title("HTML : (HyperText Markup Language) 🖥️")
    _markdown(
        """HTML is a Semi Structured data by this html we can retrive only the tables present inside the particular table.."""
    )

    st.subheader("How to read and get the Tabular data from the url.?")
    _show_code(
        '''
        data=pd.read_html("https://en.wikipedia.org/wiki/Indian_Premier_League")
        data=pd.read_html("https://en.wikipedia.org/wiki/Indian_Premier_League",match="Texas Super Kings")
        '''
    )

    st.subheader("Note")
    _markdown(
        """First one will give all the tables and Second one will give only the matched word tabels only.."""
    )


# Sidebar label -> section renderer.  Insertion order is the order in which
# the options appear in the radio widget.
_RENDERERS = {
    "CSV": _render_csv,
    "XML": _render_xml,
    "JSON": _render_json,
    "HTML": _render_html,
}


def main() -> None:
    """Render the introduction, the sidebar navigation and the chosen section."""
    _render_intro()

    st.sidebar.title("Navigation 🧭")
    file_type = st.sidebar.radio(
        "Choose a file type to learn more:",
        tuple(_RENDERERS),
    )
    _RENDERERS[file_type]()


# Called unconditionally so the page renders whenever the script is executed,
# exactly as the original flat script did.
main()