Spaces:
Sleeping
Sleeping
Update pages/Data Collection.py
Browse files- pages/Data Collection.py +64 -213
pages/Data Collection.py
CHANGED
|
@@ -1,220 +1,71 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
-
import pandas as pd
|
| 3 |
-
|
| 4 |
-
# Function for the Excel details page
|
| 5 |
-
def excel_details_page():
|
| 6 |
-
st.title("Structured Data - Excel Details")
|
| 7 |
-
|
| 8 |
-
st.markdown("<h3 style='text-align:; color: #4a90e2;'>1. Handling Excel Files (.xlsx)</h3>", unsafe_allow_html=True)
|
| 9 |
-
st.markdown("""
|
| 10 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 11 |
-
<li>Excel files (.xlsx) are created using the Microsoft Excel application.</li>
|
| 12 |
-
<li>Structured data format.</li>
|
| 13 |
-
<li>Excel files automatically handle encoding during creation, so no encoding issues arise.</li>
|
| 14 |
-
<li>If there are extra values in a row, Excel creates a new column and fills it with <b>null values</b> instead of throwing a <b>parsing error</b>.</li>
|
| 15 |
-
</ul>
|
| 16 |
-
""", unsafe_allow_html=True)
|
| 17 |
-
|
| 18 |
-
st.markdown("<h3 style='text-align:; color: #ffa500;'>2. Reading Excel Files (.xlsx)</h3>", unsafe_allow_html=True)
|
| 19 |
-
st.markdown("""
|
| 20 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 21 |
-
<li>Use the <b>pandas</b> function, <b>pd.read_excel("path")</b>, to read an Excel file.</li>
|
| 22 |
-
<li>By default, it reads only the first sheet.</li>
|
| 23 |
-
<li>To read multiple sheets, specify the <b>sheet_name</b> parameter with a list of sheet indices.</li>
|
| 24 |
-
</ul>""", unsafe_allow_html=True)
|
| 25 |
-
|
| 26 |
-
st.code('df = pd.read_excel("path", sheet_name=[0, 1, 2])', language="python")
|
| 27 |
-
|
| 28 |
-
st.markdown("""
|
| 29 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 30 |
-
<li><b>The Result is a Dictionary</b></li>
|
| 31 |
-
<li>Keys: the sheet indices (or sheet names) passed in <b>sheet_name</b>.</li>
|
| 32 |
-
<li>Values: DataFrames corresponding to each sheet.</li>
|
| 33 |
-
</ul>""", unsafe_allow_html=True)
|
| 34 |
-
|
| 35 |
-
st.code('df_first_sheet = df[0] # First sheet\n'
|
| 36 |
-
'df_second_sheet = df[1] # Second sheet\n'
|
| 37 |
-
'df_third_sheet = df[2] # Third sheet', language="python")
|
| 38 |
-
|
| 39 |
-
st.markdown("<h3 style='text-align:; color: #dda0dd;'>3. Converting Data to Excel Files (.xlsx)</h3>", unsafe_allow_html=True)
|
| 40 |
-
st.markdown("""
|
| 41 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 42 |
-
<li>To save a single DataFrame to an Excel file</li>
|
| 43 |
-
</ul>""", unsafe_allow_html=True)
|
| 44 |
-
|
| 45 |
-
st.code('df[0].to_excel("path")', language="python")
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
<li>To save multiple sheets, use <b>pd.ExcelWriter</b></li>
|
| 50 |
-
</ul>""", unsafe_allow_html=True)
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
df[1].to_excel(writer, sheet_name="Sheet2")""", language="python")
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 59 |
|
| 60 |
-
#
|
| 61 |
-
|
| 62 |
-
import streamlit as st
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
# Create a button
|
| 67 |
-
# Display the content about semi-structured data
|
| 68 |
-
st.header("1. What is Semi-Structured Data?")
|
| 69 |
-
st.markdown("""
|
| 70 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 71 |
-
<li>Semi-structured data does not follow a strict tabular format but still has some organizational properties.</li>
|
| 72 |
-
<li>Examples include CSV files, JSON, and XML.</li>
|
| 73 |
-
</ul>
|
| 74 |
-
""", unsafe_allow_html=True)
|
| 75 |
-
|
| 76 |
-
st.header("2. Working with CSV Files")
|
| 77 |
-
st.subheader("a) Reading a CSV File")
|
| 78 |
-
st.markdown("""
|
| 79 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 80 |
-
<li>Use the <b>pandas</b> function, <code>pd.read_csv("file.csv")</code>, to read a CSV file.</li>
|
| 81 |
-
<li>This function loads the file into a DataFrame.</li>
|
| 82 |
-
</ul>
|
| 83 |
-
""", unsafe_allow_html=True)
|
| 84 |
-
|
| 85 |
-
# Code example for reading CSV
|
| 86 |
-
st.code("""
|
| 87 |
import pandas as pd
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
# Get all encodings
|
| 138 |
-
encodings_list = list(encodings.aliases.aliases.keys())
|
| 139 |
-
|
| 140 |
-
# Try reading the file with different encodings
|
| 141 |
-
for encoding in encodings_list:
|
| 142 |
-
try:
|
| 143 |
-
df = pd.read_csv("file.csv", encoding=encoding)
|
| 144 |
-
print(f"Success with encoding: {encoding}")
|
| 145 |
-
break
|
| 146 |
-
except:
|
| 147 |
-
pass # Skip to the next encoding
|
| 148 |
-
""", language="python")
|
| 149 |
-
|
| 150 |
-
st.subheader("Lookup Error:")
|
| 151 |
-
st.markdown("""
|
| 152 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 153 |
-
<li>Occurs if you try to access an encoding that is not available or supported.</li>
|
| 154 |
-
<li>Use a try-except block to handle it gracefully</li>
|
| 155 |
-
</ul>
|
| 156 |
-
""", unsafe_allow_html=True)
|
| 157 |
-
|
| 158 |
-
st.code('''
|
| 159 |
-
except LookupError:
|
| 160 |
-
print("Incorrect Encoding".format(y))
|
| 161 |
-
''')
|
| 162 |
-
|
| 163 |
-
st.markdown("""
|
| 164 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 165 |
-
<li>If a <code>ParserError</code> occurs after that, add the <code>on_bad_lines="skip"</code> parameter to skip the malformed lines.</li>
|
| 166 |
-
</ul>
|
| 167 |
-
""", unsafe_allow_html=True)
|
| 168 |
-
|
| 169 |
-
st.subheader("d) Handling Large CSV Files")
|
| 170 |
-
st.markdown("""
|
| 171 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 172 |
-
<li>When working with large CSV files, the file might not fit into memory, leading to a <code>MemoryError</code>.</li>
|
| 173 |
-
<li><code>Solution: Use chunksize to break the file into smaller chunks.</code></li>
|
| 174 |
-
<li>To handle each chunk, you can iterate through the chunks and process them as needed.</li>
|
| 175 |
-
</ul>
|
| 176 |
-
""", unsafe_allow_html=True)
|
| 177 |
-
|
| 178 |
-
# Code example for handling large files
|
| 179 |
-
st.code("""
|
| 180 |
-
|
| 181 |
-
chunk_size = 100
|
| 182 |
-
chunks = pd.read_csv("large_file.csv", chunksize=chunk_size)
|
| 183 |
-
|
| 184 |
-
for i, chunk in enumerate(chunks):
|
| 185 |
-
print(f"Processing chunk {i + 1} with {chunk.shape[0]} rows")
|
| 186 |
-
""", language="python")
|
| 187 |
-
|
| 188 |
-
st.header("3. Summary")
|
| 189 |
-
st.markdown("""
|
| 190 |
-
<ul style="font-family: Arial; line-height: 1.6;">
|
| 191 |
-
<li><b>Parse Errors:</b> Use <code>on_bad_lines</code> to handle them (<code>skip</code> or <code>warn</code>).</li>
|
| 192 |
-
<li><b>Encoding Issues:</b> Try different encodings to fix <b>UnicodeDecodeError</b>.</li>
|
| 193 |
-
<li><b>Large Files:</b> Use <code>chunksize</code> to process files in smaller parts.</li>
|
| 194 |
-
</ul>
|
| 195 |
-
""", unsafe_allow_html=True)
|
| 196 |
-
|
| 197 |
-
# Button to go back to the main page
|
| 198 |
-
if st.button("Back to Home"):
|
| 199 |
-
st.session_state['page'] = "home"
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
# Main page function
|
| 203 |
-
def main_page():
|
| 204 |
-
# Buttons for navigation
|
| 205 |
-
if st.button("Go to Structured Data - Excel"):
|
| 206 |
-
st.session_state['page'] = "excel_details"
|
| 207 |
-
if st.button("Go to Semi-Structured Data - CSV"):
|
| 208 |
-
st.session_state['page'] = "csv_details"
|
| 209 |
-
|
| 210 |
-
# Initialize session state
|
| 211 |
-
if 'page' not in st.session_state:
|
| 212 |
-
st.session_state['page'] = "home"
|
| 213 |
-
|
| 214 |
-
# Route to the appropriate page
|
| 215 |
-
if st.session_state['page'] == "home":
|
| 216 |
-
main_page()
|
| 217 |
-
elif st.session_state['page'] == "excel_details":
|
| 218 |
-
excel_details_page()
|
| 219 |
-
elif st.session_state['page'] == "csv_details":
|
| 220 |
-
csv_details_page()
|
|
|
|
| 1 |
import streamlit as st
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
# App title
|
| 4 |
+
st.title("Working with HTML Data using Python")
|
|
|
|
|
|
|
| 5 |
|
| 6 |
+
# HTML and DataFrames Section
|
| 7 |
+
st.header("HTML and DataFrames")
|
|
|
|
| 8 |
|
| 9 |
+
st.write("""
|
| 10 |
+
- **HTML (HyperText Markup Language)** is a semi-structured data format.
|
| 11 |
+
- HTML uses tags like `<table>`, `<tr>`, `<th>`, and `<td>` to structure tabular data.
|
| 12 |
+
- Unlike XML, HTML does not allow creating custom tags freely.
|
| 13 |
+
- Not all HTML content can be converted into dataframes, especially paragraph text or unstructured data.
|
| 14 |
+
- Typically, only table-related elements (`<table>`, `<tr>`, `<th>`, `<td>`) can be converted into dataframes.
|
| 15 |
+
""")
|
| 16 |
|
| 17 |
+
# Reading HTML Files Section
|
| 18 |
+
st.header("Reading HTML Files into DataFrames")
|
|
|
|
| 19 |
|
| 20 |
+
st.write("**Reading HTML Files:**")
|
| 21 |
+
st.code("""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
import pandas as pd
|
| 23 |
+
tables = pd.read_html(path_or_buffer)
|
| 24 |
+
""", language="python")
|
| 25 |
+
|
| 26 |
+
st.write("""
|
| 27 |
+
- **`pd.read_html(path_or_buffer)`** reads HTML files or websites containing tables.
|
| 28 |
+
- Extracts all tables and returns them as a list of dataframes.
|
| 29 |
+
""")
|
| 30 |
+
|
| 31 |
+
st.write("**Accessing Specific Tables:**")
|
| 32 |
+
st.code("""
|
| 33 |
+
# Accessing the first table from the list
|
| 34 |
+
table = tables[0]
|
| 35 |
+
""", language="python")
|
| 36 |
+
|
| 37 |
+
st.write("""
|
| 38 |
+
- Each table is stored in the list by index.
|
| 39 |
+
- Use indexing to select the table you want to work with.
|
| 40 |
+
""")
|
| 41 |
+
|
| 42 |
+
st.write("**Limitations:**")
|
| 43 |
+
st.write("""
|
| 44 |
+
- Not all websites or HTML files can be read, even if they have tables.
|
| 45 |
+
- Issues like authorization restrictions can prevent reading certain tables.
|
| 46 |
+
""")
|
| 47 |
+
|
| 48 |
+
st.write("**Using the `match` Parameter:**")
|
| 49 |
+
st.code("""
|
| 50 |
+
# Reading a specific table using the match parameter
|
| 51 |
+
tables = pd.read_html(path, match="keyword")
|
| 52 |
+
""", language="python")
|
| 53 |
+
|
| 54 |
+
st.write("""
|
| 55 |
+
- To locate specific tables, use `match="keyword"` while reading HTML.
|
| 56 |
+
- The `match` parameter searches for tables containing the specified keyword.
|
| 57 |
+
""")
|
| 58 |
+
|
| 59 |
+
# Exporting DataFrames Section
|
| 60 |
+
st.header("Exporting DataFrames to HTML")
|
| 61 |
+
|
| 62 |
+
st.write("**Exporting DataFrame to HTML:**")
|
| 63 |
+
st.code("""
|
| 64 |
+
# Exporting a dataframe to an HTML file
|
| 65 |
+
df.to_html("output.html")
|
| 66 |
+
""", language="python")
|
| 67 |
+
|
| 68 |
+
st.write("""
|
| 69 |
+
- Converts a dataframe into an HTML file.
|
| 70 |
+
- Saves the dataframe in an HTML-compatible table format at the specified path.
|
| 71 |
+
""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|