hari3485 commited on
Commit
2ecffec
·
verified ·
1 Parent(s): 140a95d

Update pages/Data Collection.py

Browse files
Files changed (1) hide show
  1. pages/Data Collection.py +64 -213
pages/Data Collection.py CHANGED
@@ -1,220 +1,71 @@
1
  import streamlit as st
2
- import pandas as pd
3
-
4
- # Function for the Excel details page
5
- def excel_details_page():
6
- st.title("Structured Data - Excel Details")
7
-
8
- st.markdown("<h3 style='color: #4a90e2;'>1. Handling Excel Files (.xlsx)</h3>", unsafe_allow_html=True)
9
- st.markdown("""
10
- <ul style="font-family: Arial; line-height: 1.6;">
11
- <li>Excel Files are (XLSX) Created using the Microsoft Excel application.</li>
12
- <li>Structured data format.</li>
13
- <li>Excel files automatically handle encoding during creation, so no encoding issues arise.</li>
14
- <li>If there are extra values in a row, Excel creates a new column and fills it with <b>null values</b> instead of throwing a <b>parsing error</b>.</li>
15
- </ul>
16
- """, unsafe_allow_html=True)
17
-
18
- st.markdown("<h3 style='color: #ffa500;'>2. Reading Excel Files (.xlsx)</h3>", unsafe_allow_html=True)
19
- st.markdown("""
20
- <ul style="font-family: Arial; line-height: 1.6;">
21
- <li>Use the <b>pandas</b> function, <b>pd.read_excel("path")</b>, to read an Excel file.</li>
22
- <li>By default, it reads only one sheet.</li>
23
- <li>To read multiple sheets, specify the <b>sheet_name</b> parameter with a list of sheet indices.</li>
24
- </ul>""", unsafe_allow_html=True)
25
-
26
- st.code('df = pd.read_excel("path", sheet_name=[0, 1, 2])', language="python")
27
-
28
- st.markdown("""
29
- <ul style="font-family: Arial; line-height: 1.6;">
30
- <li><b>The Result is a Dictionary</b></li>
31
- <li>Keys: Sheet names.</li>
32
- <li>Values: DataFrames corresponding to each sheet.</li>
33
- </ul>""", unsafe_allow_html=True)
34
-
35
- st.code('df_first_sheet = df[0] # First sheet\n'
36
- 'df_second_sheet = df[1] # Second sheet\n'
37
- 'df_third_sheet = df[2] # Third sheet', language="python")
38
-
39
- st.markdown("<h3 style='color: #dda0dd;'>3. Converting Data to Excel Files (.xlsx)</h3>", unsafe_allow_html=True)
40
- st.markdown("""
41
- <ul style="font-family: Arial; line-height: 1.6;">
42
- <li>To save a single DataFrame to an Excel file</li>
43
- </ul>""", unsafe_allow_html=True)
44
-
45
- st.code('df[0].to_excel("path")', language="python")
46
 
47
- st.markdown("""
48
- <ul style="font-family: Arial; line-height: 1.6;">
49
- <li>To save multiple sheets, use <b>pd.ExcelWriter</b></li>
50
- </ul>""", unsafe_allow_html=True)
51
 
52
- st.code("""with pd.ExcelWriter("path") as writer:
53
- df[0].to_excel(writer, sheet_name="Sheet1")
54
- df[1].to_excel(writer, sheet_name="Sheet2")""", language="python")
55
 
56
- # Button to go back to the main page
57
- if st.button("Back to Home"):
58
- st.session_state['page'] = "home"
 
 
 
 
59
 
60
- # Function for the CSV details page
61
- def csv_details_page():
62
- import streamlit as st
63
 
64
- # App header
65
-
66
- # Create a button
67
- # Display the content about semi-structured data
68
- st.header("1. What is Semi-Structured Data?")
69
- st.markdown("""
70
- <ul style="font-family: Arial; line-height: 1.6;">
71
- <li>Semi-structured data does not follow a strict tabular format but still has some organizational properties.</li>
72
- <li>Examples include CSV files, JSON, and XML.</li>
73
- </ul>
74
- """, unsafe_allow_html=True)
75
-
76
- st.header("2. Working with CSV Files")
77
- st.subheader("a) Reading a CSV File")
78
- st.markdown("""
79
- <ul style="font-family: Arial; line-height: 1.6;">
80
- <li>Use the <b>pandas</b> function, <code>pd.read_csv("file.csv")</code>, to read a CSV file.</li>
81
- <li>This function loads the file into a DataFrame.</li>
82
- </ul>
83
- """, unsafe_allow_html=True)
84
-
85
- # Code example for reading CSV
86
- st.code("""
87
  import pandas as pd
88
- df = pd.read_csv("file.csv")
89
- print(df.head())
90
- """, language="python")
91
-
92
- st.subheader("b) Handling Parse Errors")
93
- st.markdown("""
94
- <ul style="font-family: Arial; line-height: 1.6;">
95
- <li>If an extra value is added to a row, a <code>Parsing Error</code> occurs.</li>
96
- <li>It happens when we create a CSV file with the help of <code>text editors</code>.</li>
97
- <li>If we add an extra value to a row, no error is thrown; instead, a new column is created for the extra value and filled with <b>null</b> when converting from Excel to CSV.</li>
98
- </ul>
99
- """, unsafe_allow_html=True)
100
-
101
- st.markdown("""
102
- <p><b>Solution:</b> Use the <code>on_bad_lines</code> parameter in pandas:</p>
103
- <ul style="font-family: Arial; line-height: 1.6;">
104
- <li><code>"error"</code>: Stops the program and raises an error.</li>
105
- <li><code>"skip"</code>: Skips rows with errors.</li>
106
- <li><code>"warn"</code>: Skips rows with errors and shows the line numbers.</li>
107
- </ul>
108
- """, unsafe_allow_html=True)
109
-
110
- # Code example for handling parse errors
111
- st.code("""
112
-
113
- # Skip bad lines
114
- df = pd.read_csv("file.csv", on_bad_lines="skip")
115
-
116
- # Warn about bad lines
117
- df = pd.read_csv("file.csv", on_bad_lines="warn")
118
- """, language="python")
119
-
120
- st.subheader("c) Unicode Decode Error")
121
- st.markdown("""
122
- <ul style="font-family: Arial; line-height: 1.6;">
123
- <li>Each character, when saved, is represented by a unique number (ASCII/Unicode code point).</li>
124
- <li>ord("a") → 97, bin(97) → 0b1100001 (binary representation of 'a')</li>
125
- <li>Characters are saved in memory using a specific encoding, typically UTF-8 by default.</li>
126
- <li>Unicode Decode Error: Occurs when the system is unable to decode a file due to an incorrect or incompatible encoding.To solve this, you need to find the appropriate encoding for the file.</li>
127
- <li>Python uses utf-8 by default for encoding, but files may be saved with other encodings.</li>
128
- <li><code>Using the encodings module</code>: To explore the available encodings, you can import encodings in Python</li>
129
- <li>There are <code>326</code> different encoding aliases available in Python, which can be accessed via <code>encodings.aliases.aliases</code>.</li>
130
- </ul>
131
- """, unsafe_allow_html=True)
132
-
133
- # Code example for trying multiple encodings
134
- st.code("""
135
- import encodings
136
-
137
- # Get all encodings
138
- encodings_list = list(encodings.aliases.aliases.keys())
139
-
140
- # Try reading the file with different encodings
141
- for encoding in encodings_list:
142
- try:
143
- df = pd.read_csv("file.csv", encoding=encoding)
144
- print(f"Success with encoding: {encoding}")
145
- break
146
- except:
147
- pass # Skip to the next encoding
148
- """, language="python")
149
-
150
- st.subheader("Lookup Error:")
151
- st.markdown("""
152
- <ul style="font-family: Arial; line-height: 1.6;">
153
- <li>Occurs if you try to access an encoding that is not available or supported.</li>
154
- <li>Use a try-except block to handle it gracefully</li>
155
- </ul>
156
- """, unsafe_allow_html=True)
157
-
158
- st.code('''
159
- except LookupError:
160
- print("Incorrect Encoding: {}".format(y))
161
- ''')
162
-
163
- st.markdown("""
164
- <ul style="font-family: Arial; line-height: 1.6;">
165
- <li>After this, if we get a <code>Parse error</code>, solve it by adding the <code>on_bad_lines="skip"</code> parameter.</li>
166
- </ul>
167
- """, unsafe_allow_html=True)
168
-
169
- st.subheader("d) Handling Large CSV Files")
170
- st.markdown("""
171
- <ul style="font-family: Arial; line-height: 1.6;">
172
- <li>When working with large CSV files, the file might not fit into memory, leading to a <code>MemoryError</code>.</li>
173
- <li><code>Solution: Use chunksize to break the file into smaller chunks.</code></li>
174
- <li>To handle each chunk, you can iterate through the chunks and process them as needed.</li>
175
- </ul>
176
- """, unsafe_allow_html=True)
177
-
178
- # Code example for handling large files
179
- st.code("""
180
-
181
- chunk_size = 100
182
- chunks = pd.read_csv("large_file.csv", chunksize=chunk_size)
183
-
184
- for i, chunk in enumerate(chunks):
185
- print(f"Processing chunk {i + 1} with {chunk.shape[0]} rows")
186
- """, language="python")
187
-
188
- st.header("3. Summary")
189
- st.markdown("""
190
- <ul style="font-family: Arial; line-height: 1.6;">
191
- <li><b>Parse Errors:</b> Use <code>on_bad_lines</code> to handle them (<code>skip</code> or <code>warn</code>).</li>
192
- <li><b>Encoding Issues:</b> Try different encodings to fix <b>UnicodeDecodeError</b>.</li>
193
- <li><b>Large Files:</b> Use <code>chunksize</code> to process files in smaller parts.</li>
194
- </ul>
195
- """, unsafe_allow_html=True)
196
-
197
- # Button to go back to the main page
198
- if st.button("Back to Home"):
199
- st.session_state['page'] = "home"
200
-
201
-
202
- # Main page function
203
- def main_page():
204
- # Buttons for navigation
205
- if st.button("Go to Structured Data - Excel"):
206
- st.session_state['page'] = "excel_details"
207
- if st.button("Go to Semi-Structured Data - CSV"):
208
- st.session_state['page'] = "csv_details"
209
-
210
- # Initialize session state
211
- if 'page' not in st.session_state:
212
- st.session_state['page'] = "home"
213
-
214
- # Route to the appropriate page
215
- if st.session_state['page'] == "home":
216
- main_page()
217
- elif st.session_state['page'] == "excel_details":
218
- excel_details_page()
219
- elif st.session_state['page'] == "csv_details":
220
- csv_details_page()
 
1
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
+ # App title
4
+ st.title("Working with HTML Data using Python")
 
 
5
 
6
+ # HTML and DataFrames Section
7
+ st.header("HTML and DataFrames")
 
8
 
9
+ st.write("""
10
+ - **HTML (HyperText Markup Language)** is a semi-structured data format.
11
+ - HTML uses tags like `<table>`, `<tr>`, `<th>`, and `<td>` to structure tabular data.
12
+ - Unlike XML, HTML does not allow creating custom tags freely.
13
+ - Not all HTML content can be converted into dataframes, especially paragraph text or unstructured data.
14
+ - Typically, only table-related elements (`<table>`, `<tr>`, `<th>`, `<td>`) can be converted into dataframes.
15
+ """)
16
 
17
+ # Reading HTML Files Section
18
+ st.header("Reading HTML Files into DataFrames")
 
19
 
20
+ st.write("**Reading HTML Files:**")
21
+ st.code("""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  import pandas as pd
23
+ tables = pd.read_html(path_or_buffer)
24
+ """, language="python")
25
+
26
+ st.write("""
27
+ - **`pd.read_html(path_or_buffer)`** reads HTML files or websites containing tables.
28
+ - Extracts all tables and returns them as a list of dataframes.
29
+ """)
30
+
31
+ st.write("**Accessing Specific Tables:**")
32
+ st.code("""
33
+ # Accessing the first table from the list
34
+ table = tables[0]
35
+ """, language="python")
36
+
37
+ st.write("""
38
+ - Each table is stored in the list by index.
39
+ - Use indexing to select the table you want to work with.
40
+ """)
41
+
42
+ st.write("**Limitations:**")
43
+ st.write("""
44
+ - Not all websites or HTML files can be read, even if they have tables.
45
+ - Issues like authorization restrictions can prevent reading certain tables.
46
+ """)
47
+
48
+ st.write("**Using the `match` Parameter:**")
49
+ st.code("""
50
+ # Reading a specific table using the match parameter
51
+ tables = pd.read_html(path, match="keyword")
52
+ """, language="python")
53
+
54
+ st.write("""
55
+ - To locate specific tables, use `match="keyword"` while reading HTML.
56
+ - The `match` parameter searches for tables containing the specified keyword.
57
+ """)
58
+
59
+ # Exporting DataFrames Section
60
+ st.header("Exporting DataFrames to HTML")
61
+
62
+ st.write("**Exporting DataFrame to HTML:**")
63
+ st.code("""
64
+ # Exporting a dataframe to an HTML file
65
+ df.to_html("output.html")
66
+ """, language="python")
67
+
68
+ st.write("""
69
+ - Converts a dataframe into an HTML file.
70
+ - Saves the dataframe in an HTML-compatible table format at the specified path.
71
+ """)