hari3485 committed on
Commit
140a95d
·
verified ·
1 Parent(s): b8484bf

Update pages/hari.py

Browse files
Files changed (1) hide show
  1. pages/hari.py +66 -3
pages/hari.py CHANGED
@@ -232,10 +232,73 @@ def html_details_page():
232
  - Semi-structured data with nested tags.
233
  - Libraries like `BeautifulSoup` help parse and extract information.
234
  """)
235
- st.code('from bs4 import BeautifulSoup\nsoup = BeautifulSoup(open("file.html"))', language="python")
 
 
 
 
 
 
236
 
237
- if st.button("Back to Home"):
238
- st.session_state['page'] = "home"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
239
 
240
  # Unstructured Data - Image Page
241
  def image_details_page():
 
232
  - Semi-structured data with nested tags.
233
  - Libraries like `BeautifulSoup` help parse and extract information.
234
  """)
235
+ st.write("""
236
+ - **HTML (HyperText Markup Language)** is a semi-structured data format.
237
+ - HTML uses tags like `<table>`, `<tr>`, `<th>`, and `<td>` to structure tabular data.
238
+ - Unlike XML, HTML does not allow creating custom tags freely.
239
+ - Not all HTML content can be converted into dataframes, especially paragraph text or unstructured data.
240
+ - Typically, only table-related elements (`<table>`, `<tr>`, `<th>`, `<td>`) can be converted into dataframes.
241
+ """)
242
 
243
+ # Reading HTML Files Section
244
+ st.header("Reading HTML Files into DataFrames")
245
+
246
+ st.write("**Reading HTML Files:**")
247
+ st.code("""
248
+ import pandas as pd
249
+ tables = pd.read_html(path_or_buffer)
250
+ """, language="python")
251
+
252
+ st.write("""
253
+ - **`pd.read_html(path_or_buffer)`** reads HTML files or websites containing tables.
254
+ - Extracts all tables and returns them as a list of dataframes.
255
+ """)
256
+
257
+ st.write("**Accessing Specific Tables:**")
258
+ st.code("""
259
+ # Accessing the first table from the list
260
+ table = tables[0]
261
+ """, language="python")
262
+
263
+ st.write("""
264
+ - Each table is stored in the list by index.
265
+ - Use indexing to select the table you want to work with.
266
+ """)
267
+
268
+ st.write("**Limitations:**")
269
+ st.write("""
270
+ - Not all websites or HTML files can be read, even if they have tables.
271
+ - Issues like authorization restrictions can prevent reading certain tables.
272
+ """)
273
+
274
+ st.write("**Using the `match` Parameter:**")
275
+ st.code("""
276
+ # Reading a specific table using the match parameter
277
+ tables = pd.read_html(path, match="keyword")
278
+ """, language="python")
279
+
280
+ st.write("""
281
+ - To locate specific tables, use `match="keyword"` while reading HTML.
282
+ - The `match` parameter searches for tables containing the specified keyword.
283
+ """)
284
+
285
+ # Exporting DataFrames Section
286
+ st.header("Exporting DataFrames to HTML")
287
+
288
+ st.write("**Exporting DataFrame to HTML:**")
289
+ st.code("""
290
+ # Exporting a dataframe to an HTML file
291
+ df.to_html("output.html")
292
+ """, language="python")
293
+
294
+ st.write("""
295
+ - Converts a dataframe into an HTML file.
296
+ - Saves the dataframe in an HTML-compatible table format at the specified path.
297
+ """)
298
+ st.code('from bs4 import BeautifulSoup\nsoup = BeautifulSoup(open("file.html"))', language="python")
299
+
300
+ if st.button("Back to Home"):
301
+ st.session_state['page'] = "home"
302
 
303
  # Unstructured Data - Image Page
304
  def image_details_page():