Spaces:
Sleeping
Sleeping
Update pages/hari.py
Browse files- pages/hari.py +66 -3
pages/hari.py
CHANGED
|
@@ -232,10 +232,73 @@ def html_details_page():
|
|
| 232 |
- Semi-structured data with nested tags.
|
| 233 |
- Libraries like `BeautifulSoup` help parse and extract information.
|
| 234 |
""")
|
| 235 |
-
st.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 236 |
|
| 237 |
-
|
| 238 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
|
| 240 |
# Unstructured Data - Image Page
|
| 241 |
def image_details_page():
|
|
|
|
| 232 |
- Semi-structured data with nested tags.
|
| 233 |
- Libraries like `BeautifulSoup` help parse and extract information.
|
| 234 |
""")
|
| 235 |
+
st.write("""
|
| 236 |
+
- **HTML (HyperText Markup Language)** is a semi-structured data format.
|
| 237 |
+
- HTML uses tags like `<table>`, `<tr>`, `<th>`, and `<td>` to structure tabular data.
|
| 238 |
+
- Unlike XML, HTML does not allow creating custom tags freely.
|
| 239 |
+
- Not all HTML content can be converted into dataframes, especially paragraph text or unstructured data.
|
| 240 |
+
- Typically, only table-related elements (`<table>`, `<tr>`, `<th>`, `<td>`) can be converted into dataframes.
|
| 241 |
+
""")
|
| 242 |
|
| 243 |
+
# Reading HTML Files Section
|
| 244 |
+
st.header("Reading HTML Files into DataFrames")
|
| 245 |
+
|
| 246 |
+
st.write("**Reading HTML Files:**")
|
| 247 |
+
st.code("""
|
| 248 |
+
import pandas as pd
|
| 249 |
+
tables = pd.read_html(path_or_buffer)
|
| 250 |
+
""", language="python")
|
| 251 |
+
|
| 252 |
+
st.write("""
|
| 253 |
+
- **`pd.read_html(path_or_buffer)`** reads HTML files or websites containing tables.
|
| 254 |
+
- Extracts all tables and returns them as a list of dataframes.
|
| 255 |
+
""")
|
| 256 |
+
|
| 257 |
+
st.write("**Accessing Specific Tables:**")
|
| 258 |
+
st.code("""
|
| 259 |
+
# Accessing the first table from the list
|
| 260 |
+
table = tables[0]
|
| 261 |
+
""", language="python")
|
| 262 |
+
|
| 263 |
+
st.write("""
|
| 264 |
+
- Each table is stored in the list by index.
|
| 265 |
+
- Use indexing to select the table you want to work with.
|
| 266 |
+
""")
|
| 267 |
+
|
| 268 |
+
st.write("**Limitations:**")
|
| 269 |
+
st.write("""
|
| 270 |
+
- Not all websites or HTML files can be read, even if they have tables.
|
| 271 |
+
- Issues like authorization restrictions can prevent reading certain tables.
|
| 272 |
+
""")
|
| 273 |
+
|
| 274 |
+
st.write("**Using the `match` Parameter:**")
|
| 275 |
+
st.code("""
|
| 276 |
+
# Reading a specific table using the match parameter
|
| 277 |
+
tables = pd.read_html(path, match="keyword")
|
| 278 |
+
""", language="python")
|
| 279 |
+
|
| 280 |
+
st.write("""
|
| 281 |
+
- To locate specific tables, use `match="keyword"` while reading HTML.
|
| 282 |
+
- The `match` parameter searches for tables containing the specified keyword.
|
| 283 |
+
""")
|
| 284 |
+
|
| 285 |
+
# Exporting DataFrames Section
|
| 286 |
+
st.header("Exporting DataFrames to HTML")
|
| 287 |
+
|
| 288 |
+
st.write("**Exporting DataFrame to HTML:**")
|
| 289 |
+
st.code("""
|
| 290 |
+
# Exporting a dataframe to an HTML file
|
| 291 |
+
df.to_html("output.html")
|
| 292 |
+
""", language="python")
|
| 293 |
+
|
| 294 |
+
st.write("""
|
| 295 |
+
- Converts a dataframe into an HTML file.
|
| 296 |
+
- Saves the dataframe in an HTML-compatible table format at the specified path.
|
| 297 |
+
""")
|
| 298 |
+
st.code('from bs4 import BeautifulSoup\nsoup = BeautifulSoup(open("file.html"))', language="python")
|
| 299 |
+
|
| 300 |
+
if st.button("Back to Home"):
|
| 301 |
+
st.session_state['page'] = "home"
|
| 302 |
|
| 303 |
# Unstructured Data - Image Page
|
| 304 |
def image_details_page():
|