Spaces:
Sleeping
Sleeping
Update pages/Data Collection.py
Browse files- pages/Data Collection.py +165 -4
pages/Data Collection.py
CHANGED
|
@@ -269,14 +269,175 @@ if st.session_state.current_page == "main":
|
|
| 269 |
|
| 270 |
# Pages for Each Format
|
| 271 |
elif st.session_state.current_page == "explore_excel":
|
|
|
|
| 272 |
st.markdown("""
|
| 273 |
-
<
|
| 274 |
""", unsafe_allow_html=True)
|
|
|
|
| 275 |
st.write("""
|
| 276 |
-
Excel is a
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
""")
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
elif st.session_state.current_page == "explore_images_video":
|
| 282 |
st.markdown("""
|
|
|
|
| 269 |
|
| 270 |
# Pages for Each Format
|
| 271 |
elif st.session_state.current_page == "explore_excel":
|
| 272 |
+
# Section about Excel
|
| 273 |
st.markdown("""
|
| 274 |
+
<h2 style="color: #BB3385;">Excel</h2>
|
| 275 |
""", unsafe_allow_html=True)
|
| 276 |
+
|
| 277 |
st.write("""
|
| 278 |
+
- **Excel** is a powerful spreadsheet software developed by Microsoft.
|
| 279 |
+
- It is widely used for:
|
| 280 |
+
- Data organization
|
| 281 |
+
- Analysis
|
| 282 |
+
- Visualization
|
| 283 |
+
- Key features include:
|
| 284 |
+
- Storing data in tabular format
|
| 285 |
+
- Performing complex calculations
|
| 286 |
+
- Creating charts
|
| 287 |
+
- Applying various data manipulation techniques
|
| 288 |
+
- Excel is an essential tool for managing and analyzing structured data in various industries.
|
| 289 |
""")
|
| 290 |
+
|
| 291 |
+
st.markdown("""
|
| 292 |
+
<h3 style="color: #5b2c6f;">Reading Excel Files in Python</h3>
|
| 293 |
+
""", unsafe_allow_html=True)
|
| 294 |
+
|
| 295 |
+
# Code example
|
| 296 |
+
st.code("""
|
| 297 |
+
import pandas as pd
|
| 298 |
+
|
| 299 |
+
# Read the Excel file
|
| 300 |
+
data = pd.read_excel('path_to_file.xlsx')
|
| 301 |
+
|
| 302 |
+
print(data.head()) # displays first 5 rows in excel file
|
| 303 |
+
""", language="python")
|
| 304 |
+
|
| 305 |
+
st.write("### Working with Sheets in Excel")
|
| 306 |
+
|
| 307 |
+
# Importing a Single Sheet
|
| 308 |
+
st.write("#### Importing a Single Excel Sheet")
|
| 309 |
+
st.code("""
|
| 310 |
+
df = pd.read_excel('path_to_file.xlsx', sheet_name=0)
|
| 311 |
+
print(df)
|
| 312 |
+
""", language="python")
|
| 313 |
+
|
| 314 |
+
# Importing Multiple Sheets
|
| 315 |
+
st.write("#### Importing Multiple Sheets from Excel")
|
| 316 |
+
st.code("""
|
| 317 |
+
df_dict = pd.read_excel('path_to_file.xlsx', sheet_name=[0, 1, 2])
|
| 318 |
+
for sheet, data in df_dict.items():
|
| 319 |
+
print(f"Sheet: {sheet}")
|
| 320 |
+
print(data.head())
|
| 321 |
+
""", language="python")
|
| 322 |
+
|
| 323 |
+
st.write("### Exporting Data to Excel Files")
|
| 324 |
+
|
| 325 |
+
# Exporting a Single DataFrame to Excel
|
| 326 |
+
st.write("#### Exporting a Single DataFrame")
|
| 327 |
+
st.code("""
|
| 328 |
+
data = pd.DataFrame({
|
| 329 |
+
'name': ['a', 'b', 'c', 'd'],
|
| 330 |
+
'age': [12, 23, 44, 43]
|
| 331 |
+
})
|
| 332 |
+
|
| 333 |
+
# Export the DataFrame to an Excel file
|
| 334 |
+
data.to_excel('single_sheet_output.xlsx', index=False)
|
| 335 |
+
""", language="python")
|
| 336 |
+
|
| 337 |
+
# Exporting Multiple DataFrames to Multiple Sheets
|
| 338 |
+
st.write("#### Exporting Multiple DataFrames to Different Sheets")
|
| 339 |
+
st.code("""
|
| 340 |
+
data1 = pd.DataFrame({
|
| 341 |
+
'name': ['a', 'b', 'c', 'd'],
|
| 342 |
+
'age': [12, 23, 44, 43]
|
| 343 |
+
})
|
| 344 |
+
|
| 345 |
+
data2 = pd.DataFrame({
|
| 346 |
+
'maths': [43, 32, 45, 45],
|
| 347 |
+
'science': [32, 54, 45, 13]
|
| 348 |
+
})
|
| 349 |
+
|
| 350 |
+
data3 = pd.DataFrame({
|
| 351 |
+
'hindi': [32, 45, 53, 53],
|
| 352 |
+
'english': [53, 32, 24, 65]
|
| 353 |
+
})
|
| 354 |
+
|
| 355 |
+
# Export multiple DataFrames to an Excel file with multiple sheets
|
| 356 |
+
with pd.ExcelWriter('multi_sheet_output.xlsx') as writer:
|
| 357 |
+
data1.to_excel(writer, sheet_name='Personal Info', index=False)
|
| 358 |
+
data2.to_excel(writer, sheet_name='Academic Scores', index=False)
|
| 359 |
+
data3.to_excel(writer, sheet_name='Language Scores', index=False)
|
| 360 |
+
""", language="python")
|
| 361 |
+
|
| 362 |
+
st.write("### Common Issues with Excel Files")
|
| 363 |
+
|
| 364 |
+
# 1. File Format Compatibility
|
| 365 |
+
st.write("#### 1. File Format Compatibility")
|
| 366 |
+
st.write("Excel files may come in different formats like `.xls` and `.xlsx`, which can lead to compatibility issues.")
|
| 367 |
+
st.code("""
|
| 368 |
+
data = pd.read_excel('file.xls', engine='xlrd') # For .xls files
|
| 369 |
+
data = pd.read_excel('file.xlsx', engine='openpyxl') # For .xlsx files
|
| 370 |
+
print(data.head())
|
| 371 |
+
""", language="python")
|
| 372 |
+
|
| 373 |
+
# 2. Encoding Issues
|
| 374 |
+
st.write("#### 2. Encoding Issues")
|
| 375 |
+
st.write("Sometimes Excel files might have special characters that cause encoding problems.")
|
| 376 |
+
st.code("""
|
| 377 |
+
data = pd.read_excel('file.xlsx', engine='openpyxl') # read_excel has no `encoding` argument; .xlsx text is already stored as Unicode
|
| 378 |
+
print(data.head())
|
| 379 |
+
""", language="python")
|
| 380 |
+
|
| 381 |
+
# 3. Missing or Incomplete Data
|
| 382 |
+
st.write("#### 3. Missing or Incomplete Data")
|
| 383 |
+
st.write("Missing values can lead to errors during data processing.")
|
| 384 |
+
st.code("""
|
| 385 |
+
data = pd.read_excel('file.xlsx')
|
| 386 |
+
data.fillna(0, inplace=True) # Replace NaN values with 0 or other defaults
|
| 387 |
+
print(data.head())
|
| 388 |
+
""", language="python")
|
| 389 |
+
|
| 390 |
+
# 4. Large File Sizes
|
| 391 |
+
st.write("#### 4. Large File Sizes")
|
| 392 |
+
st.write("Large Excel files may cause performance issues or run out of memory.")
|
| 393 |
+
st.code("""
|
| 394 |
+
chunk_size = 1000
|
| 395 |
+
for chunk in pd.read_excel('large_file.xlsx', chunksize=chunk_size):
|
| 396 |
+
print(chunk.head())
|
| 397 |
+
""", language="python")
|
| 398 |
+
|
| 399 |
+
# 5. Sheet Name Selection
|
| 400 |
+
st.write("#### 5. Sheet Name Selection")
|
| 401 |
+
st.write("Excel files may have multiple sheets, and reading the wrong one can lead to incorrect analysis.")
|
| 402 |
+
st.code("""
|
| 403 |
+
data = pd.read_excel('file.xlsx', sheet_name='Sheet1')
|
| 404 |
+
print(data.head())
|
| 405 |
+
""", language="python")
|
| 406 |
+
|
| 407 |
+
# 6. Data Type Conversion
|
| 408 |
+
st.write("#### 6. Data Type Conversion")
|
| 409 |
+
st.write("Excel files may have columns with inconsistent or incorrect data types.")
|
| 410 |
+
st.code("""
|
| 411 |
+
data = pd.read_excel('file.xlsx')
|
| 412 |
+
data['column_name'] = data['column_name'].astype(int)
|
| 413 |
+
print(data.dtypes)
|
| 414 |
+
""", language="python")
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
# 7. Merged Cells
|
| 418 |
+
st.write("#### 7. Merged Cells")
|
| 419 |
+
st.write("Merged cells in Excel can lead to missing or misaligned data.")
|
| 420 |
+
st.code("""
|
| 421 |
+
data = pd.read_excel('file.xlsx').ffill() # merged cells load as NaN except the top-left cell; forward-fill to repeat the value
|
| 422 |
+
print(data.head())
|
| 423 |
+
""", language="python")
|
| 424 |
+
|
| 425 |
+
|
| 426 |
+
# 8. Date Parsing
|
| 427 |
+
st.write("#### 8. Date Parsing")
|
| 428 |
+
st.write("Dates in Excel files may not be interpreted correctly.")
|
| 429 |
+
st.code("""
|
| 430 |
+
data = pd.read_excel('file.xlsx', parse_dates=['date_column'])
|
| 431 |
+
print(data.dtypes)
|
| 432 |
+
""", language="python")
|
| 433 |
+
|
| 434 |
+
col1 = st.columns(1)[0]
|
| 435 |
+
|
| 436 |
+
with col1:
|
| 437 |
+
if st.button("⬅️ Back to Previous Page"):
|
| 438 |
+
navigate_to("main")
|
| 439 |
+
|
| 440 |
+
|
| 441 |
|
| 442 |
elif st.session_state.current_page == "explore_images_video":
|
| 443 |
st.markdown("""
|