Spaces:
Sleeping
Sleeping
Upload 2 files
Browse files- file/streamlit_CSV.ipynb +125 -0
- file/streamlit_JSON.ipynb +134 -0
file/streamlit_CSV.ipynb
ADDED
|
@@ -0,0 +1,125 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "markdown",
|
| 19 |
+
"source": [
|
| 20 |
+
"## Reading a CSV File"
|
| 21 |
+
],
|
| 22 |
+
"metadata": {
|
| 23 |
+
"id": "QqqobpZnHA4L"
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"source": [
|
| 29 |
+
"import pandas as pd\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"pd.read_csv(\"sample.csv\")"
|
| 32 |
+
],
|
| 33 |
+
"metadata": {
|
| 34 |
+
"id": "--u1kQ0zHQ17"
|
| 35 |
+
},
|
| 36 |
+
"execution_count": null,
|
| 37 |
+
"outputs": []
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "markdown",
|
| 41 |
+
"source": [
|
| 42 |
+
"## **1. ParserError:**\n",
|
| 43 |
+
"\n",
|
| 44 |
+
"- This error occurs when a row has more columns than expected.\n",
|
| 45 |
+
"- This error is mostly caused when the CSV is created in a text editor"
|
| 46 |
+
],
|
| 47 |
+
"metadata": {
|
| 48 |
+
"id": "y-cw10rIHREJ"
|
| 49 |
+
}
|
| 50 |
+
},
|
| 51 |
+
{
|
| 52 |
+
"cell_type": "code",
|
| 53 |
+
"source": [
|
| 54 |
+
"import pandas as pd\n",
|
| 55 |
+
"\n",
|
| 56 |
+
"pd.read_csv('sample.csv',on_bad_lines='warn')"
|
| 57 |
+
],
|
| 58 |
+
"metadata": {
|
| 59 |
+
"id": "kkN-ZyrbHRQF"
|
| 60 |
+
},
|
| 61 |
+
"execution_count": null,
|
| 62 |
+
"outputs": []
|
| 63 |
+
},
|
| 64 |
+
{
|
| 65 |
+
"cell_type": "markdown",
|
| 66 |
+
"source": [
|
| 67 |
+
"## **2. Encoding:**\n",
|
| 68 |
+
"\n",
|
| 69 |
+
"- Encoding is the process of translating characters, numbers, symbols, etc. into binary numbers.\n",
|
| 70 |
+
"- If the proper encoding is not used while reading a CSV, the characters will be decoded incorrectly, which will cause loss of information.\n",
|
| 71 |
+
"- Most CSV files are encoded in `UTF-8`, but not all"
|
| 72 |
+
],
|
| 73 |
+
"metadata": {
|
| 74 |
+
"id": "7KBX5a6oHRbu"
|
| 75 |
+
}
|
| 76 |
+
},
|
| 77 |
+
{
|
| 78 |
+
"cell_type": "code",
|
| 79 |
+
"source": [
|
| 80 |
+
"import pandas as pd\n",
|
| 81 |
+
"import encodings\n",
|
| 82 |
+
"l=encodings.aliases.aliases.keys() # list of all encodings\n",
|
| 83 |
+
"for y in l:\n",
|
| 84 |
+
" try:\n",
|
| 85 |
+
" pd.read_csv('sample.csv',encoding=y)\n",
|
| 86 |
+
" print('{} is a correct encoding'.format(y))\n",
|
| 87 |
+
" except UnicodeDecodeError:\n",
|
| 88 |
+
" print('{} is not a correct encoding'.format(y))\n",
|
| 89 |
+
" except LookupError:\n",
|
| 90 |
+
" print('{} is not supported'.format(y))"
|
| 91 |
+
],
|
| 92 |
+
"metadata": {
|
| 93 |
+
"id": "JO4cJrwFHRme"
|
| 94 |
+
},
|
| 95 |
+
"execution_count": null,
|
| 96 |
+
"outputs": []
|
| 97 |
+
},
|
| 98 |
+
{
|
| 99 |
+
"cell_type": "markdown",
|
| 100 |
+
"source": [
|
| 101 |
+
"## **3. Out of memory:**\n",
|
| 102 |
+
"\n",
|
| 103 |
+
"- If we don't have enough memory to load the dataset, we can divide it into chunks.\n",
|
| 104 |
+
"- Chunks are parts of the data; `chunksize` specifies the number of rows per chunk.\n",
|
| 105 |
+
"- If we have 100_00_000 rows & `chunksize` = 1000, the data will be divided into chunks of 1000 rows each.\n",
|
| 106 |
+
"- Its output will be an iterator (a `TextFileReader`), not a DataFrame"
|
| 107 |
+
],
|
| 108 |
+
"metadata": {
|
| 109 |
+
"id": "ZIVWCn22HRwj"
|
| 110 |
+
}
|
| 111 |
+
},
|
| 112 |
+
{
|
| 113 |
+
"cell_type": "code",
|
| 114 |
+
"source": [
|
| 115 |
+
"import pandas as pd\n",
|
| 116 |
+
"pd.read_csv('spam.csv', encoding='latin', chunksize= 100)"
|
| 117 |
+
],
|
| 118 |
+
"metadata": {
|
| 119 |
+
"id": "hooz_lCRHR5u"
|
| 120 |
+
},
|
| 121 |
+
"execution_count": null,
|
| 122 |
+
"outputs": []
|
| 123 |
+
}
|
| 124 |
+
]
|
| 125 |
+
}
|
file/streamlit_JSON.ipynb
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"nbformat": 4,
|
| 3 |
+
"nbformat_minor": 0,
|
| 4 |
+
"metadata": {
|
| 5 |
+
"colab": {
|
| 6 |
+
"provenance": []
|
| 7 |
+
},
|
| 8 |
+
"kernelspec": {
|
| 9 |
+
"name": "python3",
|
| 10 |
+
"display_name": "Python 3"
|
| 11 |
+
},
|
| 12 |
+
"language_info": {
|
| 13 |
+
"name": "python"
|
| 14 |
+
}
|
| 15 |
+
},
|
| 16 |
+
"cells": [
|
| 17 |
+
{
|
| 18 |
+
"cell_type": "markdown",
|
| 19 |
+
"source": [
|
| 20 |
+
"## **Reading JSON File**"
|
| 21 |
+
],
|
| 22 |
+
"metadata": {
|
| 23 |
+
"id": "4L2BwncXK7Uv"
|
| 24 |
+
}
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"cell_type": "code",
|
| 28 |
+
"source": [
|
| 29 |
+
"import pandas as pd\n",
|
| 30 |
+
"\n",
|
| 31 |
+
"pd.read_json(\"sample.json\")"
|
| 32 |
+
],
|
| 33 |
+
"metadata": {
|
| 34 |
+
"id": "sVqhFskxK-r5"
|
| 35 |
+
},
|
| 36 |
+
"execution_count": null,
|
| 37 |
+
"outputs": []
|
| 38 |
+
},
|
| 39 |
+
{
|
| 40 |
+
"cell_type": "markdown",
|
| 41 |
+
"source": [
|
| 42 |
+
"## **Handling Structured JSON**\n",
|
| 43 |
+
"\n",
|
| 44 |
+
"- The `orient` parameter in `pd.read_json()` specifies the format of JSON data being read:\n",
|
| 45 |
+
" - **\"split\"**: Dictionary format with keys as \"index\", \"columns\", and \"data\".\n",
|
| 46 |
+
" - **\"records\"**: List of dictionaries where each dictionary represents a row.\n",
|
| 47 |
+
" - **\"index\"**: Dictionary format with row indices as keys and dictionaries of column data as values.\n",
|
| 48 |
+
" - **\"columns\"**: Default format where keys are column names and values are arrays of data"
|
| 49 |
+
],
|
| 50 |
+
"metadata": {
|
| 51 |
+
"id": "clN_z7L8K-zD"
|
| 52 |
+
}
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"cell_type": "code",
|
| 56 |
+
"source": [
|
| 57 |
+
"import pandas as pd\n",
|
| 58 |
+
"\n",
|
| 59 |
+
"# Sample Structured JSON\n",
|
| 60 |
+
"structured_json = {\n",
|
| 61 |
+
" \"name\": [\"John\", \"Doe\", \"Jane\"],\n",
|
| 62 |
+
" \"age\": [30, 25, 28],\n",
|
| 63 |
+
" \"city\": [\"New York\", \"Los Angeles\", \"Chicago\"]\n",
|
| 64 |
+
"}\n",
|
| 65 |
+
"\n",
|
| 66 |
+
"# Reading JSON with different 'orient' values\n",
|
| 67 |
+
"df_default = pd.read_json('structured.json') # Default (columns)\n",
|
| 68 |
+
"df_split = pd.read_json('structured.json', orient='split')\n",
|
| 69 |
+
"df_index = pd.read_json('structured.json', orient='index')\n",
|
| 70 |
+
"\n",
|
| 71 |
+
"print(df_default)\n",
|
| 72 |
+
"print(df_split)\n",
|
| 73 |
+
"print(df_index)"
|
| 74 |
+
],
|
| 75 |
+
"metadata": {
|
| 76 |
+
"id": "yFW5vsHkK-6Q"
|
| 77 |
+
},
|
| 78 |
+
"execution_count": null,
|
| 79 |
+
"outputs": []
|
| 80 |
+
},
|
| 81 |
+
{
|
| 82 |
+
"cell_type": "markdown",
|
| 83 |
+
"source": [
|
| 84 |
+
"## **Handling Semi-Structured JSON**\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"- `pandas.json_normalize()` is used to flatten nested JSON objects into a DataFrame.\n",
|
| 87 |
+
" - **`record_path`**: Specifies the path in the JSON to extract records from nested lists.\n",
|
| 88 |
+
" - **`meta`**: Includes additional metadata fields from parent records.\n",
|
| 89 |
+
" - **`max_level`**: Limits the number of levels to flatten."
|
| 90 |
+
],
|
| 91 |
+
"metadata": {
|
| 92 |
+
"id": "-xtBajp5K_Ge"
|
| 93 |
+
}
|
| 94 |
+
},
|
| 95 |
+
{
|
| 96 |
+
"cell_type": "code",
|
| 97 |
+
"source": [
|
| 98 |
+
"import pandas as pd\n",
|
| 99 |
+
"import json\n",
|
| 100 |
+
"\n",
|
| 101 |
+
"# Sample Semi-Structured JSON\n",
|
| 102 |
+
"semi_structured_json = [\n",
|
| 103 |
+
" {\n",
|
| 104 |
+
" \"name\": \"John\",\n",
|
| 105 |
+
" \"age\": 30,\n",
|
| 106 |
+
" \"address\": {\"city\": \"New York\", \"zip\": \"10001\"},\n",
|
| 107 |
+
" \"skills\": [\"Python\", \"SQL\"]\n",
|
| 108 |
+
" },\n",
|
| 109 |
+
" {\n",
|
| 110 |
+
" \"name\": \"Jane\",\n",
|
| 111 |
+
" \"age\": 28,\n",
|
| 112 |
+
" \"address\": {\"city\": \"Chicago\", \"zip\": \"60601\"},\n",
|
| 113 |
+
" \"skills\": [\"Java\", \"C++\"]\n",
|
| 114 |
+
" }\n",
|
| 115 |
+
"]\n",
|
| 116 |
+
"\n",
|
| 117 |
+
"# Flattening nested JSON\n",
|
| 118 |
+
"df = pd.json_normalize(\n",
|
| 119 |
+
" semi_structured_json,\n",
|
| 120 |
+
" record_path=['skills'],\n",
|
| 121 |
+
" meta=['name', 'age', ['address', 'city'], ['address', 'zip']],\n",
|
| 122 |
+
" max_level=1\n",
|
| 123 |
+
")\n",
|
| 124 |
+
"\n",
|
| 125 |
+
"print(df)"
|
| 126 |
+
],
|
| 127 |
+
"metadata": {
|
| 128 |
+
"id": "7tfcZuiRK_Ln"
|
| 129 |
+
},
|
| 130 |
+
"execution_count": null,
|
| 131 |
+
"outputs": []
|
| 132 |
+
}
|
| 133 |
+
]
|
| 134 |
+
}
|