Spaces:
Build error
Build error
Commit ·
c4b27ba
1
Parent(s): 9860d55
First Commit
Browse files- .gitattributes +1 -0
- README.md +5 -11
- app.py +64 -0
- create_object.py +160 -0
- data/online_retail.csv +3 -0
- requirements.txt +5 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
data/online_retail.csv filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,12 +1,6 @@
|
|
| 1 |
-
-
|
| 2 |
-
title: Streamlit ID-POS
|
| 3 |
-
emoji: 🌍
|
| 4 |
-
colorFrom: blue
|
| 5 |
-
colorTo: yellow
|
| 6 |
-
sdk: streamlit
|
| 7 |
-
sdk_version: 1.35.0
|
| 8 |
-
app_file: app.py
|
| 9 |
-
pinned: false
|
| 10 |
-
---
|
| 11 |
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ID-POS分析システム
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
+
## 利用データ
|
| 4 |
+
|
| 5 |
+
Online Retail Dataset
|
| 6 |
+
https://www.kaggle.com/datasets/ulrikthygepedersen/online-retail-dataset
|
app.py
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import streamlit as st
|
| 2 |
+
import pandas as pd
|
| 3 |
+
import datetime
|
| 4 |
+
|
| 5 |
+
import create_object as co
|
| 6 |
+
|
| 7 |
+
import duckdb
|
| 8 |
+
# Streamlit requires set_page_config to be the first Streamlit command of the
# script run, so it is issued before the cached loader below is called.
st.set_page_config(
    page_title="購買データ分析App",
    layout="wide",
)


@st.cache_data
def _load_sales_data(csv_path="data/online_retail.csv"):
    """Load the retail CSV and keep only rows with positive sales.

    Cached with ``st.cache_data`` so the file is parsed once per server
    process instead of on every Streamlit rerun (each widget interaction
    re-executes this script from the top).

    Args:
        csv_path: Path of the CSV file to read.

    Returns:
        A DataFrame with ``CustomerID`` cast to ``object`` dtype and every row
        whose ``UnitPrice * Quantity`` is not strictly positive removed.
    """
    raw = pd.read_csv(csv_path)

    # Data-type cleanup: CustomerID is an identifier, not a number, so it is
    # cast to object dtype (the original note describes this as treating ID
    # columns as strings).
    raw = raw.astype({'CustomerID': 'object'})

    # Keep only rows with a positive line total.
    return raw[raw["UnitPrice"] * raw["Quantity"] > 0]


# NOTE: keep the module-level name `df`. The SQL built in create_object says
# "FROM df" and no table is registered explicitly, so DuckDB is presumably
# resolving that name to this DataFrame variable — verify before renaming.
df = _load_sales_data()

country_list = df["Country"].unique()
|
| 20 |
+
|
| 21 |
+
def main():
    """Render the page: a sidebar filter form, then the selected analysis.

    The sidebar form collects the analysis menu, an optional country filter
    and a date range. After the form is submitted, the SQL for the chosen
    analysis is built by ``co.create_sql`` and run with DuckDB. A chart is
    drawn when one is available, followed by the first 100 result rows and a
    CSV download button for the full result.
    """
    st.title("購買データ分析App")

    with st.sidebar.form(key="my_form"):
        analysis_menu = st.selectbox("分析メニュー", co.analysis_menu_list)

        selected_countries = st.multiselect("国を選択してください。", country_list)
        if selected_countries:
            # The names are spliced into a SQL IN (...) list, so any single
            # quote inside a name is doubled to keep the literal well-formed.
            quoted = "','".join(
                str(name).replace("'", "''") for name in selected_countries
            )
            country = f"Country in ('{quoted}')"
        else:
            # No selection means no country restriction.
            country = "True"

        st.write("2010年の日付を入れてください。")
        start_date = st.date_input("開始日", datetime.date(2010, 1, 1))
        end_date = st.date_input("終了日", datetime.date(2010, 12, 31))

        submit_button = st.form_submit_button(label="分析開始")

    if submit_button:
        # Run the selected analysis.
        sql = co.create_sql(analysis_menu, country, start_date, end_date)
        df_output = duckdb.query(sql).to_df()

        # The chart is best-effort: the table below is shown even if no chart
        # can be drawn. `except Exception` (rather than a bare `except`) keeps
        # that behaviour without swallowing KeyboardInterrupt, SystemExit or
        # other BaseException-derived control-flow signals.
        try:
            fig = co.create_graph(analysis_menu, df_output)
            if fig is not None:
                st.plotly_chart(fig)
        except Exception:
            print("グラフ無し")

        st.table(df_output.head(100))
        st.write("上位100行まで、全体を見たい場合はCSVでダウンロードしてください。")

        st.download_button(
            "Press to Download",
            df_output.to_csv(index=False).encode('utf-8'),
            "file.csv",
            "text/csv",
            key='download-csv'
        )


if __name__ == '__main__':
    main()
|
create_object.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import plotly.graph_objects as go
|
| 2 |
+
import plotly.express as px
|
| 3 |
+
|
| 4 |
+
# 分析メニュー
|
| 5 |
+
analysis_menu_list = ["ABC分析", "バスケット分析","時系列分析"]
|
| 6 |
+
|
| 7 |
+
# 分析メニューごとのSQL
|
| 8 |
+
def create_sql(analysis_menu, country, start_date, end_date):
    """Build the SQL text for the selected analysis menu.

    Every query reads from a relation named ``df`` and restricts rows to
    ``CAST(InvoiceDate AS DATE)`` between ``start_date`` and ``end_date``
    combined with the ``country`` predicate.

    Args:
        analysis_menu: One of "ABC分析", "バスケット分析" or "時系列分析".
        country: A SQL boolean expression inserted verbatim after ``AND``
            (for example ``"True"`` or ``"Country in ('Japan')"``). It is not
            escaped here, so the caller must pass a well-formed predicate.
        start_date: Start of the period; formatted with ``str()`` into a
            ``DATETIME '...'`` literal.
        end_date: End of the period; formatted the same way.

    Returns:
        The SQL query as a string.

    Raises:
        ValueError: If ``analysis_menu`` is not one of the supported menus.
    """
    # Row filter shared by every query below.
    row_filter = (
        f"CAST(InvoiceDate AS DATE) BETWEEN DATETIME '{start_date}' AND DATETIME '{end_date}'\n"
        f"                AND {country}"
    )

    if analysis_menu == "ABC分析":
        # ABC analysis: per-product sales total, cumulative total in descending
        # order, and a rank of A / B / C at the 70% / 90% cumulative thresholds.
        # NOTE: two in-SQL comments are stale. The period is not fixed to
        # Jan-Jun 2011 (it comes from start_date / end_date), and t_standard
        # only computes the grand total, not the 70% / 90% amounts.
        sql = f"""
        WITH
        t_base AS(
            -- 商品コードごとの売上(単価×個数)の合計値を算出
            -- 期間を、2011年1~6月に絞る
            SELECT
                StockCode, Description,
                SUM(UnitPrice * Quantity) AS SalesTotal
            FROM df
            WHERE {row_filter}
            GROUP BY StockCode, Description
        ),
        t_standard AS(
            -- 全体の売上のうち、70%を占める売上額・90%を占める売上額を算出
            SELECT
                SUM(SalesTotal) AS Sum_SalesTotal,
            FROM t_base
        ),
        t_cumulative AS(
            -- 売上を降順(高い順)でソートし、先頭からの累計売上額を算出
            SELECT
                StockCode,
                Description,
                SalesTotal,
                SUM(SalesTotal) OVER (ORDER BY SalesTotal DESC) AS SalesCumulative
            FROM t_base
            ORDER BY SalesTotal DESC
        )

        SELECT
            StockCode,
            Description,
            SalesTotal,
            SalesCumulative,
            SalesCumulative / Sum_SalesTotal AS Percentage_SalesCumulative,
            -- 累計売上額が売上合計の70%以下の場合はランクA、90%以下の場合はランクB、それ以降はランクCとしてランク付け
            CASE
                WHEN SalesCumulative / Sum_SalesTotal <= 0.7 THEN 'A'
                WHEN SalesCumulative / Sum_SalesTotal <= 0.9 THEN 'B'
                ELSE 'C'
            END AS SalesRank

        FROM t_cumulative
        FULL OUTER JOIN t_standard
        ON TRUE
        ORDER BY SalesTotal desc
        """

    elif analysis_menu == "バスケット分析":
        # Basket analysis: purchaser counts per product, counts of customers
        # who bought two different products on the same date, and the derived
        # rates (CombinedSalesRate, Lift). The output is limited to pairs among
        # the ten products with the most purchasers.
        sql = f"""
        WITH
        t_all AS(
            -- 総来店者数
            SELECT
                COUNT(DISTINCT CustomerID) AS Num_of_All
            FROM df
            WHERE {row_filter}
        ),

        t_purchaser AS(
            -- 商品ごとの購入者数
            SELECT
                CAST(StockCode AS STRING) AS ProductID, Description, COUNT(DISTINCT CustomerID) AS Num_of_Purchaser
            FROM df
            WHERE {row_filter}
            GROUP BY ProductID, Description
        ),

        t_simultaneous_purchaser_pre AS(
            -- 商品ごとの同時購入者
            SELECT
                DISTINCT CAST(StockCode AS STRING) AS ProductID, Description, CAST(InvoiceDate AS DATE) Purchase_date, CustomerID
            FROM df
            WHERE {row_filter}
        ),

        t_simultaneous_purchaser AS(
            -- 商品ごとの同時購入者数
            SELECT
                t1.ProductID as ProductID_A, t1.Description AS Description_A, t2.ProductID as ProductID_B, t2.Description AS Description_B, COUNT(DISTINCT CustomerID) as Num_of_Simultaneous_Purchaser
            FROM t_simultaneous_purchaser_pre as t1
            INNER JOIN t_simultaneous_purchaser_pre as t2
            USING(Purchase_date, CustomerID)
            WHERE t1.ProductID != t2.ProductID
            GROUP BY t1.ProductID, t1.Description, t2.ProductID, t2.Description
        )

        SELECT
            ProductID_A, Description_A,
            t_purchaser.Num_of_Purchaser AS Num_of_Purchaser_A,
            t_purchaser.Num_of_Purchaser / Num_of_All AS PurchaseRate_A,
            ProductID_B, Description_B,
            t2.Num_of_Purchaser AS Num_of_Purchaser_B,
            t2.Num_of_Purchaser / Num_of_All AS PurchaseRate_B,
            Num_of_Simultaneous_Purchaser,
            Num_of_Simultaneous_Purchaser / t_purchaser.Num_of_Purchaser AS CombinedSalesRate,
            (Num_of_Simultaneous_Purchaser / t_purchaser.Num_of_Purchaser) / (t2.Num_of_Purchaser / Num_of_All) AS Lift

        FROM t_purchaser
        LEFT OUTER JOIN t_simultaneous_purchaser
        ON t_purchaser.ProductID = t_simultaneous_purchaser.ProductID_A
        AND t_purchaser.Description = t_simultaneous_purchaser.Description_A

        LEFT OUTER JOIN t_purchaser as t2
        ON t_simultaneous_purchaser.ProductID_B = t2.ProductID
        AND t_simultaneous_purchaser.Description_B = t2.Description

        FULL OUTER JOIN t_all
        ON True

        -- データが多くなりすぎるので、上位10商品同士の組み合わせに限定
        WHERE
            Description_A in (SELECT Description FROM t_purchaser ORDER BY Num_of_Purchaser DESC LIMIT 10)
        AND
            Description_B in (SELECT Description FROM t_purchaser ORDER BY Num_of_Purchaser DESC LIMIT 10)
        """

    elif analysis_menu == "時系列分析":
        # Time series: per-day distinct purchasers, total quantity and sales.
        sql = f"""
        SELECT
            CAST(InvoiceDate AS DATE) AS YearMonthDate,
            COUNT(DISTINCT CustomerID) AS Num_of_Purchaser,
            SUM(Quantity) AS Total_of_Amount,
            SUM(UnitPrice * Quantity) AS SalesTotal
        FROM df
        WHERE {row_filter}
        GROUP BY YearMonthDate
        ORDER BY YearMonthDate
        """

    else:
        # Previously an unknown menu reached `return sql` with `sql` unbound
        # and failed with an UnboundLocalError; fail with a clear message.
        raise ValueError(f"unsupported analysis menu: {analysis_menu!r}")

    return sql
|
| 146 |
+
|
| 147 |
+
|
| 148 |
+
# 分析メニューごとのグラフ
|
| 149 |
+
def create_graph(analysis_menu, df):
    """Build the Plotly figure for the selected analysis menu, if any.

    Args:
        analysis_menu: The selected menu label.
        df: Query result for that menu. "バスケット分析" reads the columns
            ``Description_A``, ``Description_B`` and ``CombinedSalesRate``;
            "時系列分析" reads ``YearMonthDate`` and ``Total_of_Amount``.

    Returns:
        A Plotly figure for "バスケット分析" (heatmap) or "時系列分析" (line
        chart); ``None`` for any other menu, including "ABC分析", which has no
        chart defined.
    """
    if analysis_menu == "バスケット分析":
        # Heatmap of the combined-sales rate for each product pair.
        ordered = df.sort_values(
            ["Description_A", "Description_B"], ascending=[True, False]
        ).reset_index()
        return go.Figure([go.Heatmap(z=ordered.CombinedSalesRate,
                                     x=ordered.Description_A.values,
                                     y=ordered.Description_B.values)])

    if analysis_menu == "時系列分析":
        # Line chart of total quantity per date.
        return px.line(df, x='YearMonthDate', y='Total_of_Amount')

    # No chart for this menu. Previously this path reached `return fig` with
    # `fig` unbound and raised UnboundLocalError.
    return None
|
data/online_retail.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c820e928a9cb01d05738b0c36b5033ef661eccfb82f09f2e5ce8542da73b0b99
|
| 3 |
+
size 48581636
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit==1.22.0
|
| 2 |
+
pandas==2.0.3
|
| 3 |
+
datetime==5.5
|
| 4 |
+
duckdb==0.10.0
|
| 5 |
+
plotly
|