naveed92 committed on
Commit
9e4f60f
·
verified ·
1 Parent(s): 2c4c320

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +243 -22
app.py CHANGED
@@ -13,6 +13,9 @@ from st_keyup import st_keyup
13
 
14
  # What does persist = disk do ?
15
  # @st.cache_data(persist="disk")
 
 
 
16
  @st.cache_data
17
  def load_pandas_xlsx(path):
18
  data = pd.read_excel(path)
@@ -24,6 +27,8 @@ def build_company_df(input_df):
24
  output_df = input_df[['companyLabel', 'companyLabelJA', 'company']].drop_duplicates()
25
  return output_df
26
 
 
 
27
  @st.cache_data
28
  def build_industry_df(input_df):
29
  # Pre compute unique number of companies per industry
@@ -31,6 +36,8 @@ def build_industry_df(input_df):
31
  output_df = output_df.rename(columns={'company': 'n_competitors'})
32
  return output_df
33
 
 
 
34
  @st.cache_data
35
  def build_product_df(input_df):
36
  # Pre compute unique number of companies per product
@@ -38,6 +45,58 @@ def build_product_df(input_df):
38
  output_df = output_df.rename(columns={'company': 'n_competitors'})
39
  return output_df
40
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  def search_df(inp, df, col):
42
  mask = df[col].str.contains(inp, case=False, regex=False)
43
  select_df = df[mask]
@@ -45,9 +104,14 @@ def search_df(inp, df, col):
45
 
46
  ##### Data Logic #####
47
 
 
48
  COMPETITOR_PATH = 'data/merged_competitors_all_20241122.xlsx'
49
  INDUSTRY_PATH = 'data/industry_hierarchy_20241125.xlsx'
50
 
 
 
 
 
51
  # Load data
52
  with st.spinner(text="Loading competitor data ..."):
53
  competitor_df = load_pandas_xlsx(COMPETITOR_PATH)
@@ -65,41 +129,26 @@ industry_hierarchy = load_pandas_xlsx(INDUSTRY_PATH)
65
  ### Pre computation Steps ###
66
 
67
  # Pre compute unique number of companies per industry
68
- # industry_to_counts = competitor_df[['company', 'companyLabel', 'companyLabelJA', 'industry', 'industryLabel', 'industryLabelJA']].drop_duplicates().groupby(['industry', 'industryLabel', 'industryLabelJA'])['company'].count().sort_values(ascending=False).reset_index().copy()
69
- # industry_to_counts = industry_to_counts.rename(columns={'company': 'n_competitors'})
70
-
71
  industry_to_counts = build_industry_df(competitor_df)
72
 
73
  # Pre compute unique number of companies per product
74
- # product_to_counts = competitor_df[['company', 'companyLabel', 'companyLabelJA', 'product', 'productLabel', 'productLabelJA']].drop_duplicates().groupby(['product', 'productLabel', 'productLabelJA'])['company'].count().sort_values(ascending=False).reset_index().copy()
75
- # product_to_counts = product_to_counts.rename(columns={'company': 'n_competitors'})
76
-
77
  product_to_counts = build_product_df(competitor_df)
78
 
79
- ### end ###
80
-
81
- # Parse Data
82
- ###with st.spinner(text="Computing unique companies ..."):
83
- ### unique_companies = list(competitor_data.companyLabel.unique())
84
- ### st.success("Unique Companies Done!")
85
-
86
- import streamlit as st
87
 
88
  # Title
89
- st.title('3C Competitor Analysis Demo')
90
 
91
  option = st.selectbox(
92
- "Select Search Mode",
93
- ("By Company", "By Industry", "By Product")
94
  )
95
 
96
  st.write("You selected:", option)
97
 
98
-
99
  ##### App Logic #####
100
 
101
-
102
- if option == "By Company":
103
 
104
  st.title("Searching by Company")
105
 
@@ -201,7 +250,7 @@ if option == "By Company":
201
  st.dataframe(competitors_by_country)
202
 
203
 
204
- elif option == "By Industry":
205
 
206
  st.title("Searching by Industry")
207
 
@@ -278,7 +327,7 @@ elif option == "By Industry":
278
  st.graphviz_chart(graph)
279
 
280
 
281
- elif option == "By Product":
282
 
283
  st.title("Searching by Product")
284
 
@@ -308,6 +357,178 @@ elif option == "By Product":
308
  competitors = competitor_df[competitor_df['product'] == product['product']][['companyLabel', 'companyLabelJA', 'company', 'country', 'countryLabel']].drop_duplicates().copy()
309
  st.dataframe(competitors)
310
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
312
  else:
313
 
 
13
 
14
  # What does persist = disk do ?
15
  # @st.cache_data(persist="disk")
16
+
17
+ ### For competitor analysis
18
+
19
  @st.cache_data
20
  def load_pandas_xlsx(path):
21
  data = pd.read_excel(path)
 
27
  output_df = input_df[['companyLabel', 'companyLabelJA', 'company']].drop_duplicates()
28
  return output_df
29
 
30
+ ### For industry analysis
31
+
32
  @st.cache_data
33
  def build_industry_df(input_df):
34
  # Pre compute unique number of companies per industry
 
36
  output_df = output_df.rename(columns={'company': 'n_competitors'})
37
  return output_df
38
 
39
+ ### For product analysis
40
+
41
  @st.cache_data
42
  def build_product_df(input_df):
43
  # Pre compute unique number of companies per product
 
45
  output_df = output_df.rename(columns={'company': 'n_competitors'})
46
  return output_df
47
 
48
+ ### For customer analysis
49
+
50
+ @st.cache_data
51
+ def build_company_product_kg(company_product_path, product_manufacturer_path):
52
+ company_product_df = pd.read_csv(company_product_path)
53
+ product_manufacturer_df = pd.read_csv(product_manufacturer_path)
54
+
55
+ output_df = pd.concat([company_product_df, product_manufacturer_df])
56
+ return output_df
57
+
58
+ @st.cache_data
59
+ def build_company_df_2(input_df):
60
+ # build company df
61
+ output_df = input_df[['companyLabel', 'companyLabelJA', 'company']].drop_duplicates()
62
+ return output_df
63
+
64
+ property_mapping = {
65
+ 'http://www.wikidata.org/prop/direct/P186': 'made_from_material',
66
+ 'http://www.wikidata.org/prop/direct/P527': 'has_part',
67
+ 'http://www.wikidata.org/prop/direct/P2283': 'uses',
68
+ 'http://www.wikidata.org/prop/direct/P31': 'instance_of',
69
+ 'http://www.wikidata.org/prop/direct/P366': 'has_use',
70
+ 'http://www.wikidata.org/prop/direct/P361': 'part_of'
71
+ }
72
+
73
+ @st.cache_data
74
+ def build_product_kg_df():
75
+
76
+ product_df_1 = pd.read_csv('data/product_manufacturer_relations_haspart_uses_madefrom_out.csv')
77
+ product_df_2 = pd.read_csv('data/product_relations_haspart_uses_madefrom_out.csv')
78
+
79
+ product_df_3 = pd.read_csv('data/product_relations_partof_hasuse_out.csv')
80
+ product_df_4 = pd.read_csv('data/product_manufacturer_relations_hasuse_partof_out.csv')
81
+
82
+ product_kg_df = pd.concat([product_df_1, product_df_2, product_df_3, product_df_4]).drop_duplicates()
83
+ product_kg_df['propertyLabel'] = product_kg_df.propertyLabel.apply(lambda x: property_mapping[x])
84
+
85
+ return product_kg_df
86
+
87
+ @st.cache_data
88
+ def build_product_instance_df():
89
+
90
+ product_instance_df_1 = pd.read_csv('data/product_relations_instance_out.csv')
91
+ product_instance_df_2 = pd.read_csv('data/product_manufacturer_relations_instance_out.csv')
92
+
93
+ product_instance_df = pd.concat([product_instance_df_1, product_instance_df_2]).drop_duplicates()
94
+ product_instance_df['propertyLabel'] = product_instance_df.propertyLabel.apply(lambda x: property_mapping[x])
95
+
96
+ return product_instance_df
97
+
98
+ ### For searching
99
+
100
  def search_df(inp, df, col):
101
  mask = df[col].str.contains(inp, case=False, regex=False)
102
  select_df = df[mask]
 
104
 
105
  ##### Data Logic #####
106
 
107
+ # For competitor and industry analysis
108
  COMPETITOR_PATH = 'data/merged_competitors_all_20241122.xlsx'
109
  INDUSTRY_PATH = 'data/industry_hierarchy_20241125.xlsx'
110
 
111
+ # For customer analysis
112
+ COMPANY_PRODUCT_PATH = 'data/company_product_pairs.csv'
113
+ PRODUCT_MANUFACTURER_PATH = 'data/product_manufacturer_pair.csv'
114
+
115
  # Load data
116
  with st.spinner(text="Loading competitor data ..."):
117
  competitor_df = load_pandas_xlsx(COMPETITOR_PATH)
 
129
  ### Pre computation Steps ###
130
 
131
  # Pre compute unique number of companies per industry
 
 
 
132
  industry_to_counts = build_industry_df(competitor_df)
133
 
134
  # Pre compute unique number of companies per product
 
 
 
135
  product_to_counts = build_product_df(competitor_df)
136
 
137
+ ### end ###
 
 
 
 
 
 
 
138
 
139
  # Title
140
+ st.title('3C Competitor / Customer Analysis Demo')
141
 
142
  option = st.selectbox(
143
+ "Analysis Mode",
144
+ ("Customer Analysis", "Competitor Analysis", "Industry Analysis", "Product Analysis")
145
  )
146
 
147
  st.write("You selected:", option)
148
 
 
149
  ##### App Logic #####
150
 
151
+ if option == "Competitor Analysis":
 
152
 
153
  st.title("Searching by Company")
154
 
 
250
  st.dataframe(competitors_by_country)
251
 
252
 
253
+ elif option == "Industry Analysis":
254
 
255
  st.title("Searching by Industry")
256
 
 
327
  st.graphviz_chart(graph)
328
 
329
 
330
+ elif option == "Product Analysis":
331
 
332
  st.title("Searching by Product")
333
 
 
357
  competitors = competitor_df[competitor_df['product'] == product['product']][['companyLabel', 'companyLabelJA', 'company', 'country', 'countryLabel']].drop_duplicates().copy()
358
  st.dataframe(competitors)
359
 
360
+ elif option == "Customer Analysis":
361
+
362
+ # Load data
363
+ with st.spinner(text="Build company product knowledge graph ..."):
364
+ company_product_kg_df = build_company_product_kg(COMPANY_PRODUCT_PATH, PRODUCT_MANUFACTURER_PATH)
365
+ company_df_2 = build_company_df_2(company_product_kg_df)
366
+ st.success("Company Product Knowledge Graph Loaded!")
367
+
368
+ with st.spinner(text="Build product relationship knowledge graph ..."):
369
+ product_kg_df = build_product_kg_df()
370
+ st.success("Product Relationship Knowledge Graph Loaded!")
371
+
372
+ with st.spinner(text="Build product instance knowledge graph ..."):
373
+ product_instance_df = build_product_instance_df()
374
+ st.success("Product Instance Knowledge Graph Loaded!")
375
+
376
+ ### Search Start
377
+
378
+ st.title("Searching by Company")
379
+
380
+ # Get input
381
+ inp = st_keyup("Enter a company name", value="toshiba", key="0", debounce=500)
382
+
383
+ # Perform search
384
+ select_df = search_df(inp, company_df_2, 'companyLabel')
385
+
386
+ # def show_data():
387
+ # select_value = st.session_state.value
388
+ # row_id=select_value['selection']['rows'][0]
389
+ # st.write(company_df.iloc[row_id])
390
+
391
+ # Show search results
392
+ with st.status("Searching ...", state="running", expanded=False) as status:
393
+ status.update(label=f"{len(select_df)} results found", state="complete", expanded=True)
394
+
395
+ ### Selection for Company ###
396
+ st.dataframe(select_df, on_select="rerun", key="value", selection_mode="single-row")
397
+
398
+
399
+ # Expand if company is selected
400
+ select_value = st.session_state.value
401
+ if len(select_value['selection']['rows']) > 0:
402
+
403
+ st.title("Company Data")
404
+
405
+ row_id = select_value['selection']['rows'][0]
406
+
407
+ row = select_df.iloc[row_id]
408
+
409
+ entries = company_product_kg_df[company_product_kg_df.company == row.company]
410
+
411
+ st.write(f"Company Name: {row.companyLabel}")
412
+ st.write(f"Japanese Name: {row.companyLabelJA}")
413
+ st.write(f"Wikidata URL: {row.company}")
414
+
415
+ # st.write(f"Products or Services Provided: {list(set(list(entries.productLabel.unique()) + list(entries.productLabelJA.unique())))}")
416
+
417
+ st.write(f"Products and services provided by {row.companyLabel}")
418
+
419
+ product_select_df = company_product_kg_df[(company_product_kg_df.company == row.company) & (company_product_kg_df.propertyLabel == 'product_or_service_provided')][['productLabel', 'productLabelJA', 'product', 'company', 'companyLabel', 'companyLabelJA']]
420
+
421
+ ### Selection for Product ###
422
+ st.dataframe(product_select_df, on_select="rerun", key="product", selection_mode="single-row")
423
+ select_product = st.session_state.product
424
+
425
+
426
+ # expand if product is selected
427
+ if len(select_product['selection']['rows']) > 0:
428
+
429
+ product_id = select_product['selection']['rows'][0]
430
+
431
+ target_product = product_select_df.iloc[product_id]
432
+
433
+ # st.title(f"All Product Categories produced by {row.companyLabel}")
434
+ # st.dataframe(competitors)
435
+
436
+ # Hypothesis
437
+ # for incoming relations: 'uses' or 'has_part' is useful, since it lists services that have the selected product as a component
438
+ # for outgoing relations: 'has_use' and 'part_of' are useful, since they list services that have the selected product as a component
439
+
440
+
441
+ ####### Build kg paths
442
+
443
+ ### Step 1 ###
444
+
445
+ start_df = pd.DataFrame()
446
+ start_df[['company_start', 'companyLabel_start', 'companyLabelJA_start', 'product_start', 'productLabel_start', 'productLabelJA_start']] = [[target_product.company, target_product.companyLabel, target_product.companyLabelJA, target_product['product'], target_product.productLabel, target_product.productLabelJA]]
447
+
448
+ ### Step 2 ###
449
+
450
+ related_out_df = product_kg_df[(product_kg_df['product'] == target_product['product']) & (product_kg_df['propertyLabel'].apply(lambda x: x in ['has_use', 'part_of']))]
451
+ related_in_df = product_kg_df[(product_kg_df['object'] == target_product['product']) & (product_kg_df['propertyLabel'].apply(lambda x: x in ['uses', 'has_part']))]
452
+
453
+ path_df = pd.concat(
454
+ [
455
+ start_df.merge(related_out_df[['product', 'object', 'objectLabel', 'objectLabelJa']], left_on='product_start', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_second', 'objectLabel': 'productLabel_second', 'objectLabelJa': 'productLabelJa_second'}),
456
+ start_df.merge(related_in_df[['object', 'product', 'productLabel', 'productLabelJa']], left_on='product_start', right_on='object').drop(columns=['object']).rename(columns={'product': 'product_second', 'productLabel': 'productLabel_second', 'productLabelJa': 'productLabelJa_second'}),
457
+ ]
458
+ )
459
+
460
+ # merge 1
461
+
462
+ ### Step 3a ###
463
+
464
+ path_df_1 = path_df.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_second', right_on='product').drop(columns=['product'])
465
+
466
+ ### Step 3b ###
467
+
468
+ path_df_2 = path_df.merge(product_instance_df[['object', 'objectLabel', 'objectLabelJa', 'product']], left_on='product_second', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_third', 'objectLabel': 'productLabel_third', 'objectLabelJa': 'productLabelJa_third'})
469
+ path_df_2 = path_df_2.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_third', right_on='product').drop(columns=['product'])
470
+
471
+ ### Step 3c ###
472
+
473
+ path_df_3 = path_df.merge(product_instance_df[['object', 'objectLabel', 'objectLabelJa', 'product']], left_on='product_second', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_third', 'objectLabel': 'productLabel_third', 'objectLabelJa': 'productLabelJa_third'})
474
+ path_df_3 = path_df_3.merge(product_instance_df[['product', 'productLabel', 'productLabelJa', 'object']], left_on='product_third', right_on='object').drop(columns=['object']).rename(columns={'product': 'product_fourth', 'productLabel': 'productLabel_fourth', 'productLabelJa': 'productLabelJa_fourth'})
475
+ path_df_3 = path_df_3.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_fourth', right_on='product').drop(columns=['product'])
476
+
477
+ ### Step 5 ###
478
+
479
+ path_df_1['length'] = 4
480
+ path_df_2['length'] = 5
481
+ path_df_3['length'] = 6
482
+
483
+ final_path_df = pd.concat([path_df_1, path_df_2, path_df_3])
484
+
485
+ final_path_df = final_path_df.reset_index(drop=True)
486
+ final_path_df['path_id'] = final_path_df.index
487
+ #final_path_df = final_path_df.set_index('path_id', drop=False)
488
+
489
+ final_company_df = final_path_df[['path_id', 'company', 'companyLabel', 'companyLabelJA']].copy()
490
+
491
+ ### Step 6 ###
492
+
493
+ st.title(f"Potential Customers for {target_product.companyLabel} for product {target_product.productLabel}")
494
+
495
+ # st.dataframe(final_company_df)
496
+
497
+ st.dataframe(final_company_df, on_select="rerun", key="customer", selection_mode="single-row")
498
+ select_customer = st.session_state.customer
499
+
500
+ if len(select_customer['selection']['rows']) > 0:
501
+
502
+ customer_id = select_customer['selection']['rows'][0]
503
+ target_customer = final_company_df.iloc[customer_id]
504
+
505
+ customer_df = final_path_df[final_path_df.path_id == target_customer.path_id].iloc[0]
506
+
507
+ # import graphviz
508
+
509
+ # Create a graphviz Digraph object
510
+ graph = graphviz.Digraph()
511
+
512
+ graph.edge(customer_df.companyLabel_start, customer_df.productLabel_start, label=' produces')
513
+ graph.edge(customer_df.productLabel_start, customer_df.productLabel_second, label=' part of')
514
+
515
+ if customer_df.length == 4:
516
+ graph.edge(customer_df.productLabel_second, customer_df.companyLabel, label= ' produced by')
517
+
518
+ elif customer_df.length == 5:
519
+ graph.edge(customer_df.productLabel_second, customer_df.productLabel_third, label=' instance of')
520
+ graph.edge(customer_df.productLabel_third, customer_df.companyLabel, label= ' produced by')
521
+
522
+ if customer_df.length == 6:
523
+ graph.edge(customer_df.productLabel_second, customer_df.productLabel_third, label=' instance of')
524
+ graph.edge(customer_df.productLabel_fourth, customer_df.productLabel_third, ' instance of')
525
+ graph.edge(customer_df.productLabel_fourth, customer_df.companyLabel, label= ' produced by')
526
+
527
+ st.graphviz_chart(graph)
528
+
529
+
530
+
531
+
532
 
533
  else:
534