Spaces:

naveed92
/

kg_competitor_analysis

Sleeping

App Files Files Community

naveed92 committed on Dec 17, 2024

Commit

9e4f60f

verified ·

1 Parent(s): 2c4c320

Update app.py

Browse files

Files changed (1) hide show

app.py +243 -22

app.py CHANGED Viewed

@@ -13,6 +13,9 @@ from st_keyup import st_keyup
 # What does persist = disk do ?
 # @st.cache_data(persist="disk")
 @st.cache_data
 def load_pandas_xlsx(path):
 	data = pd.read_excel(path)
@@ -24,6 +27,8 @@ def build_company_df(input_df):
 	output_df = input_df[['companyLabel', 'companyLabelJA', 'company']].drop_duplicates()
 	return output_df
 @st.cache_data
 def build_industry_df(input_df):
 	# Pre compute unique number of companies per industry
@@ -31,6 +36,8 @@ def build_industry_df(input_df):
 	output_df = output_df.rename(columns={'company': 'n_competitors'})
 	return output_df
 @st.cache_data
 def build_product_df(input_df):
 	# Pre compute unique number of companies per product
@@ -38,6 +45,58 @@ def build_product_df(input_df):
 	output_df = output_df.rename(columns={'company': 'n_competitors'})
 	return output_df
 def search_df(inp, df, col):
 	mask = df[col].str.contains(inp, case=False, regex=False)
 	select_df = df[mask]
@@ -45,9 +104,14 @@ def search_df(inp, df, col):
 ##### Data Logic #####
 COMPETITOR_PATH = 'data/merged_competitors_all_20241122.xlsx'
 INDUSTRY_PATH = 'data/industry_hierarchy_20241125.xlsx'
 # Load data
 with st.spinner(text="Loading competitor data ..."):
 	competitor_df = load_pandas_xlsx(COMPETITOR_PATH)
@@ -65,41 +129,26 @@ industry_hierarchy = load_pandas_xlsx(INDUSTRY_PATH)
 ### Pre computation Steps ###
 # Pre compute unique number of companies per industry
-# industry_to_counts = competitor_df[['company', 'companyLabel', 'companyLabelJA', 'industry', 'industryLabel', 'industryLabelJA']].drop_duplicates().groupby(['industry', 'industryLabel', 'industryLabelJA'])['company'].count().sort_values(ascending=False).reset_index().copy()
-# industry_to_counts = industry_to_counts.rename(columns={'company': 'n_competitors'})
 industry_to_counts = build_industry_df(competitor_df)
 # Pre compute unique number of companies per product
-# product_to_counts = competitor_df[['company', 'companyLabel', 'companyLabelJA', 'product', 'productLabel', 'productLabelJA']].drop_duplicates().groupby(['product', 'productLabel', 'productLabelJA'])['company'].count().sort_values(ascending=False).reset_index().copy()
-# product_to_counts = product_to_counts.rename(columns={'company': 'n_competitors'})
 product_to_counts = build_product_df(competitor_df)
-### end ###
-# Parse Data
-###with st.spinner(text="Computing unique companies ..."):
-###	unique_companies = list(competitor_data.companyLabel.unique())
-### st.success("Unique Companies Done!")
-import streamlit as st
 # Title
-st.title('3C Competitor Analysis Demo')
 option = st.selectbox(
-    "Select Search Mode",
-    ("By Company", "By Industry", "By Product")
 )
 st.write("You selected:", option)
 ##### App Logic #####
-if option == "By Company":
 	st.title("Searching by Company")
@@ -201,7 +250,7 @@ if option == "By Company":
 				st.dataframe(competitors_by_country)
-elif option == "By Industry":
 	st.title("Searching by Industry")
@@ -278,7 +327,7 @@ elif option == "By Industry":
 			st.graphviz_chart(graph)
-elif option == "By Product":
 	st.title("Searching by Product")
@@ -308,6 +357,178 @@ elif option == "By Product":
 			competitors = competitor_df[competitor_df['product'] == product['product']][['companyLabel', 'companyLabelJA', 'company', 'country', 'countryLabel']].drop_duplicates().copy()
 			st.dataframe(competitors)
 else:

 # What does persist = disk do ?
 # @st.cache_data(persist="disk")
+### For competitor analysis
 @st.cache_data
 def load_pandas_xlsx(path):
 	data = pd.read_excel(path)
 	output_df = input_df[['companyLabel', 'companyLabelJA', 'company']].drop_duplicates()
 	return output_df
+### For industry analysis
 @st.cache_data
 def build_industry_df(input_df):
 	# Pre compute unique number of companies per industry
 	output_df = output_df.rename(columns={'company': 'n_competitors'})
 	return output_df
+### For product analysis
 @st.cache_data
 def build_product_df(input_df):
 	# Pre compute unique number of companies per product
 	output_df = output_df.rename(columns={'company': 'n_competitors'})
 	return output_df
+### For customer analysis
+# Build the combined company/product table used by the customer analysis.
+# Reads the two CSV files at the given paths with pandas and stacks their
+# rows into a single DataFrame. The function is wrapped in st.cache_data.
+#   company_product_path: path of the first CSV handed to pd.read_csv
+#   product_manufacturer_path: path of the second CSV handed to pd.read_csv
+# Returns the concatenated DataFrame.
+@st.cache_data
+def build_company_product_kg(company_product_path, product_manufacturer_path):
+    company_product_df = pd.read_csv(company_product_path)
+    product_manufacturer_df = pd.read_csv(product_manufacturer_path)
+    # pd.concat is called without ignore_index, so each input keeps its own
+    # row labels and index values can repeat in the result.
+    # NOTE(review): presumably both files share the same columns - confirm.
+    output_df = pd.concat([company_product_df, product_manufacturer_df])
+    return output_df
+# Build the table of distinct companies from input_df.
+#   input_df: DataFrame that must contain the columns 'companyLabel',
+#             'companyLabelJA' and 'company'
+# Returns a DataFrame with only those three columns and duplicate rows removed.
+# NOTE(review): the body appears identical to build_company_df - confirm
+# whether a separate function is still needed.
+@st.cache_data
+def build_company_df_2(input_df):
+    # Keep only the company identity columns, one row per distinct combination
+    output_df = input_df[['companyLabel', 'companyLabelJA', 'company']].drop_duplicates()
+    return output_df
+# Maps Wikidata direct-property URIs to the short relation names used in the
+# app. build_product_kg_df and build_product_instance_df index this dict with
+# every value of their 'propertyLabel' column, so a URI that is missing here
+# raises KeyError in those functions.
+property_mapping = {
+    'http://www.wikidata.org/prop/direct/P186': 'made_from_material',
+    'http://www.wikidata.org/prop/direct/P527': 'has_part',
+    'http://www.wikidata.org/prop/direct/P2283': 'uses',
+    'http://www.wikidata.org/prop/direct/P31': 'instance_of',
+    'http://www.wikidata.org/prop/direct/P366': 'has_use',
+    'http://www.wikidata.org/prop/direct/P361': 'part_of'
+}
+# Build the product-to-product relation table.
+# Reads four hard-coded CSV files under data/, stacks them, removes duplicate
+# rows and rewrites the 'propertyLabel' column through property_mapping.
+# Takes no arguments; returns the resulting DataFrame.
+# Raises KeyError if a 'propertyLabel' value is not a key of property_mapping.
+@st.cache_data
+def build_product_kg_df():
+    product_df_1 = pd.read_csv('data/product_manufacturer_relations_haspart_uses_madefrom_out.csv')
+    product_df_2 = pd.read_csv('data/product_relations_haspart_uses_madefrom_out.csv')
+    product_df_3 = pd.read_csv('data/product_relations_partof_hasuse_out.csv')
+    product_df_4 = pd.read_csv('data/product_manufacturer_relations_hasuse_partof_out.csv')
+    # Stack all four sources and drop rows that are exact duplicates
+    product_kg_df = pd.concat([product_df_1, product_df_2, product_df_3, product_df_4]).drop_duplicates()
+    # Replace each property value with its short name from property_mapping
+    product_kg_df['propertyLabel'] = product_kg_df.propertyLabel.apply(lambda x: property_mapping[x])
+    return product_kg_df
+# Build the product instance relation table.
+# Reads two hard-coded CSV files under data/, stacks them, removes duplicate
+# rows and rewrites the 'propertyLabel' column through property_mapping.
+# Takes no arguments; returns the resulting DataFrame.
+# Raises KeyError if a 'propertyLabel' value is not a key of property_mapping.
+@st.cache_data
+def build_product_instance_df():
+    product_instance_df_1 = pd.read_csv('data/product_relations_instance_out.csv')
+    product_instance_df_2 = pd.read_csv('data/product_manufacturer_relations_instance_out.csv')
+    # Stack both sources and drop rows that are exact duplicates
+    product_instance_df = pd.concat([product_instance_df_1, product_instance_df_2]).drop_duplicates()
+    # Replace each property value with its short name from property_mapping
+    product_instance_df['propertyLabel'] = product_instance_df.propertyLabel.apply(lambda x: property_mapping[x])
+    return product_instance_df
+### For searching
 def search_df(inp, df, col):
 	mask = df[col].str.contains(inp, case=False, regex=False)
 	select_df = df[mask]
 ##### Data Logic #####
+# For competitor and industry analysis
 COMPETITOR_PATH = 'data/merged_competitors_all_20241122.xlsx'
 INDUSTRY_PATH = 'data/industry_hierarchy_20241125.xlsx'
+# For customer analysis
+COMPANY_PRODUCT_PATH = 'data/company_product_pairs.csv'
+PRODUCT_MANUFACTURER_PATH = 'data/product_manufacturer_pair.csv'
 # Load data
 with st.spinner(text="Loading competitor data ..."):
 	competitor_df = load_pandas_xlsx(COMPETITOR_PATH)
 ### Pre computation Steps ###
 # Pre compute unique number of companies per industry
 industry_to_counts = build_industry_df(competitor_df)
 # Pre compute unique number of companies per product
 product_to_counts = build_product_df(competitor_df)
+### end ###
 # Title
+st.title('3C Competitor / Customer Analysis Demo')
 option = st.selectbox(
+    "Analysis Mode",
+    ("Customer Analysis", "Competitor Analysis", "Industry Analysis", "Product Analysis")
 )
 st.write("You selected:", option)
 ##### App Logic #####
+if option == "Competitor Analysis":
 	st.title("Searching by Company")
 				st.dataframe(competitors_by_country)
+elif option == "Industry Analysis":
 	st.title("Searching by Industry")
 			st.graphviz_chart(graph)
+elif option == "Product Analysis":
 	st.title("Searching by Product")
 			competitors = competitor_df[competitor_df['product'] == product['product']][['companyLabel', 'companyLabelJA', 'company', 'country', 'countryLabel']].drop_duplicates().copy()
 			st.dataframe(competitors)
+elif option == "Customer Analysis":
+	# Load data
+	with st.spinner(text="Build company product knowledge graph ..."):
+		company_product_kg_df = build_company_product_kg(COMPANY_PRODUCT_PATH, PRODUCT_MANUFACTURER_PATH)
+		company_df_2 = build_company_df_2(company_product_kg_df)
+	st.success("Company Product Knowledge Graph Loaded!")
+	with st.spinner(text="Build product relationship knowledge graph ..."):
+		product_kg_df = build_product_kg_df()
+	st.success("Product Relationship Knowledge Graph Loaded!")
+	with st.spinner(text="Build product instance knowledge graph ..."):
+		product_instance_df = build_product_instance_df()
+	st.success("Product Instance Knowledge Graph Loaded!")
+	### Search Start
+	st.title("Searching by Company")
+	# Get input
+	inp = st_keyup("Enter a company name", value="toshiba", key="0", debounce=500)
+	# Perform search
+	select_df = search_df(inp, company_df_2, 'companyLabel')
+	# def show_data():
+	# 	select_value = st.session_state.value
+	#	row_id=select_value['selection']['rows'][0]
+	#	st.write(company_df.iloc[row_id])
+	# Show search results
+	with st.status("Searching ...", state="running", expanded=False) as status:
+		status.update(label=f"{len(select_df)} results found", state="complete", expanded=True)
+		### Selection for Company ###
+		st.dataframe(select_df, on_select="rerun", key="value", selection_mode="single-row")
+		# Expand if company is selected
+		select_value = st.session_state.value
+		if len(select_value['selection']['rows']) > 0:
+			st.title("Company Data")
+			row_id = select_value['selection']['rows'][0]
+			row = select_df.iloc[row_id]
+			entries = company_product_kg_df[company_product_kg_df.company == row.company]
+			st.write(f"Company Name: {row.companyLabel}")
+			st.write(f"Japanese Name: {row.companyLabelJA}")
+			st.write(f"Wikidata URL: {row.company}")
+			# st.write(f"Products or Services Provided: {list(set(list(entries.productLabel.unique()) + list(entries.productLabelJA.unique())))}")
+			st.write(f"Products and services provided by {row.companyLabel}")
+			product_select_df = company_product_kg_df[(company_product_kg_df.company == row.company) & (company_product_kg_df.propertyLabel == 'product_or_service_provided')][['productLabel', 'productLabelJA', 'product', 'company', 'companyLabel', 'companyLabelJA']]
+			### Selection for Product ###
+			st.dataframe(product_select_df, on_select="rerun", key="product", selection_mode="single-row")
+			select_product = st.session_state.product
+			# Expand if a product is selected
+			if len(select_product['selection']['rows']) > 0:
+				product_id = select_product['selection']['rows'][0]
+				target_product = product_select_df.iloc[product_id]
+				# st.title(f"All Product Categories produced by {row.companyLabel}")
+				# st.dataframe(competitors)
+				# Hypothesis
+				# for incoming relations: 'uses' and 'has_part' are useful, since they list services that have the selected product as a component
+				# for outgoing relations: 'has_use' and 'part_of' are useful, since they list services that have the selected product as a component
+				####### Build kg paths
+				### Step 1 ###
+				start_df = pd.DataFrame()
+				start_df[['company_start', 'companyLabel_start', 'companyLabelJA_start', 'product_start', 'productLabel_start', 'productLabelJA_start']] = [[target_product.company, target_product.companyLabel, target_product.companyLabelJA, target_product['product'], target_product.productLabel, target_product.productLabelJA]]
+				### Step 2 ###
+				related_out_df = product_kg_df[(product_kg_df['product'] == target_product['product']) & (product_kg_df['propertyLabel'].apply(lambda x: x in ['has_use', 'part_of']))]
+				related_in_df = product_kg_df[(product_kg_df['object'] == target_product['product']) & (product_kg_df['propertyLabel'].apply(lambda x: x in ['uses', 'has_part']))]
+				path_df = pd.concat(
+					[
+						start_df.merge(related_out_df[['product', 'object', 'objectLabel', 'objectLabelJa']], left_on='product_start', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_second', 'objectLabel': 'productLabel_second', 'objectLabelJa': 'productLabelJa_second'}),
+						start_df.merge(related_in_df[['object', 'product', 'productLabel', 'productLabelJa']], left_on='product_start', right_on='object').drop(columns=['object']).rename(columns={'product': 'product_second', 'productLabel': 'productLabel_second', 'productLabelJa': 'productLabelJa_second'}),
+					]
+				)
+				# merge 1
+				### Step 3a ###
+				path_df_1 = path_df.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_second', right_on='product').drop(columns=['product'])
+				### Step 3b ###
+				path_df_2 = path_df.merge(product_instance_df[['object', 'objectLabel', 'objectLabelJa', 'product']], left_on='product_second', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_third', 'objectLabel': 'productLabel_third', 'objectLabelJa': 'productLabelJa_third'})
+				path_df_2 = path_df_2.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_third', right_on='product').drop(columns=['product'])
+				### Step 3c ###
+				path_df_3 = path_df.merge(product_instance_df[['object', 'objectLabel', 'objectLabelJa', 'product']], left_on='product_second', right_on='product').drop(columns=['product']).rename(columns={'object': 'product_third', 'objectLabel': 'productLabel_third', 'objectLabelJa': 'productLabelJa_third'})
+				path_df_3 = path_df_3.merge(product_instance_df[['product', 'productLabel', 'productLabelJa', 'object']], left_on='product_third', right_on='object').drop(columns=['object']).rename(columns={'product': 'product_fourth', 'productLabel': 'productLabel_fourth', 'productLabelJa': 'productLabelJa_fourth'})
+				path_df_3 = path_df_3.merge(company_product_kg_df[['company', 'companyLabel', 'companyLabelJA', 'product']], left_on='product_fourth', right_on='product').drop(columns=['product'])
+				### Step 5 ###
+				path_df_1['length'] = 4
+				path_df_2['length'] = 5
+				path_df_3['length'] = 6
+				final_path_df = pd.concat([path_df_1, path_df_2, path_df_3])
+				final_path_df = final_path_df.reset_index(drop=True)
+				final_path_df['path_id'] = final_path_df.index
+				#final_path_df = final_path_df.set_index('path_id', drop=False)
+				final_company_df = final_path_df[['path_id', 'company', 'companyLabel', 'companyLabelJA']].copy()
+				### Step 6 ###
+				st.title(f"Potential Customers for {target_product.companyLabel} for product {target_product.productLabel}")
+				# st.dataframe(final_company_df)
+				st.dataframe(final_company_df, on_select="rerun", key="customer", selection_mode="single-row")
+				select_customer = st.session_state.customer
+				if len(select_customer['selection']['rows']) > 0:
+					customer_id = select_customer['selection']['rows'][0]
+					target_customer = final_company_df.iloc[customer_id]
+					customer_df = final_path_df[final_path_df.path_id == target_customer.path_id].iloc[0]
+					# import graphviz
+					# Create a graphlib graph object
+					graph = graphviz.Digraph()
+					graph.edge(customer_df.companyLabel_start, customer_df.productLabel_start, label='  produces')
+					graph.edge(customer_df.productLabel_start, customer_df.productLabel_second, label='  part of')
+					if customer_df.length == 4:
+						graph.edge(customer_df.productLabel_second, customer_df.companyLabel, label= '  produced by')
+					elif customer_df.length == 5:
+						graph.edge(customer_df.productLabel_second, customer_df.productLabel_third, label='  instance of')
+						graph.edge(customer_df.productLabel_third, customer_df.companyLabel, label= '  produced by')
+					if customer_df.length == 6:
+						graph.edge(customer_df.productLabel_second, customer_df.productLabel_third, label='  instance of')
+						graph.edge(customer_df.productLabel_fourth, customer_df.productLabel_third, '  instance of')
+						graph.edge(customer_df.productLabel_fourth, customer_df.companyLabel, label= '  produced by')
+					st.graphviz_chart(graph)
 else: