matthewfarant commited on
Commit
88bc413
·
1 Parent(s): 9ba5aaa

Create functions/extract_function.py

Browse files
Files changed (1) hide show
  1. functions/extract_function.py +101 -0
functions/extract_function.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import yaml
3
+ import requests
4
+ import pandas as pd
5
+
6
def internal_data(type):
    """
    Extract internal data from either the catalog or query folder.

    Reads every ``.xlsx`` file in the folder named by ``type`` and
    concatenates them into a single dataframe.

    :param type: str, 'catalog' or 'query' (also the folder name to read)
    :return: pandas.DataFrame, combined contents of all Excel files found
    :raises ValueError: if ``type`` is neither 'catalog' nor 'query'
    """
    # NOTE: the parameter name shadows the builtin `type`; kept for
    # backward compatibility with existing callers.
    # Validate first, then share a single loading path — the original
    # catalog/query branches were verbatim duplicates.
    if type not in ('catalog', 'query'):
        # Raise instead of returning an error string, so callers cannot
        # silently receive a str where a DataFrame is expected.
        raise ValueError('type must be either catalog or query')

    frames = [
        pd.read_excel(os.path.join(type, file))
        for file in os.listdir(type)
        if file.endswith('.xlsx')
    ]
    return pd.concat(frames, ignore_index=True)
34
+
35
def _scrape_fertilizer_pages(base_url, columns, names):
    """
    Scrape consecutive listing pages from ``base_url`` until a page
    without the expected table is reached.

    :param base_url: str, URL prefix; the page number is appended to it
    :param columns: positional column selector (list or slice) for the raw table
    :param names: dict mapping raw column positions to final column names
    :return: pandas.DataFrame with the renamed columns plus 'Page Number',
             NaN rows dropped
    """
    frames = []
    page = 1
    while True:
        content = requests.get(base_url + str(page)).content
        try:
            # Table index 5 and the row slice [2:-1] mirror the fixed
            # layout of the ministry's listing pages; header/footer rows
            # are trimmed off.
            df = pd.read_html(content)[5].iloc[2:-1, columns].rename(columns=names)
        except IndexError:
            # No table at index 5: we have run past the last page.
            break
        df['Page Number'] = page
        frames.append(df)
        page += 1
    return pd.concat(frames, ignore_index=True).dropna()

def registered_fertilizer_data():
    """
    Scrape registered fertilizer data from the Ministry of Agriculture
    website, or load the cached CSV if one already exists.

    Organic ('organik') and inorganic ('anorganik') listings are scraped
    separately, combined, and given a 'Nama Lengkap' column
    (Jenis + ' ' + Merek).

    :return: pandas.DataFrame, registered fertilizer data
    """
    # A non-empty "external" folder means a cached copy exists; skip the
    # slow scrape entirely.
    if os.listdir('external'):
        return pd.read_csv('external/registered_fertilizers.csv')

    print('External folder is empty. Extracting data from Ministry of Agriculture website...')
    # Load the config ONCE instead of re-reading config.yaml on every
    # page request, and close the file handle deterministically.
    # safe_load is sufficient (and safer) for a plain-data config file.
    with open('config.yaml') as f:
        scraping_url = yaml.safe_load(f)['scraping_url']

    print('Extracting Organic Fertilizer Data...')
    registered_organic_fertilizers = _scrape_fertilizer_pages(
        scraping_url['organik'][0],
        [2, 3, 6],
        {2: 'Merek', 3: 'Jenis', 6: 'Nomor Pendaftaran'},
    )

    print('Extracting Inorganic Fertilizer Data...')
    registered_inorganic_fertilizers = _scrape_fertilizer_pages(
        scraping_url['anorganik'][0],
        slice(5, 8),
        {5: 'Merek', 6: 'Jenis', 7: 'Nomor Pendaftaran'},
    )

    registered_fertilizers = pd.concat(
        [registered_organic_fertilizers, registered_inorganic_fertilizers],
        ignore_index=True,
    )
    registered_fertilizers['Nama Lengkap'] = (
        registered_fertilizers['Jenis'] + ' ' + registered_fertilizers['Merek']
    )
    return registered_fertilizers
86
+
87
def scrape_result():
    """
    Extract scraped result data.

    Reads every ``.csv`` file in the ``scrape_result`` folder and
    concatenates them into one dataframe.

    :return: pandas.DataFrame containing the combined scraped result data
    """
    # Filter on the .csv extension — consistent with the .xlsx filter in
    # internal_data() — so stray files (.DS_Store, editor backups) in the
    # folder don't make read_csv blow up.
    frames = [
        pd.read_csv(os.path.join('scrape_result', filename))
        for filename in os.listdir('scrape_result')
        if filename.endswith('.csv')
    ]
    return pd.concat(frames, ignore_index=True)