matthewfarant committed
Commit 484b915 · Parent: 88bc413

Delete functions/extract_function.py

Files changed (1)
  1. functions/extract_function.py +0 -101
functions/extract_function.py DELETED
@@ -1,101 +0,0 @@
- import os
- import yaml
- import requests
- import pandas as pd
-
- def internal_data(type):
-     """
-     Extract internal data from either the catalog or the query folder.
-
-     :param type: str, 'catalog' or 'query'
-
-     :return: pandas.DataFrame, dataframe containing product and category names
-     """
-     if type not in ('catalog', 'query'):
-         raise ValueError("type must be either 'catalog' or 'query'")
-
-     # Read every Excel file in the chosen folder and stack them into one dataframe
-     dfs = []
-     for file in os.listdir(type):
-         if file.endswith('.xlsx'):
-             dfs.append(pd.read_excel(os.path.join(type, file)))
-     return pd.concat(dfs, ignore_index=True)
-
- def registered_fertilizer_data():
-     """
-     Scrape registered fertilizer data from the Ministry of Agriculture website,
-     covering both organic ('organik') and inorganic ('anorganik') fertilizers.
-
-     :return: pandas.DataFrame, dataframe containing registered fertilizer data
-     """
-     # If the "external" folder already holds a cached copy, use it instead of scraping
-     if os.listdir('external'):
-         return pd.read_csv('external/registered_fertilizers.csv')
-
-     print('External folder is empty. Extracting data from Ministry of Agriculture website...')
-     with open('config.yaml') as f:
-         config = yaml.safe_load(f)
-
-     print('Extracting Organic Fertilizer Data...')
-     dfs1 = []
-     # Scrape every table on every page: organic
-     i = 1
-     while True:
-         url = config['scraping_url']['organik'][0] + str(i)
-         result = requests.get(url).content
-         try:
-             # Columns: Merek (brand), Jenis (type), Nomor Pendaftaran (registration number)
-             df = pd.read_html(result)[5].iloc[2:-1, [2, 3, 6]].rename(columns={2: 'Merek', 3: 'Jenis', 6: 'Nomor Pendaftaran'})
-             df['Page Number'] = i
-             dfs1.append(df)
-             i += 1
-         except IndexError:
-             # No table on this page: we are past the last page, so stop
-             break
-
-     registered_organic_fertilizers = pd.concat(dfs1, ignore_index=True).dropna()
-
-     print('Extracting Inorganic Fertilizer Data...')
-     dfs2 = []
-     # Scrape every table on every page: inorganic
-     i = 1
-     while True:
-         url = config['scraping_url']['anorganik'][0] + str(i)
-         result = requests.get(url).content
-         try:
-             df = pd.read_html(result)[5].iloc[2:-1, 5:8].rename(columns={5: 'Merek', 6: 'Jenis', 7: 'Nomor Pendaftaran'})
-             df['Page Number'] = i
-             dfs2.append(df)
-             i += 1
-         except IndexError:
-             break
-
-     registered_inorganic_fertilizers = pd.concat(dfs2, ignore_index=True).dropna()
-
-     # Combine both sets and build the full product name ('Nama Lengkap') column
-     registered_fertilizers = pd.concat([registered_organic_fertilizers, registered_inorganic_fertilizers], ignore_index=True)
-     registered_fertilizers['Nama Lengkap'] = registered_fertilizers['Jenis'] + ' ' + registered_fertilizers['Merek']
-     return registered_fertilizers
-
- def scrape_result():
-     """
-     Extract scraped result data.
-
-     :return: pandas.DataFrame, dataframe containing scraped result data
-     """
-     dfs = []
-
-     for filename in os.listdir('scrape_result'):
-         df = pd.read_csv('scrape_result/' + filename)
-         dfs.append(df)
-
-     # Combine all result files into a single dataframe
-     final_df = pd.concat(dfs, ignore_index=True)
-     return final_df
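
For context, a minimal sketch of how these extraction functions might have been wired together before this deletion. The import path follows the deleted file's location, but the config.yaml layout shown in the comments is an assumption inferred from the code above, not taken from the repository, and the module itself is removed by this commit.

# Hypothetical usage sketch, assuming functions/extract_function.py still exists
# and that config.yaml holds paginated base URLs in the shape the code reads:
#
#   scraping_url:
#     organik:
#       - "<organic listing URL ending in a page parameter>"    # assumed
#     anorganik:
#       - "<inorganic listing URL ending in a page parameter>"  # assumed

from functions.extract_function import internal_data, registered_fertilizer_data, scrape_result

catalog = internal_data('catalog')         # product catalog from catalog/*.xlsx
queries = internal_data('query')           # search queries from query/*.xlsx
registered = registered_fertilizer_data()  # scraped or cached registry data
results = scrape_result()                  # combined scrape_result/*.csv files

print(registered[['Nama Lengkap', 'Nomor Pendaftaran']].head())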