kristada673 committed on
Commit
9ad659c
·
1 Parent(s): b970341

Upload _base.py

Browse files
Files changed (1) hide show
  1. finnlp/data_sources/_base.py +80 -0
finnlp/data_sources/_base.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from finnlp.utils.get_proxy import get_china_free_proxy, get_us_free_proxy, Kuaidaili
2
+ import requests
3
+
4
class FinNLP_Downloader:
    """Base class for FinNLP data sources that fetch pages over HTTP.

    Stores the retry/proxy configuration and provides ``_request_get`` and
    ``_request_post``, which return ``None`` rather than raising when a
    request does not end in an HTTP 200 response.

    Recognised ``args`` keys:
        use_proxy: Proxy selector string. The mere presence of this key
            enables proxying. A value containing ``"kuaidaili"`` selects the
            ``Kuaidaili`` tunnel (``tunnel``, ``username`` and ``password``
            are then required); a value containing ``"china"`` or ``"China"``
            selects ``get_china_free_proxy``; any other value selects
            ``get_us_free_proxy``.
        max_retry: Number of attempts made per request (default 1).
        proxy_pages: Value forwarded to the free-proxy getters (default 5).
    """

    # Keys that must accompany a Kuaidaili proxy configuration, in the order
    # they are checked.
    _KUAIDAILI_KEYS = ("tunnel", "username", "password")

    def __init__(self, args=None):
        """Read the configuration from ``args`` and set up the proxy source.

        Args:
            args: Optional configuration mapping (see the class docstring).
                ``None`` is treated as an empty configuration.

        Raises:
            AssertionError: If a Kuaidaili proxy is requested and one of
                ``tunnel``, ``username`` or ``password`` is missing.
        """
        # Use a fresh dict per call instead of a shared mutable default.
        if args is None:
            args = {}

        self.use_proxy = "use_proxy" in args
        self.country = args["use_proxy"] if self.use_proxy else None
        self.max_retry = args.get("max_retry", 1)
        self.proxy_pages = args.get("proxy_pages", 5)
        # Round-robin cursor into ``proxy_list``; only consulted for the
        # free-proxy lists but always defined so the attribute exists.
        self.proxy_id = 0

        if not self.use_proxy:
            self.proxy_list = []
        elif "kuaidaili" in self.country:
            # Raised explicitly (rather than via ``assert``) so the check is
            # not stripped under ``python -O``; the exception type and the
            # message are unchanged.
            for key in self._KUAIDAILI_KEYS:
                if key not in args:
                    raise AssertionError(
                        "Please make sure '{}' in your keys".format(key)
                    )
            self.proxy_list = Kuaidaili(
                args["tunnel"], args["username"], args["password"]
            )
        else:
            self.proxy_list = self._update_proxy()

    def _get_proxy(self):
        """Return the proxy to use for the next request, or ``None``.

        With a Kuaidaili configuration the tunnel proxy is returned. With a
        free-proxy list the entries are handed out round-robin. ``None`` is
        returned when proxying is disabled or the list is empty.
        """
        if not self.use_proxy:
            return None
        if "kuaidaili" in self.country:
            return self.proxy_list.get_kuaidaili_tunnel_proxy()
        if len(self.proxy_list) > 0:
            proxy = self.proxy_list[self.proxy_id]
            # Advance the cursor, wrapping back to the first entry.
            self.proxy_id = (self.proxy_id + 1) % len(self.proxy_list)
            return proxy
        return None

    def _update_proxy(self):
        """Fetch a free-proxy list matching ``self.country``."""
        if "china" in self.country or "China" in self.country:
            return get_china_free_proxy(self.proxy_pages)
        return get_us_free_proxy(self.proxy_pages)

    def _send_with_retry(self, send):
        """Call ``send(proxies)`` up to ``max_retry`` times.

        Args:
            send: Callable taking the proxies value and performing one
                request, returning the response object.

        Returns:
            The first response whose ``status_code`` is 200, or ``None`` if
            no attempt produced one (including when ``max_retry`` is 0).
        """
        # NOTE: one proxy is picked per request and reused for every retry.
        proxies = self._get_proxy()
        for _ in range(self.max_retry):
            try:
                response = send(proxies)
            except Exception:
                # Best-effort download: a failed attempt just consumes one
                # retry. KeyboardInterrupt/SystemExit are not swallowed.
                continue
            if response.status_code == 200:
                return response
        return None

    def _request_get(self, url, headers = None, verify = None, params = None):
        """Send a GET request with retries.

        Args:
            url: Target URL.
            headers: Request headers; a Firefox ``User-Agent`` header is used
                when omitted.
            verify: Forwarded to ``requests.get`` as ``verify``.
            params: Forwarded to ``requests.get`` as ``params``.

        Returns:
            The response if an attempt returned HTTP 200, otherwise ``None``.
        """
        if headers is None:
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/112.0"
            }
        return self._send_with_retry(
            lambda proxies: requests.get(
                url = url,
                proxies = proxies,
                headers = headers,
                verify = verify,
                params = params,
            )
        )

    def _request_post(self, url, headers, json):
        """Send a POST request with a JSON body, with retries.

        Args:
            url: Target URL.
            headers: Request headers.
            json: Payload forwarded to ``requests.post`` as ``json``.

        Returns:
            The response if an attempt returned HTTP 200, otherwise ``None``.
        """
        return self._send_with_retry(
            lambda proxies: requests.post(
                url = url,
                headers = headers,
                json = json,
                proxies = proxies,
            )
        )