Update app.py
Browse files
app.py
CHANGED
|
@@ -2,13 +2,33 @@ import gradio as gr
|
|
| 2 |
import requests
|
| 3 |
import bs4
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
def sort_doc(in_list,steps_in=0,control=None):
|
| 6 |
control_json={'control':'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ','char':'','leng':62}
|
| 7 |
text=str(in_list)
|
| 8 |
-
|
| 9 |
-
########################################
|
| 10 |
-
sen_list=in_list
|
| 11 |
-
######################################
|
| 12 |
key_cnt=len(in_list)
|
| 13 |
print(key_cnt)
|
| 14 |
control_char=list(control_json['control'])
|
|
@@ -76,9 +96,6 @@ def sort_doc(in_list,steps_in=0,control=None):
|
|
| 76 |
print(j)
|
| 77 |
out_js = out_js+control_char[j]
|
| 78 |
sen_obj=in_list[i]
|
| 79 |
-
#sen_obj=proc_sen(sen_list,i)
|
| 80 |
-
|
| 81 |
-
#json_out[out_js]={'nouns':ea}
|
| 82 |
json_out[out_js]=sen_obj
|
| 83 |
print ("#################")
|
| 84 |
print (out_js)
|
|
@@ -186,7 +203,25 @@ def sitemap(url,level):
|
|
| 186 |
except Exception as e:
|
| 187 |
print (e)
|
| 188 |
uri_key=sort_doc(link_box,8)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 190 |
return link1,link2,uri_key
|
| 191 |
|
| 192 |
|
|
|
|
| 2 |
import requests
|
| 3 |
import bs4
|
| 4 |
|
| 5 |
+
######## Load Database ########
# Runs at import time: fetch the previously saved crawl database from the
# Hugging Face dataset repo so new crawl results can be merged into it.

import os  # needed for os.environ below; it was used without being imported

from huggingface_hub import HfApi, upload_file
import json
import uuid

# Token handed to the Hugging Face calls; None when HF_TOKEN is not set.
token = os.environ.get("HF_TOKEN")
username = "omnibus"
dataset_name = "tmp"
# Base URL for reading raw files from the dataset repo's main branch.
save_data = f'https://huggingface.co/datasets/{username}/{dataset_name}/raw/main/'
# Pass the environment token instead of an empty string.
api = HfApi(token=token)
# Base name (without extension) of the JSON database stored under crawl/.
filename = "test"

# Read crawl/<filename>.json. The original referenced an undefined name
# (file_n) here, which raised NameError as soon as the module was imported.
r = requests.get(f'{save_data}crawl/{filename}.json')
print(f'status code main:: {r.status_code}')
if r.status_code == 200:
    lod = json.loads(r.text)
else:
    # Any non-200 response: start from an empty database.
    lod = {}

#############################
|
| 28 |
+
|
| 29 |
def sort_doc(in_list,steps_in=0,control=None):
|
| 30 |
control_json={'control':'0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ','char':'','leng':62}
|
| 31 |
text=str(in_list)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
key_cnt=len(in_list)
|
| 33 |
print(key_cnt)
|
| 34 |
control_char=list(control_json['control'])
|
|
|
|
| 96 |
print(j)
|
| 97 |
out_js = out_js+control_char[j]
|
| 98 |
sen_obj=in_list[i]
|
|
|
|
|
|
|
|
|
|
| 99 |
json_out[out_js]=sen_obj
|
| 100 |
print ("#################")
|
| 101 |
print (out_js)
|
|
|
|
| 203 |
except Exception as e:
|
| 204 |
print (e)
|
| 205 |
uri_key=sort_doc(link_box,8)
|
| 206 |
+
######## Save Database ########
# Merge this crawl's results (uri_key) into the loaded database (lod), then
# upload the merged database back to the dataset repo.
uid = uuid.uuid4()
for ea in uri_key:
    # Add the entry only when its value is not already stored under any key.
    # (The original condition here was not valid Python syntax.)
    # NOTE(review): if sort_doc produces the same keys on every crawl, a new
    # value replaces whatever was stored under that key - confirm intended.
    if uri_key[ea] not in lod.values():
        lod[ea] = uri_key[ea]

# Stage the JSON in a uniquely named local file for upload_file.
with open(f'{uid}.json', 'w') as f:
    # Write the merged database. Writing only uri_key would replace the
    # stored file with just this crawl and drop everything loaded into lod.
    f.write(json.dumps(lod, indent=4))

upload_file(
    path_or_fileobj=f"{uid}.json",
    path_in_repo=f"crawl/{filename}.json",
    repo_id=f"{username}/{dataset_name}",
    repo_type="dataset",
    token=token,
)
#################################
|
| 225 |
return link1,link2,uri_key
|
| 226 |
|
| 227 |
|