Update app.py
Browse files
app.py
CHANGED
|
@@ -302,6 +302,60 @@ def link_find(url):
|
|
| 302 |
return node1,node2
|
| 303 |
#https://huggingface.co/spaces/Omnibus/crawl
|
| 304 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 305 |
def sitemap(url,file_state,level):
|
| 306 |
uri=""
|
| 307 |
uri0=""
|
|
@@ -416,16 +470,18 @@ def sitemap_OG(url,level):
|
|
| 416 |
return link1
|
| 417 |
|
| 418 |
def test():
    """Read ``./seed.txt`` and normalise each of its lines.

    The cleaned value is only rebound to the loop variable; in this version
    nothing is collected or returned.
    """
    with open("./seed.txt") as seed_file:
        seed_lines = seed_file.readlines()
    for raw_line in seed_lines:
        # Surrounding whitespace (including the trailing newline) is dropped.
        raw_line = raw_line.strip().strip("\n")
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
|
|
|
| 429 |
|
| 430 |
with gr.Blocks() as app:
|
| 431 |
file_state=gr.State()
|
|
|
|
| 302 |
return node1,node2
|
| 303 |
#https://huggingface.co/spaces/Omnibus/crawl
|
| 304 |
|
| 305 |
+
def sitemap_test(url,file_state,level):
    """Crawl each seed URL, collect an entry per discovered link, and upload the sorted result.

    NOTE(review): this block was recovered from a rendering that lost its
    indentation. The nesting used here (entries accumulated per seed, a single
    sort/save/upload after the loop) is the reading that matches ``url_front``
    being created before the loop -- confirm against the real file.

    Args:
        url: A list of seed URLs. A single URL string is also accepted and is
            treated as a one-element list.
        file_state: Passed through unchanged to ``sort_doc``.
        level: Crawl depth. At ``>= 2`` every entry of ``link1['TREE']`` is
            itself crawled with ``link_find``; at ``>= 3`` the entries of
            those results are crawled as well.

    Returns:
        ``(link1, link2, uri_key)``: the two values returned by ``link_find``
        for the last seed that was crawled, and the value returned by
        ``sort_doc`` for the entries collected across all seeds.

    Raises:
        ValueError: If ``url`` contains no non-empty seed.
    """
    # Accept a bare string so a sitemap()-style call is not iterated
    # character by character.
    seeds = [url] if isinstance(url, str) else (url or [])

    # Prefixes prepended to child URLs before crawling them; both are empty.
    uri = ""
    uri0 = ""
    url_front = []
    link1 = link2 = None

    for each_url in seeds:
        if not each_url:
            continue
        # Crawl the current seed, not the whole seed list.
        link1, link2 = link_find(each_url)
        if level >= 2:
            for i, ea in enumerate(link1['TREE']):
                print(ea)
                try:
                    out_list1, out_list2 = link_find(f"{uri}{ea['URL']}")
                    link1['TREE'][i] = out_list1
                    link2['TREE'][i] = out_list2

                    if level >= 3:
                        for n, na in enumerate(link1['TREE'][i]['TREE']):
                            print(na)
                            try:
                                out_list1, out_list2 = link_find(f"{uri0}{na['URL']}")
                                link1['TREE'][i]['TREE'][n] = out_list1
                                link2['TREE'][i]['TREE'][n] = out_list2
                            except Exception as e:
                                # Best effort: one failing page must not stop the crawl.
                                print (e)
                except Exception as e:
                    print (e)

        for ea_link in link2['TREE']:
            url_list = ea_link['URL'].split("/")
            # NOTE(review): for a URL shaped like "https://host/path",
            # split("/") gives ['https:', '', 'host', 'path'], so indexes 1
            # and 3 are '' and 'path' (0 and 2 would be scheme and host).
            # Left unchanged -- confirm against the URL format link_find stores.
            url_front.append(f'{url_list[1]}//{url_list[3]}')

    if link1 is None:
        # Previously this surfaced as an UnboundLocalError at the return.
        raise ValueError("sitemap_test: no non-empty seed URL to crawl")

    uri_key = sort_doc(url_front, file_state, 8)

    ######## Save Database ########
    uid = uuid.uuid4()
    with open(f'{uid}.json', 'w') as f:
        f.write(json.dumps(uri_key, indent=4))

    # NOTE(review): filename, username, dataset_name and token are not
    # defined in this function; they must exist at module scope.
    upload_file(
        path_or_fileobj=f"{uid}.json",
        path_in_repo=f"crawl/{filename}.json",
        repo_id=f"{username}/{dataset_name}",
        repo_type="dataset",
        token=token,
    )
    #################################
    return link1, link2, uri_key
|
| 357 |
+
|
| 358 |
+
|
| 359 |
def sitemap(url,file_state,level):
|
| 360 |
uri=""
|
| 361 |
uri0=""
|
|
|
|
| 470 |
return link1
|
| 471 |
|
| 472 |
def test():
    """Crawl the seeds listed in ``./seed.txt`` with ``sitemap_test`` at level 1.

    Each non-blank line of the file is one seed; surrounding whitespace is
    stripped. Any exception raised by the crawl is printed rather than
    propagated.

    NOTE(review): the original indentation was lost; the crawl is issued once,
    after all seeds have been collected, because ``sitemap_test`` is handed
    the whole list -- confirm against the real file.
    """
    with open("./seed.txt") as f:
        stripped = (line.strip() for line in f)
        # Blank lines carry no seed, so they are not forwarded.
        seed_box = [seed for seed in stripped if seed]
    try:
        sitemap_test(seed_box, None, 1)
    except Exception as e:
        print (e)
|
| 485 |
|
| 486 |
with gr.Blocks() as app:
|
| 487 |
file_state=gr.State()
|