Spaces:
Sleeping
Sleeping
| import requests | |
| import json | |
| from tqdm import tqdm | |
| from requests import Response | |
def scrape_url_and_save_in_json_file(url: str, json_file_path: str) -> None:
    """Ask the local scraper service to scrape ``url`` and save its JSON reply.

    POSTs ``{'url': url}`` to ``http://localhost:8002/scrape-url`` and writes
    the decoded JSON body of the response to ``json_file_path``, overwriting
    any existing file at that path.

    Args:
        url: Address forwarded to the service in the ``url`` field.
        json_file_path: Destination path for the saved JSON document.

    Raises:
        requests.HTTPError: If the service answers with a 4xx/5xx status.
    """
    response: Response = requests.post(
        'http://localhost:8002/scrape-url',
        json={'url': url},
    )
    # Fail loudly instead of persisting an error payload as if it were the
    # scrape result.
    response.raise_for_status()

    # Decode before opening the file: opening with 'w' truncates it, so a
    # reply that is not valid JSON must not be allowed to wipe existing data.
    payload = response.json()
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(payload, json_file)
def scrape_pdfs(json_file_path: str, api_key: str) -> None:
    """Submit every PDF link stored in a JSON file to the scraper service.

    Loads ``json_file_path``, reads its ``pdf_links`` entry and sends one POST
    per link to ``http://localhost:8002/scrape-pdf`` while showing a tqdm
    progress bar. Links the service rejects are reported and skipped so one
    bad link does not abort the rest of the batch.

    Args:
        json_file_path: Path to a JSON document with a ``pdf_links`` entry
            (presumably a list of URLs — confirm against the file's producer).
        api_key: Value sent in the ``api_key`` field of every request.

    Raises:
        KeyError: If the JSON document has no ``pdf_links`` entry.
    """
    with open(json_file_path, encoding='utf-8') as json_file:
        links = json.load(json_file)

    pdf_links = links['pdf_links']
    # tqdm takes the total from len() of the iterable, so it is not passed.
    for link in tqdm(pdf_links):
        response: Response = requests.post(
            'http://localhost:8002/scrape-pdf',
            json={
                'url': link,
                'api_key': api_key,
                # NOTE(review): this key is hyphenated while 'api_key' uses an
                # underscore — confirm against the service's request schema.
                'scrape-images': False,
            },
        )
        if not response.ok:
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(
                f'scrape-pdf failed for {link}: HTTP {response.status_code}'
            )
def scrape_page(json_file_path: str, api_key: str) -> None:
    """Submit every page link stored in a JSON file to the scraper service.

    Loads ``json_file_path``, reads its ``all_links`` entry and sends one POST
    per link to ``http://localhost:8002/scrape-page`` while showing a tqdm
    progress bar. Links the service rejects are reported and skipped so one
    bad link does not abort the rest of the batch.

    Args:
        json_file_path: Path to a JSON document with an ``all_links`` entry
            (presumably a list of URLs — confirm against the file's producer).
        api_key: Value sent in the ``api_key`` field of every request.

    Raises:
        KeyError: If the JSON document has no ``all_links`` entry.
    """
    with open(json_file_path, encoding='utf-8') as json_file:
        links = json.load(json_file)

    all_links = links['all_links']
    # tqdm takes the total from len() of the iterable, so it is not passed.
    for link in tqdm(all_links):
        response: Response = requests.post(
            'http://localhost:8002/scrape-page',
            json={
                'url': link,
                'api_key': api_key,
                # NOTE(review): this key is hyphenated while 'api_key' uses an
                # underscore — confirm against the service's request schema.
                'scrape-images': False,
            },
        )
        if not response.ok:
            # tqdm.write prints without corrupting the progress bar.
            tqdm.write(
                f'scrape-page failed for {link}: HTTP {response.status_code}'
            )
def ask(query, session_id, api_key) -> dict:
    """Send a question to the local service and return its decoded JSON reply.

    POSTs ``query``, ``session_id`` and ``api_key`` as a JSON body to
    ``http://localhost:8002/ask``, pretty-prints the decoded reply to stdout
    with a 4-space indent, and returns that same decoded object.

    Args:
        query: Value sent in the ``query`` field.
        session_id: Value sent in the ``session_id`` field.
        api_key: Value sent in the ``api_key`` field.

    Returns:
        The JSON-decoded body of the service's response.
    """
    response: Response = requests.post(
        'http://localhost:8002/ask',
        json={
            'query': query,
            'session_id': session_id,
            'api_key': api_key,
        },
    )
    # Decode once and reuse it for both the printout and the return value.
    answer = response.json()
    print(json.dumps(answer, indent=4))
    return answer
def main() -> None:
    """Run the PDF and page scraping passes over the links in ``temp.json``.

    The API key is read from the ``SCRAPER_API_KEY`` environment variable
    instead of being embedded in the source.

    Raises:
        SystemExit: If ``SCRAPER_API_KEY`` is unset or empty.
    """
    import os  # local import: only this entry point needs it

    # NOTE(review): a key was previously committed here in plain text; treat
    # it as exposed and rotate it.
    api_key = os.environ.get('SCRAPER_API_KEY')
    if not api_key:
        raise SystemExit(
            'Set the SCRAPER_API_KEY environment variable before running '
            'this script.'
        )

    # temp.json must already exist because both passes below read it.
    # Uncomment the next line to (re)generate it from the site first.
    # scrape_url_and_save_in_json_file('https://www.thetravellerdmc.com/' , 'temp.json')
    scrape_pdfs('temp.json', api_key)
    scrape_page('temp.json', api_key)


# Guarded so importing this module for its helpers does not start scraping.
if __name__ == '__main__':
    main()