# Hugging Face Space scrape artifact — original page status: "Sleeping".
# Standard library.
import urllib.request

# Third-party.
import bs4
import gradio as gr
import lxml  # noqa: F401 -- kept: the 'lxml' parser backend used by BeautifulSoup below
import requests
def find_all(url, q=None, num=None):
    """Fetch *url* and return the names of every tag in the document.

    Parameters:
        url: page to fetch via ``urllib.request.urlopen``.
        q, num: accepted only for interface compatibility with the Gradio
            callback signature (same inputs as ``find_it``); unused here.

    Returns:
        list: ``[[tag_name, ...]]`` -- the list of tag names wrapped in an
        outer list, matching the original return shape consumed by the
        ``gr.JSON`` output.
    """
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')
    # NOTE: the original also printed soup.title / .name / .string / .parent
    # for debugging; those crash with AttributeError on pages without a
    # <title> (soup.title is None), so the debug prints are removed.
    return [[tag.name for tag in soup.find_all()]]
def find_it(url, q=None, num=None):
    """Fetch *url* and collect details for every ``q`` tag in the page.

    Parameters:
        url: page to fetch via ``urllib.request.urlopen``.
        q: tag name to search for (e.g. ``"p"``).
        num: accepted for interface compatibility with the Gradio callback
            signature; unused.

    Returns:
        list: one single-element list per match, each containing a dict with
        the tag's string, its grandparent tag name, the name of the first
        ``<q>:first-child`` element in the document (or ``None``), and the
        tag object itself.
    """
    out = []
    source = urllib.request.urlopen(url).read()
    soup = bs4.BeautifulSoup(source, 'lxml')
    for p in soup.find_all(q):
        # BUGFIX: soup.select(...) returns a LIST, so the original
        # `soup.select(f'{p.name}:first-child').name` raised AttributeError
        # on every match. select_one() returns a single tag (or None).
        first = soup.select_one(f'{p.name}:first-child')
        out.append([{
            q: p.string,
            "parent": p.parent.parent.name,
            "first-child": first.name if first is not None else None,
            "content": p,
        }])
    # Echo every hyperlink to the server log (renamed from `url`, which
    # shadowed the function parameter in the original).
    for link in soup.find_all('a'):
        print(link.get('href'))
    return out
def find_it2(url):
    """Fetch *url* with ``requests`` and return the text of all anchors.

    Parameters:
        url: page to fetch.

    Returns:
        str: anchor texts joined with ``'URL Links:\\n'`` on success, or the
        caught exception object on failure (preserved contract: the caller
        renders either result as JSON).
    """
    try:
        # BUGFIX: the original called requests.get(url, a1=None, q2=None,
        # q3=None) OUTSIDE the try -- invalid kwargs raise TypeError before
        # any request is sent. The request now uses valid arguments and
        # sits inside the try so network errors hit the except path too.
        response = requests.get(url)
        response.raise_for_status()
        # BUGFIX: the original referenced bare `BeautifulSoup`, a NameError;
        # the module is imported as `bs4`.
        soup = bs4.BeautifulSoup(response.content, 'lxml')
        return 'URL Links:\n'.join(p.text for p in soup.find_all('a'))
    except Exception as e:
        print(e)
        return e
# --- Gradio UI: wire the scraping helpers to a three-row layout. ---------
with gr.Blocks() as app:
    # Row 1 -- inputs: target URL, tag query (defaults to "p"), and a count.
    with gr.Row():
        url_box = gr.Textbox()
        tag_box = gr.Textbox(value="p")
        count_box = gr.Number(value=1)
    # Row 2 -- action buttons.
    with gr.Row():
        load_button = gr.Button("Load")
        search_button = gr.Button("Find")
    # Row 3 -- JSON panes: raw tag-name dump and per-tag search results.
    with gr.Row():
        raw_json = gr.JSON()
        result_json = gr.JSON()
    load_button.click(find_all, [url_box, tag_box, count_box], [raw_json])
    search_button.click(find_it, [url_box, tag_box, count_box], [result_json])
app.launch()