Spaces:
Paused
Paused
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,76 +1,2 @@
|
|
| 1 |
-
import
|
| 2 |
-
|
| 3 |
-
from bs4 import BeautifulSoup
|
| 4 |
-
from requests.adapters import HTTPAdapter
|
| 5 |
-
from requests.packages.urllib3.util.retry import Retry
|
| 6 |
-
import re
|
| 7 |
-
|
| 8 |
-
def setup_session(total=5, backoff_factor=1, status_forcelist=(502, 503, 504)):
    """Create a ``requests`` session that retries failed requests.

    Args:
        total: Maximum number of retries passed to ``Retry``.
        backoff_factor: Backoff factor passed to ``Retry``.
        status_forcelist: HTTP status codes that force a retry.

    Returns:
        A ``requests.Session`` whose HTTP and HTTPS adapters both use the
        retry policy described above. The caller is responsible for closing
        it (it can be used as a context manager).
    """
    session = requests.Session()
    retries = Retry(
        total=total,
        backoff_factor=backoff_factor,
        status_forcelist=list(status_forcelist),
    )
    # Mount the policy for both schemes; previously only "https://" was
    # covered, so any plain-HTTP request silently ran without retries.
    for scheme in ("https://", "http://"):
        session.mount(scheme, HTTPAdapter(max_retries=retries))
    return session
|
| 13 |
-
|
| 14 |
-
def generate_naver_search_url(query):
    """Build the Naver blog-tab search URL for *query*.

    The query is percent-encoded so that spaces, ``&``, ``=``, ``#`` and
    non-ASCII text cannot break or alter the query string.

    Args:
        query: Search terms entered by the user.

    Returns:
        The full search URL as a string.
    """
    # Local import keeps this function self-contained.
    from urllib.parse import urlencode

    base_url = "https://search.naver.com/search.naver?"
    # Parameter order is kept as before: ssc, sm, query.
    params = {"ssc": "tab.blog.all", "sm": "tab_jum", "query": query}
    return base_url + urlencode(params)
|
| 20 |
-
|
| 21 |
-
def crawl_blog_content(url):
    """Fetch a blog post and return the text of its main content container.

    Args:
        url: Address of the blog post to download.

    Returns:
        The text of the ``div`` with class ``se-main-container``, or an
        empty string when the page cannot be fetched or has no such element.
    """
    # The context manager closes the session (and its connections) even when
    # the request fails; the original never closed it.
    with setup_session() as session:
        try:
            # A timeout prevents one unresponsive blog from hanging the
            # whole search.
            response = session.get(url, timeout=10)
        except requests.RequestException:
            # Best effort: an unreachable post yields empty content.
            return ""
        soup = BeautifulSoup(response.text, "html.parser")

    # Explicit None check replaces the former bare ``except:``, which also
    # hid unrelated errors (and KeyboardInterrupt).
    container = soup.find("div", attrs={'class':'se-main-container'})
    if container is None:
        return ""
    return container.text
|
| 30 |
-
|
| 31 |
-
def _iter_blog_posts(soup):
    """Yield ``(title, mobile_link)`` for each Naver blog anchor in *soup*."""
    for item in soup.find_all("li", class_=re.compile("bx.*")):
        for detail_box in item.find_all("div", class_="detail_box"):
            for title_area in detail_box.find_all("div", class_="title_area"):
                title = title_area.text.strip()
                for anchor in title_area.find_all("a", href=True):
                    link = anchor["href"]
                    if "blog.naver" in link:
                        # Rewrite to the mobile host before crawling.
                        # NOTE(review): presumably the mobile page exposes
                        # the post body directly — confirm.
                        yield title, link.replace("https://", "https://m.")


def _render_results_table(results):
    """Render result dicts as an HTML table, escaping all scraped text."""
    from html import escape

    header = "<table style='table-layout: fixed; width: 100%;'><tr><th style='width: 10ch;'>번호</th><th style='width: 30ch;'>제목</th><th style='width: 20ch;'>링크</th><th style='width: 50ch;'>내용</th></tr>"
    rows = []
    for result in results:
        # Everything below comes from third-party pages, so it is escaped
        # (quotes included, because the link sits in a quoted attribute).
        number = escape(str(result['번호']))
        title = escape(str(result['제목']))
        link = escape(str(result['링크']))
        content = escape(str(result['내용']))
        rows.append(
            f"<tr><td style='width: 10ch; word-wrap: break-word;'>{number}</td>"
            f"<td style='width: 30ch; word-wrap: break-word;'>{title}</td>"
            f"<td style='width: 20ch; word-wrap: break-word;'><a href='{link}'>{link}</a></td>"
            f"<td style='width: 50ch; word-wrap: break-word;'>{content}</td></tr>"
        )
    return header + "".join(rows) + "</table>"


def crawl_naver_search_results(url, max_results=10):
    """Crawl a Naver search page and return its blog results as an HTML table.

    Each blog link found on the page is fetched with ``crawl_blog_content``
    and shown with its number, title, link and content.

    Args:
        url: Naver search results URL.
        max_results: Maximum number of blog posts to crawl (default 10,
            the limit the original code hard-coded).

    Returns:
        An HTML ``<table>`` string with one row per crawled blog post.

    Raises:
        requests.RequestException: If the search page itself cannot be
            fetched.
    """
    from itertools import islice

    # Close the session once the page is downloaded; a timeout avoids an
    # indefinite hang on an unresponsive server.
    with setup_session() as session:
        response = session.get(url, timeout=10)
    soup = BeautifulSoup(response.text, "html.parser")

    # islice enforces the cap exactly and stops crawling as soon as it is
    # reached, replacing the stacked break statements and twin counters.
    results = []
    posts = islice(_iter_blog_posts(soup), max_results)
    for number, (title, link) in enumerate(posts, start=1):
        content = crawl_blog_content(link)
        results.append({"번호": number, "제목": title, "링크": link, "내용": content})

    return _render_results_table(results)
|
| 61 |
-
|
| 62 |
-
# NOTE(review): this State is created outside the Blocks context below and is
# not referenced anywhere in the visible code — confirm whether it is needed.
results_memory = gr.State()


def search_and_display_results(query):
    """Search Naver blogs for *query* and return the rendered HTML table."""
    return crawl_naver_search_results(generate_naver_search_url(query))


with gr.Blocks() as demo:
    gr.Markdown("# 네이버 검색 제목과 링크 크롤러")
    query = gr.Textbox(label="검색 쿼리", placeholder="검색어를 입력하세요")
    output = gr.HTML(label="검색 결과")

    # The textbox's submit event runs the search and fills the HTML output.
    query.submit(search_and_display_results, inputs=query, outputs=output)

demo.launch()
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
# NOTE(security): this runs arbitrary Python taken from the APP environment
# variable with the full privileges of this process. Anyone able to set that
# variable controls the application; keep it restricted to trusted secrets.
_app_source = os.environ.get('APP')
if _app_source is None:
    # Fail with a clear message instead of the opaque TypeError that
    # exec(None) would raise.
    raise RuntimeError(
        "The APP environment variable is not set; there is no application code to run."
    )
exec(_app_source)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|