Kims12 commited on
Commit
1a02034
·
verified ·
1 Parent(s): ddfb11c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +53 -74
app.py CHANGED
@@ -1,89 +1,68 @@
 
1
  import requests
 
2
  from bs4 import BeautifulSoup
3
- import gradio as gr
4
 
5
  def scrape_naver_blog(url):
6
  try:
7
- # ๋””๋ฒ„๊น…: URL ์ˆ˜์‹  ํ™•์ธ
8
- print(f"Received URL: {url}")
9
-
10
- # HTTP ์š”์ฒญ ํ—ค๋” ์„ค์ • (๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ๋Š” User-Agent๊ฐ€ ํ•„์š”ํ•  ์ˆ˜ ์žˆ์Œ)
11
  headers = {
12
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
13
- }
14
-
15
- # ์›น ํŽ˜์ด์ง€ ๊ฐ€์ ธ์˜ค๊ธฐ
16
  response = requests.get(url, headers=headers)
17
- print(f"HTTP GET ์š”์ฒญ ์ƒํƒœ ์ฝ”๋“œ: {response.status_code}")
18
-
19
  if response.status_code != 200:
20
- return f"Error: Unable to fetch the page. Status code: {response.status_code}"
21
-
22
- # BeautifulSoup์„ ์‚ฌ์šฉํ•˜์—ฌ HTML ํŒŒ์‹ฑ
23
- soup = BeautifulSoup(response.content, 'html.parser')
24
-
25
- # ์ œ๋ชฉ ์ถ”์ถœ
26
- # ์‹ค์ œ HTML ๊ตฌ์กฐ์— ๋งž๊ฒŒ ํด๋ž˜์Šค๋ช…๊ณผ ํƒœ๊ทธ๋ฅผ ์ˆ˜์ •ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
27
- title = None
28
-
29
- # ์˜ˆ์‹œ 1: <h3 class="se_textarea">์— ์ œ๋ชฉ์ด ์žˆ๋Š” ๊ฒฝ์šฐ
30
- title_element = soup.find('h3', class_='se_textarea')
31
- if title_element and title_element.get_text(strip=True):
32
- title = title_element.get_text(strip=True)
33
- print(f"์ถ”์ถœ๋œ ์ œ๋ชฉ (h3.se_textarea): {title}")
34
 
35
- # ์˜ˆ์‹œ 2: meta ํƒœ๊ทธ์—์„œ ์ œ๋ชฉ ์ถ”์ถœ
36
- if not title:
37
- title_meta = soup.find('meta', property='og:title')
38
- if title_meta and title_meta.get('content'):
39
- title = title_meta.get('content').strip()
40
- print(f"์ถ”์ถœ๋œ ์ œ๋ชฉ (meta og:title): {title}")
41
-
42
- if not title:
43
- print("์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
 
 
44
  title = "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
45
-
46
- # ๋‚ด์šฉ ํ…์ŠคํŠธ ์ถ”์ถœ
47
- # ์‹ค์ œ HTML ๊ตฌ์กฐ์— ๋งž๊ฒŒ ํด๋ž˜์Šค๋ช…๊ณผ ํƒœ๊ทธ๋ฅผ ์ˆ˜์ •ํ•ด์•ผ ํ•ฉ๋‹ˆ๋‹ค.
48
- content = None
49
-
50
- # ์˜ˆ์‹œ 1: <div class="se-main-container"> ๋‚ด์˜ ๋ชจ๋“  ํ…์ŠคํŠธ ์ถ”์ถœ
51
- content_container = soup.find('div', class_='se-main-container')
52
- if content_container:
53
- content = content_container.get_text(separator='\n', strip=True)
54
- print(f"์ถ”์ถœ๋œ ๋‚ด์šฉ (div.se-main-container): {content[:100]}...") # ์ผ๋ถ€๋งŒ ์ถœ๋ ฅ
55
  else:
56
- # ์˜ˆ์‹œ 2: ๋ชจ๋“  <p> ํƒœ๊ทธ๋ฅผ ํ•ฉ์น˜๋Š” ๋ฐฉ๋ฒ•
57
- p_tags = soup.find_all('p')
58
- if p_tags:
59
- content = '\n'.join([p.get_text(strip=True) for p in p_tags])
60
- print(f"์ถ”์ถœ๋œ ๋‚ด์šฉ (p tags): {content[:100]}...")
61
- else:
62
- print("๋‚ด์šฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค.")
63
- content = "๋‚ด์šฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."
64
-
65
- # ์ถœ๋ ฅ ํ˜•์‹ ์ง€์ •
66
- output = f"์ œ๋ชฉ: {title}\n\n๋‚ด์šฉ: {content}"
67
- return output
68
-
 
 
 
 
 
 
69
  except Exception as e:
70
- # ์˜ˆ์™ธ ๋ฐœ์ƒ ์‹œ ๋””๋ฒ„๊น… ์ •๋ณด ๋ฐ˜ํ™˜
71
- print(f"์˜ˆ์™ธ ๋ฐœ์ƒ: {e}")
72
- return f"An error occurred: {e}"
73
 
74
- # Gradio ์ธํ„ฐํŽ˜์ด์Šค ์„ค์ •
75
- with gr.Blocks() as iface:
76
- gr.Markdown("# ๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํผ")
77
- gr.Markdown("๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ URL์„ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋‚ด์šฉ์„ ์ถ”์ถœํ•ฉ๋‹ˆ๋‹ค.")
78
-
79
- with gr.Row():
80
- url_input = gr.Textbox(label="๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ URL", placeholder="https://blog.naver.com/lafleur7/223723723486")
81
-
82
- scrape_button = gr.Button("์Šคํฌ๋ž˜ํ•‘")
83
-
84
- output_text = gr.Textbox(label="๊ฒฐ๊ณผ", lines=20)
85
-
86
- scrape_button.click(fn=scrape_naver_blog, inputs=url_input, outputs=output_text)
 
 
87
 
88
- # ์• ํ”Œ๋ฆฌ์ผ€์ด์…˜ ์‹คํ–‰
89
  iface.launch()
 
1
+ import gradio as gr
2
  import requests
3
+ from lxml import html
4
  from bs4 import BeautifulSoup
 
5
 
6
def scrape_naver_blog(url):
    """Scrape the title and body text of a Naver blog post.

    Args:
        url: Full URL of the Naver blog post to fetch.

    Returns:
        A ``(result, debug)`` pair of strings: the formatted title/content
        text, and a snippet of the fetched HTML for debugging. On failure the
        first element carries an error message and the second is empty.
    """
    try:
        # Naver may reject requests that lack a browser-like User-Agent.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
                          ' Chrome/58.0.3029.110 Safari/537.3'
        }

        response = requests.get(url, headers=headers)
        if response.status_code != 200:
            return f"Error: Unable to fetch the page. Status code: {response.status_code}", ""

        # Debug aid: keep only the first 1000 characters of the prettified HTML.
        debug_info = BeautifulSoup(response.text, 'html.parser').prettify()[:1000]

        # Parse with lxml so the absolute XPath queries below can run.
        tree = html.fromstring(response.content)

        # Title extraction — absolute XPath, brittle against layout changes.
        title_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[1]/div/div/div[2]/div/p/span'
        title_nodes = tree.xpath(title_xpath)
        if title_nodes:
            title = ''.join(title_nodes[0].itertext()).strip()
        else:
            title = "์ œ๋ชฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        # Content extraction — same caveat as above.
        content_xpath = '/html/body/div[7]/div[1]/div[2]/div[2]/div[2]/div[1]/div/div/div[10]/div[1]/div/table[2]/tbody/tr/td[2]/div[1]/div/div/div[2]/div[1]/div/div/div/p[1]/span/b'
        content_nodes = tree.xpath(content_xpath)
        if content_nodes:
            content = ''.join(content_nodes[0].itertext()).strip()
        else:
            content = "๋‚ด์šฉ์„ ์ฐพ์„ ์ˆ˜ ์—†์Šต๋‹ˆ๋‹ค."

        # Assemble the two output strings expected by the Gradio interface.
        result = f"์ œ๋ชฉ :\n{title}\n\n๋‚ด์šฉ :\n{content}"
        debug_output = f"๋””๋ฒ„๊น… ์ •๋ณด (HTML ์ผ๋ถ€):\n{debug_info}"
        return result, debug_output

    except Exception as e:
        # Surface the failure to the UI instead of crashing the app.
        return f"An error occurred: {str(e)}", ""
 
 
51
 
52
# Gradio interface configuration.
title = "๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ ์Šคํฌ๋ž˜ํผ"
description = "๋„ค์ด๋ฒ„ ๋ธ”๋กœ๊ทธ URL์„ ์ž…๋ ฅํ•˜๋ฉด ์ œ๋ชฉ๊ณผ ๋‚ด์šฉ์„ ์Šคํฌ๋ž˜ํ•‘ํ•ฉ๋‹ˆ๋‹ค."

# NOTE: gr.inputs.Textbox / gr.outputs.Textbox were deprecated in Gradio 3.0
# and later removed; components are instantiated directly from the top-level
# gr namespace. Two output boxes match scrape_naver_blog's (result, debug)
# return tuple.
iface = gr.Interface(
    fn=scrape_naver_blog,
    inputs=gr.Textbox(lines=2, placeholder="https://blog.naver.com/...", label="๋ธ”๋กœ๊ทธ URL"),
    outputs=[
        gr.Textbox(label="๊ฒฐ๊ณผ"),
        gr.Textbox(label="๋””๋ฒ„๊น… ์ •๋ณด"),
    ],
    title=title,
    description=description,
    allow_flagging="never",
)

iface.launch()