Kims12 committed on
Commit
ddfb11c
·
verified ·
1 Parent(s): a241c57

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -23
app.py CHANGED
@@ -23,36 +23,44 @@ def scrape_naver_blog(url):
23
  soup = BeautifulSoup(response.content, 'html.parser')
24
 
25
  # 제목 추출
26
- # 네이버 블로그의 제목은 일반적으로 <h3> 태그 또는 특정 클래스명을 가집니다.
27
- # 실제 HTML 구조에 맞게 수정해야 합니다.
28
- title_element = soup.find('h3', {'class': 'se_textarea'}) # 예시 클래스명
29
- if not title_element:
30
- # 다른 가능한 위치 시도
31
- title_element = soup.find('span', {'class': 'se-fs- se-ff-'})
 
 
32
 
33
- if not title_element:
 
 
 
 
 
 
 
34
  print("제목을 찾을 수 없습니다.")
35
  title = "제목을 찾을 수 없습니다."
36
- else:
37
- title = title_element.get_text(strip=True)
38
- print(f"추출된 제목: {title}")
39
 
40
  # 내용 텍스트 추출
41
- # 네이버 블로그의 내용은 일반적으로 <div> 태그 내에 특정 클래스명을 가집니다.
42
- # 실제 HTML 구조에 맞게 수정해야 합니다.
43
- content_elements = soup.find_all('span', {'class': 'se-fs- se-ff-'}) # 예시 클래스명
44
 
45
- if not content_elements:
46
- # 다른 가능한 위치 시도
47
- content_elements = soup.find_all('div', {'class': 'se-component se-text se-l-default'})
48
-
49
- if not content_elements:
50
- print("내용을 찾을 수 없습니다.")
51
- content = "내용을 찾을 수 없습니다."
52
  else:
53
- # 여러 span 태그가 있을 수 있으므로, 필요한 부분을 합칩니다.
54
- content = ' '.join([elem.get_text(strip=True) for elem in content_elements])
55
- print(f"추출된 내용: {content}")
 
 
 
 
 
56
 
57
  # 출력 형식 지정
58
  output = f"제목: {title}\n\n내용: {content}"
 
23
  soup = BeautifulSoup(response.content, 'html.parser')
24
 
25
  # 제목 추출
26
+ # 실제 HTML 구조에 맞게 클래스명과 태그를 수정해야 합니다.
27
+ title = None
28
+
29
+ # 예시 1: <h3 class="se_textarea">에 제목이 있는 경우
30
+ title_element = soup.find('h3', class_='se_textarea')
31
+ if title_element and title_element.get_text(strip=True):
32
+ title = title_element.get_text(strip=True)
33
+ print(f"추출된 제목 (h3.se_textarea): {title}")
34
 
35
+ # 예시 2: meta 태그에서 제목 추출
36
+ if not title:
37
+ title_meta = soup.find('meta', property='og:title')
38
+ if title_meta and title_meta.get('content'):
39
+ title = title_meta.get('content').strip()
40
+ print(f"추출된 제목 (meta og:title): {title}")
41
+
42
+ if not title:
43
  print("제목을 찾을 수 없습니다.")
44
  title = "제목을 찾을 수 없습니다."
 
 
 
45
 
46
  # 내용 텍스트 추출
47
+ # 실제 HTML 구조에 맞게 클래스명과 태그를 수정해야 합니다.
48
+ content = None
 
49
 
50
+ # 예시 1: <div class="se-main-container"> 내의 모든 텍스트 추출
51
+ content_container = soup.find('div', class_='se-main-container')
52
+ if content_container:
53
+ content = content_container.get_text(separator='\n', strip=True)
54
+ print(f"추출된 내용 (div.se-main-container): {content[:100]}...") # 일부만 출력
 
 
55
  else:
56
+ # 예시 2: 모든 <p> 태그를 합치는 방법
57
+ p_tags = soup.find_all('p')
58
+ if p_tags:
59
+ content = '\n'.join([p.get_text(strip=True) for p in p_tags])
60
+ print(f"추출된 내용 (p tags): {content[:100]}...")
61
+ else:
62
+ print("내용을 찾을 수 없습니다.")
63
+ content = "내용을 찾을 수 없습니다."
64
 
65
  # 출력 형식 지정
66
  output = f"제목: {title}\n\n내용: {content}"