orgoflu committed on
Commit
df6d951
·
verified ·
1 Parent(s): 569f84c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -3
app.py CHANGED
@@ -3,6 +3,7 @@ import trafilatura
3
  import requests
4
  from markdownify import markdownify as md
5
  from bs4 import BeautifulSoup
 
6
 
7
  def extract(url):
8
  headers = {"User-Agent": "Mozilla/5.0"}
@@ -21,13 +22,15 @@ def extract(url):
21
  if not html_content:
22
  return "본문을 추출할 수 없습니다."
23
 
24
- # 이미지 태그 추출
25
  soup = BeautifulSoup(r.text, "lxml")
26
  images = []
27
  for img in soup.find_all("img"):
28
  src = img.get("src")
29
- if src and src.startswith("http"):
30
- images.append(f"![이미지]({src})")
 
 
31
 
32
  # HTML → Markdown 변환
33
  markdown_text = md(html_content, heading_style="ATX")
 
3
  import requests
4
  from markdownify import markdownify as md
5
  from bs4 import BeautifulSoup
6
+ from urllib.parse import urljoin
7
 
8
  def extract(url):
9
  headers = {"User-Agent": "Mozilla/5.0"}
 
22
  if not html_content:
23
  return "본문을 추출할 수 없습니다."
24
 
25
+ # 이미지 절대 경로 변환
26
  soup = BeautifulSoup(r.text, "lxml")
27
  images = []
28
  for img in soup.find_all("img"):
29
  src = img.get("src")
30
+ if src:
31
+ full_url = urljoin(url, src) # 상대 경로 → 절대 경로
32
+ if full_url.startswith("http"):
33
+ images.append(f"![이미지]({full_url})")
34
 
35
  # HTML → Markdown 변환
36
  markdown_text = md(html_content, heading_style="ATX")