Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,6 +3,7 @@ import trafilatura
|
|
| 3 |
import requests
|
| 4 |
from markdownify import markdownify as md
|
| 5 |
from bs4 import BeautifulSoup
|
|
|
|
| 6 |
|
| 7 |
def extract(url):
|
| 8 |
headers = {"User-Agent": "Mozilla/5.0"}
|
|
@@ -21,13 +22,15 @@ def extract(url):
|
|
| 21 |
if not html_content:
|
| 22 |
return "๋ณธ๋ฌธ์ ์ถ์ถํ ์ ์์ต๋๋ค."
|
| 23 |
|
| 24 |
-
# 이미지
|
| 25 |
soup = BeautifulSoup(r.text, "lxml")
|
| 26 |
images = []
|
| 27 |
for img in soup.find_all("img"):
|
| 28 |
src = img.get("src")
|
| 29 |
-
if src
|
| 30 |
-
|
|
|
|
|
|
|
| 31 |
|
| 32 |
# HTML → Markdown 변환
|
| 33 |
markdown_text = md(html_content, heading_style="ATX")
|
|
|
|
| 3 |
import requests
|
| 4 |
from markdownify import markdownify as md
|
| 5 |
from bs4 import BeautifulSoup
|
| 6 |
+
from urllib.parse import urljoin
|
| 7 |
|
| 8 |
def extract(url):
|
| 9 |
headers = {"User-Agent": "Mozilla/5.0"}
|
|
|
|
| 22 |
if not html_content:
|
| 23 |
return "๋ณธ๋ฌธ์ ์ถ์ถํ ์ ์์ต๋๋ค."
|
| 24 |
|
| 25 |
+
# 이미지 절대 경로 변환
|
| 26 |
soup = BeautifulSoup(r.text, "lxml")
|
| 27 |
images = []
|
| 28 |
for img in soup.find_all("img"):
|
| 29 |
src = img.get("src")
|
| 30 |
+
if src:
|
| 31 |
+
full_url = urljoin(url, src) # 상대 경로 → 절대 경로
|
| 32 |
+
if full_url.startswith("http"):
|
| 33 |
+
images.append(f"")
|
| 34 |
|
| 35 |
# HTML → Markdown 변환
|
| 36 |
markdown_text = md(html_content, heading_style="ATX")
|