Spaces:
Sleeping
Sleeping
rsm-roguchi committed on
Commit ·
7ebaa26
1
Parent(s): 3242f2a
update brokens
Browse files
- app.py +4 -4
- server/general_blog.py +20 -22
app.py
CHANGED
|
@@ -5,7 +5,7 @@ import os
|
|
| 5 |
|
| 6 |
from ui import (
|
| 7 |
blog,
|
| 8 |
-
general_blog,
|
| 9 |
meta,
|
| 10 |
twitter,
|
| 11 |
price_matching
|
|
@@ -13,7 +13,7 @@ from ui import (
|
|
| 13 |
|
| 14 |
from server import (
|
| 15 |
blog as blog_srv,
|
| 16 |
-
general_blog as general_blog_srv,
|
| 17 |
meta as meta_srv,
|
| 18 |
twitter as twitter_srv,
|
| 19 |
price_matching as price_matching_srv
|
|
@@ -23,7 +23,7 @@ from server import (
|
|
| 23 |
ui = ui.page_fluid(
|
| 24 |
ui.page_navbar(
|
| 25 |
blog.ui,
|
| 26 |
-
general_blog.ui,
|
| 27 |
meta.ui,
|
| 28 |
twitter.ui,
|
| 29 |
price_matching.ui,
|
|
@@ -36,7 +36,7 @@ ui = ui.page_fluid(
|
|
| 36 |
|
| 37 |
def server(input, output, session):
|
| 38 |
blog_srv.server(input, output, session)
|
| 39 |
-
general_blog_srv.server(input, output, session)
|
| 40 |
meta_srv.server(input, output, session)
|
| 41 |
twitter_srv.server(input, output, session)
|
| 42 |
price_matching_srv.server(input, output, session)
|
|
|
|
| 5 |
|
| 6 |
from ui import (
|
| 7 |
blog,
|
| 8 |
+
#general_blog,
|
| 9 |
meta,
|
| 10 |
twitter,
|
| 11 |
price_matching
|
|
|
|
| 13 |
|
| 14 |
from server import (
|
| 15 |
blog as blog_srv,
|
| 16 |
+
#general_blog as general_blog_srv,
|
| 17 |
meta as meta_srv,
|
| 18 |
twitter as twitter_srv,
|
| 19 |
price_matching as price_matching_srv
|
|
|
|
| 23 |
ui = ui.page_fluid(
|
| 24 |
ui.page_navbar(
|
| 25 |
blog.ui,
|
| 26 |
+
#general_blog.ui,
|
| 27 |
meta.ui,
|
| 28 |
twitter.ui,
|
| 29 |
price_matching.ui,
|
|
|
|
| 36 |
|
| 37 |
def server(input, output, session):
|
| 38 |
blog_srv.server(input, output, session)
|
| 39 |
+
#general_blog_srv.server(input, output, session)
|
| 40 |
meta_srv.server(input, output, session)
|
| 41 |
twitter_srv.server(input, output, session)
|
| 42 |
price_matching_srv.server(input, output, session)
|
server/general_blog.py
CHANGED
|
@@ -17,29 +17,27 @@ SHOPIFY_API_VERSION = "2024-04"
|
|
| 17 |
BLOG_ID = "73667707064"
|
| 18 |
|
| 19 |
# === Static scraper for pokemon.com ===
|
| 20 |
-
def scrape_section_content_from_url(url: str) -> str:
|
| 21 |
try:
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
print(f"[INFO] Extracted {len(texts)} content blocks.")
|
| 42 |
-
return "\n\n".join(texts)
|
| 43 |
|
| 44 |
except Exception as e:
|
| 45 |
print(f"[ERROR] Scraping failed: {e}")
|
|
|
|
| 17 |
BLOG_ID = "73667707064"
|
| 18 |
|
| 19 |
# === Static scraper for pokemon.com ===
|
| 20 |
+
async def scrape_section_content_from_url(url: str) -> str:
|
| 21 |
try:
|
| 22 |
+
async with async_playwright() as p:
|
| 23 |
+
browser = await p.chromium.launch(headless=True)
|
| 24 |
+
page = await browser.new_page()
|
| 25 |
+
await page.goto(url, timeout=30000)
|
| 26 |
+
await page.wait_for_load_state("networkidle")
|
| 27 |
+
html = await page.content()
|
| 28 |
+
await browser.close()
|
| 29 |
+
|
| 30 |
+
soup = BeautifulSoup(html, "html.parser")
|
| 31 |
+
|
| 32 |
+
# Match all divs and extract text
|
| 33 |
+
content_blocks = soup.find_all("div")
|
| 34 |
+
if not content_blocks:
|
| 35 |
+
print("[WARN] No <div> elements found.")
|
| 36 |
+
return ""
|
| 37 |
+
|
| 38 |
+
texts = [div.get_text(separator=" ", strip=True) for div in content_blocks if div.get_text(strip=True)]
|
| 39 |
+
print(f"[INFO] Extracted {len(texts)} content blocks.")
|
| 40 |
+
return "\n\n".join(texts)
|
|
|
|
|
|
|
| 41 |
|
| 42 |
except Exception as e:
|
| 43 |
print(f"[ERROR] Scraping failed: {e}")
|