Update app.py
app.py CHANGED
@@ -55,17 +55,13 @@ async def get_page_content(session, url):
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
             content = []
-
-            # Look for the main content area
-            main_content = soup.find('div', id='react-entry-point')
-
+            main_content = soup.find('article') or soup.find('main') or soup
             if main_content:
-
-
-
-
-
-
+                for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
+                    for element in main_content.find_all(tag):
+                        text = clean_text(element.get_text(strip=True))
+                        if text:
+                            content.append(text)
             logger.info(f"Found {len(content)} content items for {url}")
             return content
         else:
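For reference, a minimal standalone sketch of the new extraction path: it prefers an article element, falls back to main, and finally to the whole document, then collects text from heading, paragraph and list-item tags. The clean_text helper below is only a stand-in for the one defined elsewhere in app.py, and the HTML snippet is invented for illustration.

from bs4 import BeautifulSoup

def clean_text(s):
    # Stand-in for app.py's helper; assumed to normalise whitespace.
    return " ".join(s.split())

html = """
<html><body>
  <nav><a href="/">Home</a></nav>
  <main>
    <h1>Getting Started</h1>
    <p>Build analytical web apps.</p>
    <ul><li>Install</li><li>Quickstart</li></ul>
  </main>
</body></html>
"""

soup = BeautifulSoup(html, 'html.parser')
# Prefer <article>, then <main>, then fall back to the whole document.
main_content = soup.find('article') or soup.find('main') or soup

content = []
for tag in ['p', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'li']:
    for element in main_content.find_all(tag):
        text = clean_text(element.get_text(strip=True))
        if text:
            content.append(text)

print(content)
# ['Build analytical web apps.', 'Getting Started', 'Install', 'Quickstart']

Because the loop walks tag by tag, the collected items come out grouped by tag name rather than in document order.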
@@ -82,18 +78,12 @@ async def get_links(session, url, base_url):
         if response.status == 200:
             text = await response.text()
             soup = BeautifulSoup(text, 'html.parser')
+            links = soup.find_all('a', href=True)
             valid_links = []
-
-
-
-
-            if main_content:
-                for link in main_content.find_all('a', href=True):
-                    href = link['href']
-                    full_url = urljoin(base_url, href)
-                    if full_url.startswith(base_url) and full_url != url:
-                        valid_links.append(full_url)
-
+            for link in links:
+                full_url = urljoin(url, link['href'])
+                if full_url.startswith(base_url) and full_url != url:
+                    valid_links.append(full_url)
             return valid_links
         else:
             logger.error(f"Error fetching links from {url}: HTTP {response.status}")
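A similar standalone sketch of the simplified link filter in get_links, with placeholder URLs: relative hrefs are now resolved against the current page URL via urljoin(url, ...) rather than base_url, and only links under base_url that differ from the page itself are kept.

from urllib.parse import urljoin

base_url = "https://docs.example.com"             # placeholder site root
url = "https://docs.example.com/installation"     # placeholder current page

hrefs = ["/layout", "minimal-app", "https://other.example.org/python/",
         "https://docs.example.com/installation"]

valid_links = []
for href in hrefs:
    full_url = urljoin(url, href)
    if full_url.startswith(base_url) and full_url != url:
        valid_links.append(full_url)

print(valid_links)
# ['https://docs.example.com/layout', 'https://docs.example.com/minimal-app']

Nothing deduplicates the list here; if a page repeats the same href, it is returned as many times as it appears.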
@@ -116,14 +106,27 @@ async def crawl_pages(base_url, max_depth):
             visited.add(current_url)
             start_time = time.time()

-
+            with get_db_connection() as conn:
+                c = conn.cursor()
+                c.execute("SELECT content FROM pages WHERE url = ?", (current_url,))
+                result = c.fetchone()
+
+            if result:
+                content = eval(result[0])  # Convert string back to list
+            else:
+                content = await get_page_content(session, current_url)
+                with get_db_connection() as conn:
+                    c = conn.cursor()
+                    c.execute("INSERT INTO pages VALUES (?, ?, ?)", (current_url, str(content), depth))
+                    conn.commit()
+
             all_pages.append((current_url, content))
             logger.info(f"Processed page: {current_url} at depth {depth} in {time.time() - start_time:.2f} seconds")

             if depth < max_depth:
                 links = await get_links(session, current_url, base_url)
                 for link in links:
-                    if link not in visited
+                    if link not in visited:
                         to_visit.append((link, depth + 1))

         return all_pages
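Finally, a standalone sketch of the caching pattern that crawl_pages now uses. get_db_connection and the pages table are defined elsewhere in app.py; the stand-ins below assume a local SQLite file with the (url, content, depth) columns implied by the INSERT statement. As in the diff, the content list is stored with str() and restored with eval(); json.dumps/json.loads would be a common alternative for a list of strings.

import sqlite3
from contextlib import contextmanager

@contextmanager
def get_db_connection(path="crawl_cache.db"):
    # Stand-in for app.py's connection helper.
    conn = sqlite3.connect(path)
    try:
        yield conn
    finally:
        conn.close()

def cached_page_content(url, depth, fetch):
    # fetch is any callable returning the list of content strings for url.
    with get_db_connection() as conn:
        c = conn.cursor()
        c.execute("CREATE TABLE IF NOT EXISTS pages (url TEXT, content TEXT, depth INTEGER)")
        c.execute("SELECT content FROM pages WHERE url = ?", (url,))
        result = c.fetchone()

    if result:
        return eval(result[0])  # convert the stored string back to a list, as in the diff

    content = fetch(url)
    with get_db_connection() as conn:
        c = conn.cursor()
        c.execute("INSERT INTO pages VALUES (?, ?, ?)", (url, str(content), depth))
        conn.commit()
    return content

print(cached_page_content("https://docs.example.com", 0,
                          lambda u: ["first paragraph", "second paragraph"]))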