Spaces:
Sleeping
Sleeping
Fix content extraction to provide clean, structured product data
Browse files
main.py
CHANGED
|
@@ -77,37 +77,79 @@ def scrape_links(soup: BeautifulSoup):
|
|
| 77 |
return df
|
| 78 |
|
| 79 |
def scrape_all_content(soup: BeautifulSoup):
|
| 80 |
-
# Extract
|
| 81 |
data = []
|
| 82 |
|
| 83 |
-
#
|
| 84 |
-
for
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
|
| 99 |
if not data:
|
| 100 |
-
raise HTTPException(status_code=400, detail="No content found on page")
|
| 101 |
|
| 102 |
-
# Remove
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
for item in data:
|
| 106 |
-
if item["Content"] not in seen:
|
| 107 |
-
seen.add(item["Content"])
|
| 108 |
-
unique_data.append(item)
|
| 109 |
|
| 110 |
-
df = pd.DataFrame(unique_data)
|
| 111 |
return df
|
| 112 |
|
| 113 |
@app.post("/scrape")
|
|
|
|
| 77 |
return df
|
| 78 |
|
| 79 |
def scrape_all_content(soup: BeautifulSoup):
    """Extract structured product/content rows from a parsed HTML page.

    Strips boilerplate elements, then tries two strategies:
      1. E-commerce style: find repeated "product/item/card"-classed containers
         and pull title, price, description, and link from each.
      2. Fallback: walk the main content area's headings and pair each with
         its following paragraph/div/list.

    Args:
        soup: Parsed page. NOTE: this function mutates it (decomposes
              script/style/nav/etc. elements in place).

    Returns:
        pandas.DataFrame with columns Title, Price, Description, Link,
        exact-duplicate rows removed.

    Raises:
        HTTPException(400): if neither strategy yields any data.
    """
    data = []

    # Remove unwanted elements (navigation, scripts, styles, ads) so their
    # text cannot leak into extracted content.
    for tag in soup(["script", "style", "nav", "header", "footer", "aside", "iframe"]):
        tag.decompose()

    # Try to find product/article containers first (common e-commerce patterns).
    product_containers = soup.find_all(
        attrs={
            "class": re.compile(r"product|item|card|listing|article", re.I)
        }
    )

    # Require more than 5 matches before trusting the pattern: a handful of
    # hits is more likely incidental class-name overlap than a product grid.
    if product_containers and len(product_containers) > 5:
        for container in product_containers[:100]:  # Limit to first 100 items
            # Extract title/name
            title_elem = container.find(["h1", "h2", "h3", "h4", "a"],
                                        attrs={"class": re.compile(r"title|name|heading", re.I)})
            title = title_elem.get_text(strip=True) if title_elem else ""

            # Extract price
            price_elem = container.find(attrs={"class": re.compile(r"price|cost|amount", re.I)})
            price = price_elem.get_text(strip=True) if price_elem else ""

            # Extract description (truncated to keep rows compact)
            desc_elem = container.find(["p", "div"],
                                       attrs={"class": re.compile(r"desc|detail|summary", re.I)})
            description = desc_elem.get_text(strip=True)[:200] if desc_elem else ""

            # Extract link (first anchor with an href)
            link_elem = container.find("a", href=True)
            link = link_elem["href"] if link_elem else ""

            if title or price:  # Only add if we have meaningful data
                data.append({
                    "Title": title[:200],
                    "Price": price[:50],
                    "Description": description,
                    "Link": link[:300]
                })

    # Fallback: If no product containers found, extract main content
    else:
        # Look for main content area; fall back to the whole soup.
        main_content = soup.find(["main", "article", "div"],
                                 attrs={"id": re.compile(r"main|content|primary", re.I)}) or soup

        # Extract headings and the content element immediately following each.
        for heading in main_content.find_all(["h1", "h2", "h3"]):
            heading_text = heading.get_text(strip=True)
            if len(heading_text) > 5:  # Skip very short headings
                # Get next sibling paragraph, div, or list.
                content = ""
                next_elem = heading.find_next_sibling(["p", "div", "ul"])
                if next_elem:
                    content = next_elem.get_text(strip=True)[:300]

                data.append({
                    "Title": heading_text[:200],
                    "Price": "",
                    "Description": content,
                    "Link": ""
                })

    if not data:
        raise HTTPException(status_code=400, detail="No meaningful content found on page. Try 'Tables' or 'Links' mode instead.")

    # Remove exact duplicates.
    # FIX: dedupe on the full row, not subset=["Title"] — deduping on Title
    # alone collapsed every title-less item (rows added with only a price)
    # into a single row, silently dropping distinct products.
    df = pd.DataFrame(data)
    df = df.drop_duplicates(keep="first")
    return df
|
| 154 |
|
| 155 |
@app.post("/scrape")
|