amalsp committed
Commit 2db3fe7 · verified · Parent: 82f5373

Fix content extraction to provide clean, structured product data

Files changed (1): main.py (+66, -24)
main.py CHANGED
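In short: the old scrape_all_content returned one row per visible DOM element, while the rewritten version returns one row per detected product container (or per heading, in the fallback path). A hypothetical row under each schema; the field names come from the diff below, but the values are invented for illustration:

# Old schema: one row per DOM element
{"Type": "div", "Content": "Acme Widget $19.99 In stock ...", "Class": "product-card", "ID": ""}

# New schema: one row per product (or per heading in the fallback)
{"Title": "Acme Widget", "Price": "$19.99", "Description": "A sturdy widget.", "Link": "/p/acme-widget"}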
@@ -77,37 +77,79 @@ def scrape_links(soup: BeautifulSoup):
     return df
 
 def scrape_all_content(soup: BeautifulSoup):
-    # Extract ALL visible text content from the page
+    # IMPROVED: Extract only meaningful product/content data
     data = []
 
-    # Get all divs, spans, and p tags with text
-    for element in soup.find_all(["div", "span", "p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "td", "th"]):
-        text = element.get_text(strip=True)
-        if text and len(text) > 2:  # Only include meaningful text
-            # Get element classes and id for context
-            classes = " ".join(element.get("class", []))
-            elem_id = element.get("id", "")
-            elem_type = element.name
-
-            data.append({
-                "Type": elem_type,
-                "Content": text[:500],  # Limit to 500 chars per element
-                "Class": classes[:100] if classes else "",
-                "ID": elem_id[:50] if elem_id else ""
-            })
+    # Remove unwanted elements (navigation, scripts, styles, ads)
+    for tag in soup(["script", "style", "nav", "header", "footer", "aside", "iframe"]):
+        tag.decompose()
+
+    # Try to find product/article containers first (common e-commerce patterns)
+    product_containers = soup.find_all(
+        attrs={
+            "class": re.compile(r"product|item|card|listing|article", re.I)
+        }
+    )
+
+    # If we find product containers, extract from them
+    if product_containers and len(product_containers) > 5:
+        for container in product_containers[:100]:  # Limit to first 100 items
+            # Extract title/name
+            title_elem = container.find(["h1", "h2", "h3", "h4", "a"],
+                                        attrs={"class": re.compile(r"title|name|heading", re.I)})
+            title = title_elem.get_text(strip=True) if title_elem else ""
+
+            # Extract price
+            price_elem = container.find(attrs={"class": re.compile(r"price|cost|amount", re.I)})
+            price = price_elem.get_text(strip=True) if price_elem else ""
+
+            # Extract description
+            desc_elem = container.find(["p", "div"],
+                                       attrs={"class": re.compile(r"desc|detail|summary", re.I)})
+            description = desc_elem.get_text(strip=True)[:200] if desc_elem else ""
+
+            # Extract link
+            link_elem = container.find("a", href=True)
+            link = link_elem["href"] if link_elem else ""
+
+            if title or price:  # Only add if we have meaningful data
+                data.append({
+                    "Title": title[:200],
+                    "Price": price[:50],
+                    "Description": description,
+                    "Link": link[:300]
+                })
+
+    # Fallback: If no product containers found, extract main content
+    else:
+        # Look for main content area
+        main_content = soup.find(["main", "article", "div"],
+                                 attrs={"id": re.compile(r"main|content|primary", re.I)}) or soup
+
+        # Extract headings and associated content
+        for heading in main_content.find_all(["h1", "h2", "h3"]):
+            heading_text = heading.get_text(strip=True)
+            if len(heading_text) > 5:  # Skip very short headings
+                # Get next sibling paragraph or div
+                content = ""
+                next_elem = heading.find_next_sibling(["p", "div", "ul"])
+                if next_elem:
+                    content = next_elem.get_text(strip=True)[:300]
+
+                data.append({
+                    "Title": heading_text[:200],
+                    "Price": "",
+                    "Description": content,
+                    "Link": ""
+                })
 
     if not data:
-        raise HTTPException(status_code=400, detail="No content found on page")
+        raise HTTPException(status_code=400, detail="No meaningful content found on page. Try 'Tables' or 'Links' mode instead.")
 
-    # Remove duplicate content
-    seen = set()
-    unique_data = []
-    for item in data:
-        if item["Content"] not in seen:
-            seen.add(item["Content"])
-            unique_data.append(item)
-
-    df = pd.DataFrame(unique_data)
+    # Remove exact duplicates
+    df = pd.DataFrame(data)
+    df = df.drop_duplicates(subset=["Title"], keep="first")
 
     return df
 
 @app.post("/scrape")
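To sanity-check the new extraction path, here is a minimal, self-contained sketch (not part of the commit) that exercises the same container-detection and deduplication patterns on a toy page. The HTML and values are made up; it assumes beautifulsoup4 and pandas are installed, and note that main.py must import re somewhere for the re.compile() calls in this diff to run. Also note the real function only takes the container path when it finds more than five candidates; with fewer, it falls through to heading extraction.

import re
import pandas as pd
from bs4 import BeautifulSoup

# Toy markup: two "product card" divs whose class names match the
# commit's regexes (product|item|card..., title|name..., price|cost...).
html = """
<main>
  <div class="product-card">
    <h3 class="name">Acme Widget</h3>
    <span class="price">$19.99</span>
    <p class="desc">A sturdy widget.</p>
    <a href="/p/acme-widget">View</a>
  </div>
  <div class="product-card">
    <h3 class="name">Acme Gadget</h3>
    <span class="price">$29.99</span>
    <p class="desc">A shiny gadget.</p>
    <a href="/p/acme-gadget">View</a>
  </div>
</main>
"""
soup = BeautifulSoup(html, "html.parser")

# Same container-detection pattern as the commit; BeautifulSoup applies
# the regex to each class value, so "product-card" matches "product".
containers = soup.find_all(attrs={"class": re.compile(r"product|item|card|listing|article", re.I)})

rows = []
for c in containers:
    title_elem = c.find(["h1", "h2", "h3", "h4", "a"],
                        attrs={"class": re.compile(r"title|name|heading", re.I)})
    price_elem = c.find(attrs={"class": re.compile(r"price|cost|amount", re.I)})
    link_elem = c.find("a", href=True)
    rows.append({
        "Title": title_elem.get_text(strip=True) if title_elem else "",
        "Price": price_elem.get_text(strip=True) if price_elem else "",
        "Link": link_elem["href"] if link_elem else "",
    })

# Same deduplication step as the commit
df = pd.DataFrame(rows).drop_duplicates(subset=["Title"], keep="first")
print(df)  # two rows: Acme Widget / $19.99 and Acme Gadget / $29.99

One behavioral caveat worth flagging in review: deduplicating on "Title" alone will drop distinct products that happen to share a name, which the old Content-based dedup would have kept.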