# Commit metadata (pasted from hosting UI) — kept as comments so the file parses:
# Author: Nishant Sahu
# Message: Updated to fetch product info directly from URLs
# Commit: 5592acd
import json
import re
from datetime import datetime, timezone

import gradio as gr
import torch
from bs4 import BeautifulSoup
from transformers import AutoModel, AutoTokenizer
# Initialize BERT model and tokenizer once at import time.
# NOTE(review): from_pretrained downloads weights on first use — presumably
# cached by the transformers library afterwards; confirm in deployment.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare AutoModel (no task head): only the hidden states are used downstream.
model = AutoModel.from_pretrained(model_name)
def get_bert_embedding(text):
    """Encode *text* into a single mean-pooled BERT vector (numpy array)."""
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )
    # Inference only — no gradients needed.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    # Average over the token axis to collapse the sequence to one vector.
    return hidden.mean(dim=1).squeeze().numpy()
def calculate_similarity(text1, text2):
    """Cosine similarity between the BERT embeddings of two strings."""
    vec_a = torch.tensor(get_bert_embedding(text1)).unsqueeze(0)
    vec_b = torch.tensor(get_bert_embedding(text2)).unsqueeze(0)
    similarity = torch.nn.functional.cosine_similarity(vec_a, vec_b)
    return float(similarity)
def extract_product_info(html_content):
    """Extract product information from product-page HTML via BERT similarity.

    Candidate text nodes are scored against reference phrases for each field
    (title / price / description); the best-scoring match wins. Key-value
    specifications are scraped from spec-like sections with a regex.

    Args:
        html_content: Raw HTML of a product page as a string.

    Returns:
        A pretty-printed JSON string with per-field ``value``/``confidence``
        entries, a ``specifications`` mapping, and an ``extracted_at`` UTC
        timestamp — or an ``"Error: ..."`` string if anything fails.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        # Reference phrases describing each field we want to locate.
        patterns = {
            'title': [
                'product name',
                'item title',
                'product title'
            ],
            'price': [
                'price',
                'current price',
                'sale price'
            ],
            'description': [
                'product description',
                'about this item',
                'product details'
            ]
        }
        # Embed every reference phrase ONCE up front. The previous version
        # re-embedded all phrases (and the tag text, once per field) for
        # every candidate tag — O(tags * phrases) model forward passes.
        pattern_embs = {
            field: torch.stack(
                [torch.tensor(get_bert_embedding(p)) for p in phrases]
            )
            for field, phrases in patterns.items()
        }
        product_info = {
            "title": {"value": None, "confidence": 0},
            "price": {"value": None, "confidence": 0},
            "description": {"value": None, "confidence": 0},
            "specifications": {"value": {}, "confidence": {}},
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # and produces a naive datetime.
            "extracted_at": datetime.now(timezone.utc).isoformat()
        }
        # Score the text of each plausible element against every field.
        for tag in soup.find_all(['h1', 'h2', 'div', 'p', 'span']):
            text = ' '.join(tag.stripped_strings)
            if not text or len(text) < 3:
                continue
            # Embed the candidate text once, reuse for all three fields.
            text_emb = torch.tensor(get_bert_embedding(text)).unsqueeze(0)
            for field, embs in pattern_embs.items():
                sims = torch.nn.functional.cosine_similarity(text_emb, embs)
                max_similarity = float(sims.max())
                # Keep only the best-scoring candidate per field.
                if max_similarity > product_info[field]["confidence"]:
                    clean_text = re.sub(r'\s+', ' ', text).strip()
                    if field == 'price':
                        # Prices need a numeric value, not the raw snippet.
                        price_match = re.search(
                            r'[\$\£\€\₹]?\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
                            clean_text,
                        )
                        if price_match:
                            try:
                                price = float(price_match.group(1).replace(',', ''))
                                product_info[field]["value"] = price
                                product_info[field]["confidence"] = max_similarity
                            except ValueError:
                                continue
                    else:
                        product_info[field]["value"] = clean_text
                        product_info[field]["confidence"] = max_similarity
        # Extract specifications from structured, spec-looking sections.
        spec_sections = soup.find_all(
            ['table', 'dl', 'div'],
            class_=re.compile(r'spec|detail|info|attribute', re.I),
        )
        for section in spec_sections:
            # Greedy "key: value" pair extraction from flattened text.
            pairs = re.findall(r'([^:]+):\s*([^:]+)(?=\s+[^:]+:|$)', section.get_text())
            for key, value in pairs:
                key = key.strip()
                value = value.strip()
                if key and value and len(key) < 100:  # Sanity check on key size.
                    product_info["specifications"]["value"][key] = value
                    # Fixed heuristic confidence — these pairs are regex-matched,
                    # not BERT-scored.
                    product_info["specifications"]["confidence"][key] = 0.8
        # Drop low-confidence matches rather than return noise.
        confidence_threshold = 0.5
        for field in ["title", "price", "description"]:
            if product_info[field]["confidence"] < confidence_threshold:
                product_info[field]["value"] = None
                product_info[field]["confidence"] = 0
        return json.dumps(product_info, indent=2, ensure_ascii=False)
    except Exception as e:
        # Top-level Gradio boundary: surface the failure as output text.
        return f"Error: {str(e)}"
# Create Gradio interface: one HTML textbox in, one JSON-text box out,
# wired straight to extract_product_info.
demo = gr.Interface(
    fn=extract_product_info,
    inputs=gr.Textbox(
        label="Product Page HTML",
        lines=10,
        placeholder="Paste the HTML content of a product page here..."
    ),
    outputs=gr.Textbox(
        label="Extracted Product Information",
        lines=10
    ),
    title="AI-Powered Product Information Extractor",
    description="Uses BERT to accurately extract product information from e-commerce pages",
)
# Launch the web UI only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()