# Commit metadata (pasted from hosting UI) — kept as comments so the file parses:
# Author: Nishant Sahu
# Message: Updated to fetch product info directly from URLs
# Commit: 5592acd
import json
import re
from datetime import datetime, timezone

import gradio as gr
import torch
from bs4 import BeautifulSoup
from transformers import AutoModel, AutoTokenizer
# Initialize BERT model and tokenizer once at import time.
# NOTE(review): from_pretrained downloads weights on first use — presumably
# cached by the transformers library afterwards; confirm in deployment.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Bare AutoModel (no task head): only the hidden states are used downstream.
model = AutoModel.from_pretrained(model_name)
def get_bert_embedding(text):
    """Encode *text* into a single mean-pooled BERT vector (numpy array)."""
    encoded = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        max_length=512,
        padding=True,
    )
    # Inference only — no gradients needed.
    with torch.no_grad():
        hidden = model(**encoded).last_hidden_state
    # Average over the token axis to collapse the sequence to one vector.
    return hidden.mean(dim=1).squeeze().numpy()
def calculate_similarity(text1, text2):
    """Cosine similarity between the BERT embeddings of two strings."""
    vec_a = torch.tensor(get_bert_embedding(text1)).unsqueeze(0)
    vec_b = torch.tensor(get_bert_embedding(text2)).unsqueeze(0)
    similarity = torch.nn.functional.cosine_similarity(vec_a, vec_b)
    return float(similarity)
def extract_product_info(html_content):
    """Extract product information from product-page HTML via BERT similarity.

    Candidate text nodes are scored against reference phrases for each field
    (title / price / description); the best-scoring match wins. Key-value
    specifications are scraped from spec-like sections with a regex.

    Args:
        html_content: Raw HTML of a product page as a string.

    Returns:
        A pretty-printed JSON string with per-field ``value``/``confidence``
        entries, a ``specifications`` mapping, and an ``extracted_at`` UTC
        timestamp — or an ``"Error: ..."`` string if anything fails.
    """
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        # Reference phrases describing each field we want to locate.
        patterns = {
            'title': [
                'product name',
                'item title',
                'product title'
            ],
            'price': [
                'price',
                'current price',
                'sale price'
            ],
            'description': [
                'product description',
                'about this item',
                'product details'
            ]
        }
        # Embed every reference phrase ONCE up front. The previous version
        # re-embedded all phrases (and the tag text, once per field) for
        # every candidate tag — O(tags * phrases) model forward passes.
        pattern_embs = {
            field: torch.stack(
                [torch.tensor(get_bert_embedding(p)) for p in phrases]
            )
            for field, phrases in patterns.items()
        }
        product_info = {
            "title": {"value": None, "confidence": 0},
            "price": {"value": None, "confidence": 0},
            "description": {"value": None, "confidence": 0},
            "specifications": {"value": {}, "confidence": {}},
            # Timezone-aware UTC timestamp; datetime.utcnow() is deprecated
            # and produces a naive datetime.
            "extracted_at": datetime.now(timezone.utc).isoformat()
        }
        # Score the text of each plausible element against every field.
        for tag in soup.find_all(['h1', 'h2', 'div', 'p', 'span']):
            text = ' '.join(tag.stripped_strings)
            if not text or len(text) < 3:
                continue
            # Embed the candidate text once, reuse for all three fields.
            text_emb = torch.tensor(get_bert_embedding(text)).unsqueeze(0)
            for field, embs in pattern_embs.items():
                sims = torch.nn.functional.cosine_similarity(text_emb, embs)
                max_similarity = float(sims.max())
                # Keep only the best-scoring candidate per field.
                if max_similarity > product_info[field]["confidence"]:
                    clean_text = re.sub(r'\s+', ' ', text).strip()
                    if field == 'price':
                        # Prices need a numeric value, not the raw snippet.
                        price_match = re.search(
                            r'[\$\£\€\₹]?\s*(\d+(?:,\d{3})*(?:\.\d{2})?)',
                            clean_text,
                        )
                        if price_match:
                            try:
                                price = float(price_match.group(1).replace(',', ''))
                                product_info[field]["value"] = price
                                product_info[field]["confidence"] = max_similarity
                            except ValueError:
                                continue
                    else:
                        product_info[field]["value"] = clean_text
                        product_info[field]["confidence"] = max_similarity
        # Extract specifications from structured, spec-looking sections.
        spec_sections = soup.find_all(
            ['table', 'dl', 'div'],
            class_=re.compile(r'spec|detail|info|attribute', re.I),
        )
        for section in spec_sections:
            # Greedy "key: value" pair extraction from flattened text.
            pairs = re.findall(r'([^:]+):\s*([^:]+)(?=\s+[^:]+:|$)', section.get_text())
            for key, value in pairs:
                key = key.strip()
                value = value.strip()
                if key and value and len(key) < 100:  # Sanity check on key size.
                    product_info["specifications"]["value"][key] = value
                    # Fixed heuristic confidence — these pairs are regex-matched,
                    # not BERT-scored.
                    product_info["specifications"]["confidence"][key] = 0.8
        # Drop low-confidence matches rather than return noise.
        confidence_threshold = 0.5
        for field in ["title", "price", "description"]:
            if product_info[field]["confidence"] < confidence_threshold:
                product_info[field]["value"] = None
                product_info[field]["confidence"] = 0
        return json.dumps(product_info, indent=2, ensure_ascii=False)
    except Exception as e:
        # Top-level Gradio boundary: surface the failure as output text.
        return f"Error: {str(e)}"
# Create Gradio interface: one HTML textbox in, one JSON-text box out,
# wired straight to extract_product_info.
demo = gr.Interface(
    fn=extract_product_info,
    inputs=gr.Textbox(
        label="Product Page HTML",
        lines=10,
        placeholder="Paste the HTML content of a product page here..."
    ),
    outputs=gr.Textbox(
        label="Extracted Product Information",
        lines=10
    ),
    title="AI-Powered Product Information Extractor",
    description="Uses BERT to accurately extract product information from e-commerce pages",
)
# Launch the web UI only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()