# Hugging Face Space: BERT-based product information extractor.
# (Removed non-code page-header residue "Spaces: / Sleeping / Sleeping"
# left over from a web-page extraction of this file.)
import json
import re
from datetime import datetime, timezone

import gradio as gr
import torch
from bs4 import BeautifulSoup
from transformers import AutoModel, AutoTokenizer
| # Initialize BERT model and tokenizer | |
| model_name = "bert-base-uncased" | |
| tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| model = AutoModel.from_pretrained(model_name) | |
| def get_bert_embedding(text): | |
| """Get BERT embedding for text""" | |
| inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512, padding=True) | |
| with torch.no_grad(): | |
| outputs = model(**inputs) | |
| return outputs.last_hidden_state.mean(dim=1).squeeze().numpy() | |
| def calculate_similarity(text1, text2): | |
| """Calculate similarity between two texts using BERT embeddings""" | |
| emb1 = get_bert_embedding(text1) | |
| emb2 = get_bert_embedding(text2) | |
| return float(torch.nn.functional.cosine_similarity( | |
| torch.tensor(emb1).unsqueeze(0), | |
| torch.tensor(emb2).unsqueeze(0) | |
| )) | |
| def extract_product_info(html_content): | |
| """Extract product information using BERT-based analysis""" | |
| try: | |
| soup = BeautifulSoup(html_content, 'html.parser') | |
| # Define common patterns for each field | |
| patterns = { | |
| 'title': [ | |
| 'product name', | |
| 'item title', | |
| 'product title' | |
| ], | |
| 'price': [ | |
| 'price', | |
| 'current price', | |
| 'sale price' | |
| ], | |
| 'description': [ | |
| 'product description', | |
| 'about this item', | |
| 'product details' | |
| ] | |
| } | |
| product_info = { | |
| "title": {"value": None, "confidence": 0}, | |
| "price": {"value": None, "confidence": 0}, | |
| "description": {"value": None, "confidence": 0}, | |
| "specifications": {"value": {}, "confidence": {}}, | |
| "extracted_at": datetime.utcnow().isoformat() | |
| } | |
| # Extract text content from potential elements | |
| for tag in soup.find_all(['h1', 'h2', 'div', 'p', 'span']): | |
| text = ' '.join(tag.stripped_strings) | |
| if not text or len(text) < 3: | |
| continue | |
| # Check each field | |
| for field, examples in patterns.items(): | |
| max_similarity = max(calculate_similarity(text, example) for example in examples) | |
| # Update if we found a better match | |
| if max_similarity > product_info[field]["confidence"]: | |
| clean_text = re.sub(r'\s+', ' ', text).strip() | |
| # Special handling for price | |
| if field == 'price': | |
| price_match = re.search(r'[\$\£\€\₹]?\s*(\d+(?:,\d{3})*(?:\.\d{2})?)', clean_text) | |
| if price_match: | |
| try: | |
| price = float(price_match.group(1).replace(',', '')) | |
| product_info[field]["value"] = price | |
| product_info[field]["confidence"] = max_similarity | |
| except ValueError: | |
| continue | |
| else: | |
| product_info[field]["value"] = clean_text | |
| product_info[field]["confidence"] = max_similarity | |
| # Extract specifications from structured elements | |
| spec_sections = soup.find_all(['table', 'dl', 'div'], class_=re.compile(r'spec|detail|info|attribute', re.I)) | |
| for section in spec_sections: | |
| # Look for key-value pairs | |
| pairs = re.findall(r'([^:]+):\s*([^:]+)(?=\s+[^:]+:|$)', section.get_text()) | |
| for key, value in pairs: | |
| key = key.strip() | |
| value = value.strip() | |
| if key and value and len(key) < 100: # Sanity check | |
| product_info["specifications"]["value"][key] = value | |
| product_info["specifications"]["confidence"][key] = 0.8 | |
| # Clean up results | |
| # Remove entries with very low confidence | |
| confidence_threshold = 0.5 | |
| for field in ["title", "price", "description"]: | |
| if product_info[field]["confidence"] < confidence_threshold: | |
| product_info[field]["value"] = None | |
| product_info[field]["confidence"] = 0 | |
| return json.dumps(product_info, indent=2, ensure_ascii=False) | |
| except Exception as e: | |
| return f"Error: {str(e)}" | |
| # Create Gradio interface | |
| demo = gr.Interface( | |
| fn=extract_product_info, | |
| inputs=gr.Textbox( | |
| label="Product Page HTML", | |
| lines=10, | |
| placeholder="Paste the HTML content of a product page here..." | |
| ), | |
| outputs=gr.Textbox( | |
| label="Extracted Product Information", | |
| lines=10 | |
| ), | |
| title="AI-Powered Product Information Extractor", | |
| description="Uses BERT to accurately extract product information from e-commerce pages", | |
| ) | |
| if __name__ == "__main__": | |
| demo.launch() |