"""
Schema Generator
Automatically generates JSON-LD schema markup for websites
"""

import json
import re
from datetime import datetime
from typing import Dict, List, Optional
from urllib.parse import urlparse

# Compiled once at import time; the generators below run them per page.
_EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
# The country-code prefix is a NON-capturing group so that ``findall`` returns
# the whole phone number instead of just the (often empty) prefix.
_PHONE_RE = re.compile(
    r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'
)
_DATE_RE = re.compile(r'(\d{4}-\d{2}-\d{2})')
_PRICE_RE = re.compile(r'(\$|SAR|ريال)\s*(\d+(?:\.\d{2})?)')
_ORG_SCHEMA_RE = re.compile(r'"@type"\s*:\s*"Organization"')

_SOCIAL_KEYWORDS = ('facebook', 'twitter', 'instagram', 'linkedin', 'youtube')
_SAUDI_CITIES = (
    'الرياض', 'جدة', 'مكة', 'المدينة', 'الدمام', 'الخبر', 'تبوك', 'أبها',
)
_MAX_SOCIAL_LINKS = 5


def _page_text(page: Dict) -> str:
    """Return the page's 'text' and 'paragraphs' values as one string."""
    text = page.get('text') or ''
    paragraphs = page.get('paragraphs') or ''
    if isinstance(paragraphs, (list, tuple)):
        # Join the items rather than scanning the list's repr.
        paragraphs = ' '.join(str(item) for item in paragraphs)
    return f"{text} {paragraphs}"


def _first_image_src(page: Dict) -> str:
    """Return the first image of *page* as a string, or '' if unavailable.

    An image entry may be a plain string or a dict with a 'src' key.
    """
    images = page.get('images') or []
    if not images:
        return ''
    first = images[0]
    if isinstance(first, str):
        return first
    if isinstance(first, dict):
        return first.get('src') or ''
    return ''


def generate_organization_schema(audit: Dict) -> Dict:
    """Generate Organization schema from audit data.

    Scans every page in ``audit['pages']`` for e-mail addresses, phone
    numbers and social-media links, and uses the first e-mail / phone found.

    Args:
        audit: Dict with optional keys 'pages', 'org_name' and 'url'.

    Returns:
        A schema.org Organization dict. 'email', 'telephone', 'sameAs' and
        'contactPoint' are present only when matching data was found.
    """
    pages = audit.get('pages', [])
    org_name = audit.get('org_name', 'Company')
    url = audit.get('url', '')

    emails = []
    phones = []
    social_links = []

    for page in pages:
        text = _page_text(page)
        emails.extend(_EMAIL_RE.findall(text))
        phones.extend(_PHONE_RE.findall(text))

        for link in page.get('links', []):
            if isinstance(link, dict):
                href = link.get('href') or ''
            else:
                href = str(link)
            lowered = href.lower()
            if any(keyword in lowered for keyword in _SOCIAL_KEYWORDS):
                social_links.append(href)

    schema = {
        "@context": "https://schema.org",
        "@type": "Organization",
        "name": org_name,
        "url": url,
        # Strip a trailing slash so the logo path has no double slash.
        "logo": f"{url.rstrip('/')}/logo.png",
        "description": f"{org_name} - خدمات متميزة"
    }

    if emails:
        schema["email"] = emails[0]

    if phones:
        schema["telephone"] = phones[0]

    if social_links:
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is deterministic (a set would shuffle the links).
        unique_links = list(dict.fromkeys(social_links))
        schema["sameAs"] = unique_links[:_MAX_SOCIAL_LINKS]

    if emails or phones:
        contact_point = {
            "@type": "ContactPoint",
            "contactType": "customer service",
        }
        # Only emit the fields that exist; None would serialize as null.
        if emails:
            contact_point["email"] = emails[0]
        if phones:
            contact_point["telephone"] = phones[0]
        schema["contactPoint"] = contact_point

    return schema


def generate_faq_schema(faqs: List[Dict]) -> Optional[Dict]:
    """Generate FAQPage schema from FAQ list.

    Args:
        faqs: Dicts holding a question under 'question' (or 'q') and an
            answer under 'answer' (or 'a'). Entries missing either part
            are skipped.

    Returns:
        A schema.org FAQPage dict, or None when *faqs* is empty or no entry
        has both a question and an answer.
    """
    if not faqs:
        return None

    main_entity = []
    for faq in faqs:
        question = faq.get('question', faq.get('q', ''))
        answer = faq.get('answer', faq.get('a', ''))
        if question and answer:
            main_entity.append({
                "@type": "Question",
                "name": question,
                "acceptedAnswer": {
                    "@type": "Answer",
                    "text": answer
                }
            })

    if not main_entity:
        return None

    return {
        "@context": "https://schema.org",
        "@type": "FAQPage",
        "mainEntity": main_entity
    }


def generate_breadcrumb_schema(url: str) -> Dict:
    """Generate BreadcrumbList schema from URL structure.

    The first item is always the site root; each non-empty path segment of
    *url* then becomes one further item whose name is the segment with
    '-' and '_' replaced by spaces and title-cased.

    Args:
        url: URL whose path is turned into breadcrumbs.

    Returns:
        A schema.org BreadcrumbList dict with 1-based positions.
    """
    parsed = urlparse(url)
    path_parts = [part for part in parsed.path.split('/') if part]
    root = f"{parsed.scheme}://{parsed.netloc}"

    items = [{
        "@type": "ListItem",
        "position": 1,
        "name": "الرئيسية",
        "item": root
    }]

    current_url = root
    for position, part in enumerate(path_parts, start=2):
        current_url += f"/{part}"
        items.append({
            "@type": "ListItem",
            "position": position,
            "name": part.replace('-', ' ').replace('_', ' ').title(),
            "item": current_url
        })

    return {
        "@context": "https://schema.org",
        "@type": "BreadcrumbList",
        "itemListElement": items
    }


def generate_website_schema(audit: Dict) -> Dict:
    """Generate WebSite schema with search action.

    Args:
        audit: Dict with optional keys 'org_name' and 'url'.

    Returns:
        A schema.org WebSite dict whose SearchAction targets
        ``<url>/search?q={search_term_string}``.
    """
    org_name = audit.get('org_name', 'Company')
    url = audit.get('url', '')

    return {
        "@context": "https://schema.org",
        "@type": "WebSite",
        "name": org_name,
        "url": url,
        "potentialAction": {
            "@type": "SearchAction",
            "target": {
                "@type": "EntryPoint",
                # Strip a trailing slash to avoid "//search" in the template.
                "urlTemplate": (
                    f"{url.rstrip('/')}/search?q={{search_term_string}}"
                )
            },
            "query-input": "required name=search_term_string"
        }
    }


def generate_article_schema(page: Dict, org_name: str) -> Dict:
    """Generate Article schema for blog posts.

    The publish date is the first ``YYYY-MM-DD`` string found in the page
    text; when none is found, today's date is used instead.

    Args:
        page: Dict with optional keys 'title', 'url', 'text' and 'images'.
        org_name: Name used for both the author and the publisher.

    Returns:
        A schema.org Article dict; 'image' is set only when the page has a
        usable first image.
    """
    title = page.get('title', '')
    url = page.get('url', '')
    text = page.get('text') or ''

    date_match = _DATE_RE.search(text)
    if date_match:
        publish_date = date_match.group(1)
    else:
        publish_date = datetime.now().strftime('%Y-%m-%d')

    schema = {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": title,
        "url": url,
        "datePublished": publish_date,
        "dateModified": publish_date,
        "author": {
            "@type": "Organization",
            "name": org_name
        },
        "publisher": {
            "@type": "Organization",
            "name": org_name
        }
    }

    first_image = _first_image_src(page)
    if first_image:
        schema["image"] = first_image

    return schema


def generate_product_schema(page: Dict, org_name: str) -> Dict:
    """Generate Product schema for product pages.

    The price is the first amount in the page text that follows ``$``,
    ``SAR`` or ``ريال``; when none is found the price is "0".

    Args:
        page: Dict with optional keys 'title', 'url', 'text' and 'images'.
        org_name: Name used for the offer's seller.

    Returns:
        A schema.org Product dict with a single Offer; 'image' is set only
        when the page has a usable first image.
    """
    title = page.get('title', '')
    url = page.get('url', '')
    text = page.get('text') or ''

    price_match = _PRICE_RE.search(text)
    if price_match:
        price = price_match.group(2)
        # Take the currency from the symbol attached to the price, so a "$"
        # price on a page that merely mentions SAR is not labelled SAR.
        currency = "USD" if price_match.group(1) == "$" else "SAR"
    else:
        price = "0"
        currency = "SAR" if "ريال" in text or "SAR" in text else "USD"

    schema = {
        "@context": "https://schema.org",
        "@type": "Product",
        "name": title,
        "url": url,
        "description": text[:200] if text else title,
        "offers": {
            "@type": "Offer",
            "price": price,
            "priceCurrency": currency,
            "availability": "https://schema.org/InStock",
            "seller": {
                "@type": "Organization",
                "name": org_name
            }
        }
    }

    first_image = _first_image_src(page)
    if first_image:
        schema["image"] = first_image

    return schema


def generate_local_business_schema(audit: Dict) -> Dict:
    """Generate LocalBusiness schema.

    The locality is the first known Saudi city name found in any page's
    text; when no page mentions one, it defaults to "الرياض".

    Args:
        audit: Dict with optional keys 'pages', 'org_name' and 'url'.

    Returns:
        A schema.org LocalBusiness dict with a PostalAddress in "SA".
    """
    pages = audit.get('pages', [])
    org_name = audit.get('org_name', 'Company')
    url = audit.get('url', '')

    locality = "الرياض"
    for page in pages:
        text = page.get('text') or ''
        found = next((city for city in _SAUDI_CITIES if city in text), None)
        if found is not None:
            locality = found
            break

    return {
        "@context": "https://schema.org",
        "@type": "LocalBusiness",
        "name": org_name,
        "url": url,
        "address": {
            "@type": "PostalAddress",
            "addressCountry": "SA",
            "addressLocality": locality
        }
    }


def generate_all_schemas(audit: Dict) -> List[Dict]:
    """Generate all applicable schemas for a website.

    Args:
        audit: Dict with optional keys 'pages', 'org_name' and 'url'.

    Returns:
        Organization, WebSite and LocalBusiness schemas, in that order,
        followed by a BreadcrumbList when ``audit['url']`` is non-empty.
    """
    schemas = [
        generate_organization_schema(audit),
        generate_website_schema(audit),
        generate_local_business_schema(audit),
    ]

    url = audit.get('url', '')
    if url:
        schemas.append(generate_breadcrumb_schema(url))

    return schemas


def format_schema_for_html(schemas: List[Dict]) -> str:
    """Format schemas as HTML script tags.

    Args:
        schemas: Schema dicts; falsy entries (None, {}) are skipped.

    Returns:
        One ``<script type="application/ld+json">`` element per schema,
        separated by blank lines ('' when there is nothing to emit).
    """
    html_parts = []
    for schema in schemas:
        if not schema:
            continue
        json_str = json.dumps(schema, ensure_ascii=False, indent=2)
        # Schema values may come from scraped page text. Escaping "</" keeps
        # a stray "</script>" from closing the element early; "\/" is a
        # valid JSON escape, so the payload parses to the same data.
        json_str = json_str.replace('</', '<\\/')
        html_parts.append(
            f'<script type="application/ld+json">\n{json_str}\n</script>'
        )

    return '\n\n'.join(html_parts)


def get_schema_recommendations(audit: Dict) -> List[Dict]:
    """Get recommendations for missing schemas.

    Args:
        audit: Dict with optional keys 'pages', 'org_name' and 'url'; pages
            may carry 'html' and 'url' values.

    Returns:
        Recommendation dicts with keys 'type', 'priority', 'title',
        'description' and 'code'. An Organization entry is added when no
        page's HTML already declares one, a FAQ entry when any page URL
        contains "faq", and a BreadcrumbList entry always.
    """
    recommendations = []
    pages = audit.get('pages', [])

    # The regex tolerates any whitespace around the colon in the markup.
    has_org_schema = any(
        _ORG_SCHEMA_RE.search(page.get('html') or '') for page in pages
    )

    if not has_org_schema:
        recommendations.append({
            'type': 'organization',
            'priority': 'high',
            'title': 'أضف Organization Schema',
            'description': 'يساعد محركات البحث على فهم معلومات شركتك',
            'code': json.dumps(
                generate_organization_schema(audit),
                ensure_ascii=False,
                indent=2,
            )
        })

    has_faq = any('faq' in (page.get('url') or '').lower() for page in pages)
    if has_faq:
        recommendations.append({
            'type': 'faq',
            'priority': 'medium',
            'title': 'أضف FAQPage Schema',
            'description': 'يظهر الأسئلة الشائعة مباشرة في نتائج البحث',
            'code': 'استخدم generate_faq_schema() مع قائمة الأسئلة'
        })

    recommendations.append({
        'type': 'breadcrumb',
        'priority': 'medium',
        'title': 'أضف BreadcrumbList Schema',
        'description': 'يحسن التنقل في نتائج البحث',
        'code': json.dumps(
            generate_breadcrumb_schema(audit.get('url', '')),
            ensure_ascii=False,
            indent=2,
        )
    })

    return recommendations