# server/schema_generator.py
# Moharek GEO Platform (deploy commit a74b879)
"""
Schema Generator
Automatically generates JSON-LD schema markup for websites
"""
import json
import re
from datetime import datetime
from typing import Dict, List, Optional
def generate_organization_schema(audit: Dict) -> Dict:
    """Generate a schema.org Organization JSON-LD object from audit data.

    Scans the crawled pages for contact e-mails, phone numbers and
    social-media profile links, then assembles the Organization schema.

    Args:
        audit: Audit result dict; reads ``pages`` (list of page dicts with
            ``text``/``paragraphs``/``links``), ``org_name`` and ``url``.
            Missing keys fall back to safe defaults.

    Returns:
        A dict ready to be serialized as JSON-LD.
    """
    import re

    pages = audit.get('pages', [])
    org_name = audit.get('org_name', 'Company')
    url = audit.get('url', '')

    # Compile once, outside the page loop.
    # Fixes vs. original: the e-mail TLD class is [A-Za-z]{2,} (the original
    # contained a stray '|'), and the phone country-code prefix is a
    # NON-capturing group — with a capturing group, re.findall() returned
    # only the prefix group (often ''), never the full phone number.
    email_re = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')
    phone_re = re.compile(r'(?:\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}')

    emails: List[str] = []
    phones: List[str] = []
    social_links: List[str] = []
    for page in pages:
        text = page.get('text', '') + ' ' + str(page.get('paragraphs', ''))
        emails.extend(email_re.findall(text))
        phones.extend(phone_re.findall(text))
        # Collect links that point at well-known social networks.
        for link in page.get('links', []):
            href = link.get('href', '') if isinstance(link, dict) else str(link)
            if any(social in href.lower() for social in ['facebook', 'twitter', 'instagram', 'linkedin', 'youtube']):
                social_links.append(href)

    schema = {
        "@context": "https://schema.org",
        "@type": "Organization",
        "name": org_name,
        "url": url,
        "logo": f"{url}/logo.png",
        "description": f"{org_name} - خدمات متميزة"
    }
    if emails:
        schema["email"] = emails[0]
    if phones:
        schema["telephone"] = phones[0]
    if social_links:
        # Deduplicate while preserving first-seen order — deterministic,
        # unlike the original list(set(...))[:5].
        schema["sameAs"] = list(dict.fromkeys(social_links))[:5]
    # Add a contact point, omitting fields we could not find instead of
    # emitting explicit nulls in the JSON-LD output.
    if emails or phones:
        contact = {
            "@type": "ContactPoint",
            "contactType": "customer service"
        }
        if emails:
            contact["email"] = emails[0]
        if phones:
            contact["telephone"] = phones[0]
        schema["contactPoint"] = contact
    return schema
def generate_faq_schema(faqs: List[Dict]) -> Optional[Dict]:
    """Generate a FAQPage JSON-LD schema from a list of Q/A dicts.

    Args:
        faqs: Each item may use the keys ``question``/``answer`` or the
            short forms ``q``/``a``. Items missing either part are skipped.

    Returns:
        The FAQPage schema dict, or ``None`` when the input is empty or no
        usable Q/A pair exists. (The original annotation claimed ``Dict``,
        but ``None`` was already a possible return value.)
    """
    if not faqs:
        return None
    entries = []
    for faq in faqs:
        question = faq.get('question', faq.get('q', ''))
        answer = faq.get('answer', faq.get('a', ''))
        # Skip incomplete pairs — a Question without an Answer is invalid.
        if question and answer:
            entries.append({
                "@type": "Question",
                "name": question,
                "acceptedAnswer": {
                    "@type": "Answer",
                    "text": answer
                }
            })
    if not entries:
        return None
    return {
        "@context": "https://schema.org",
        "@type": "FAQPage",
        "mainEntity": entries
    }
def generate_breadcrumb_schema(url: str) -> Dict:
    """Build a BreadcrumbList schema mirroring the URL's path hierarchy."""
    from urllib.parse import urlparse

    parsed = urlparse(url)
    base = f"{parsed.scheme}://{parsed.netloc}"
    segments = [seg for seg in parsed.path.split('/') if seg]

    # Position 1 is always the site root ("home").
    crumbs = [{
        "@type": "ListItem",
        "position": 1,
        "name": "الرئيسية",
        "item": base
    }]

    # Each path segment becomes the next breadcrumb; dashes/underscores in
    # the slug are turned into spaces and title-cased for the display name.
    running = base
    for position, segment in enumerate(segments, start=2):
        running = f"{running}/{segment}"
        crumbs.append({
            "@type": "ListItem",
            "position": position,
            "name": segment.replace('-', ' ').replace('_', ' ').title(),
            "item": running
        })

    return {
        "@context": "https://schema.org",
        "@type": "BreadcrumbList",
        "itemListElement": crumbs
    }
def generate_website_schema(audit: Dict) -> Dict:
    """Build a WebSite schema advertising an on-site SearchAction."""
    site_url = audit.get('url', '')
    site_name = audit.get('org_name', 'Company')

    # Sitelinks-searchbox action: the search URL template is derived from
    # the site root with a conventional /search?q= endpoint.
    search_action = {
        "@type": "SearchAction",
        "target": {
            "@type": "EntryPoint",
            "urlTemplate": f"{site_url}/search?q={{search_term_string}}"
        },
        "query-input": "required name=search_term_string"
    }

    return {
        "@context": "https://schema.org",
        "@type": "WebSite",
        "name": site_name,
        "url": site_url,
        "potentialAction": search_action
    }
def generate_article_schema(page: Dict, org_name: str) -> Dict:
    """Build an Article schema for a blog-style page.

    The publish date is the first ISO date (YYYY-MM-DD) found in the page
    text; when none is present, today's date is used as a fallback.
    """
    import re

    body_text = page.get('text', '')
    found = re.search(r'(\d{4}-\d{2}-\d{2})', body_text)
    published = found.group(1) if found else datetime.now().strftime('%Y-%m-%d')

    # The organization acts as both author and publisher.
    org_entity = {"@type": "Organization", "name": org_name}
    article = {
        "@context": "https://schema.org",
        "@type": "Article",
        "headline": page.get('title', ''),
        "url": page.get('url', ''),
        "datePublished": published,
        "dateModified": published,
        "author": dict(org_entity),
        "publisher": dict(org_entity)
    }

    # Attach the first usable image, whether given as a URL string or a dict.
    page_images = page.get('images', [])
    if page_images:
        head = page_images[0]
        src = head if isinstance(head, str) else head.get('src', '')
        if src:
            article["image"] = src
    return article
def generate_product_schema(page: Dict, org_name: str) -> Dict:
    """Build a Product schema with a best-effort price extracted from text."""
    import re

    body = page.get('text', '')
    product_name = page.get('title', '')

    # First "$ / SAR / ريال" amount in the text; "0" when nothing matches.
    hit = re.search(r'(\$|SAR|ريال)\s*(\d+(?:\.\d{2})?)', body)
    amount = hit.group(2) if hit else "0"
    currency_code = "SAR" if "ريال" in body or "SAR" in body else "USD"

    product = {
        "@context": "https://schema.org",
        "@type": "Product",
        "name": product_name,
        "url": page.get('url', ''),
        "description": body[:200] if body else product_name,
        "offers": {
            "@type": "Offer",
            "price": amount,
            "priceCurrency": currency_code,
            "availability": "https://schema.org/InStock",
            "seller": {
                "@type": "Organization",
                "name": org_name
            }
        }
    }

    # Attach the first usable image (URL string or dict with 'src').
    imgs = page.get('images', [])
    if imgs:
        first = imgs[0] if isinstance(imgs[0], str) else imgs[0].get('src', '')
        if first:
            product["image"] = first
    return product
def generate_local_business_schema(audit: Dict) -> Dict:
    """Build a LocalBusiness schema, guessing the locality from page text.

    Scans the crawled pages for well-known Saudi city names; the first
    match wins, and Riyadh is used as the fallback locality.
    """
    city_names = ['الرياض', 'جدة', 'مكة', 'المدينة', 'الدمام', 'الخبر', 'تبوك', 'أبها']

    mentioned = []
    for page in audit.get('pages', []):
        content = page.get('text', '')
        # At most one city per page — matches the original's early break.
        hit = next((city for city in city_names if city in content), None)
        if hit is not None:
            mentioned.append(hit)

    return {
        "@context": "https://schema.org",
        "@type": "LocalBusiness",
        "name": audit.get('org_name', 'Company'),
        "url": audit.get('url', ''),
        "address": {
            "@type": "PostalAddress",
            "addressCountry": "SA",
            "addressLocality": mentioned[0] if mentioned else "الرياض"
        }
    }
def generate_all_schemas(audit: Dict) -> List[Dict]:
    """Assemble every schema that applies to the audited site.

    Organization, WebSite and LocalBusiness schemas are always included;
    a BreadcrumbList is appended only when the audit carries a URL.
    """
    schemas = [
        generate_organization_schema(audit),
        generate_website_schema(audit),
        generate_local_business_schema(audit),
    ]
    site_url = audit.get('url', '')
    if site_url:
        schemas.append(generate_breadcrumb_schema(site_url))
    return schemas
def format_schema_for_html(schemas: List[Dict]) -> str:
    """Render the given schemas as <script type="application/ld+json"> tags.

    Falsy entries (e.g. ``None`` from generators with nothing to emit)
    are skipped; tags are separated by a blank line.
    """
    tags = [
        '<script type="application/ld+json">\n'
        + json.dumps(schema, ensure_ascii=False, indent=2)
        + '\n</script>'
        for schema in schemas
        if schema
    ]
    return '\n\n'.join(tags)
def get_schema_recommendations(audit: Dict) -> List[Dict]:
    """Recommend JSON-LD schemas that appear to be missing from the site.

    Checks the crawled HTML for an existing Organization schema, flags
    FAQ-looking URLs that could carry FAQPage markup, and always suggests
    a BreadcrumbList.
    """
    pages = audit.get('pages', [])
    recommendations = []

    # Organization: only recommend when no crawled page already embeds one
    # (both compact and spaced JSON serializations are checked).
    org_markers = ('"@type":"Organization"', '"@type": "Organization"')
    already_has_org = any(
        any(marker in page.get('html', '') for marker in org_markers)
        for page in pages
    )
    if not already_has_org:
        recommendations.append({
            'type': 'organization',
            'priority': 'high',
            'title': 'أضف Organization Schema',
            'description': 'يساعد محركات البحث على فهم معلومات شركتك',
            'code': json.dumps(generate_organization_schema(audit), ensure_ascii=False, indent=2)
        })

    # FAQPage: suggested when any crawled URL looks like an FAQ page.
    if any('faq' in page.get('url', '').lower() for page in pages):
        recommendations.append({
            'type': 'faq',
            'priority': 'medium',
            'title': 'أضف FAQPage Schema',
            'description': 'يظهر الأسئلة الشائعة مباشرة في نتائج البحث',
            'code': 'استخدم generate_faq_schema() مع قائمة الأسئلة'
        })

    # BreadcrumbList: always recommended.
    recommendations.append({
        'type': 'breadcrumb',
        'priority': 'medium',
        'title': 'أضف BreadcrumbList Schema',
        'description': 'يحسن التنقل في نتائج البحث',
        'code': json.dumps(generate_breadcrumb_schema(audit.get('url', '')), ensure_ascii=False, indent=2)
    })
    return recommendations