Spaces:

Alinabil1
/

last_edit

Sleeping

File size: 13,737 Bytes

a74b879

import os
import json
import hashlib
import re
from typing import List

try:
    import openai
except Exception:
    openai = None

try:
    from groq import Groq
except Exception:
    Groq = None

try:
    from langdetect import detect
except Exception:
    detect = None

# When the optional `openai` import succeeded, seed the module-level key from
# the environment. analyze_with_openai additionally passes a key explicitly to
# the client it constructs.
if openai is not None:
    openai.api_key = os.getenv('OPENAI_API_KEY')

# Chat model requested by analyze_with_openai; override with the OPENAI_MODEL
# environment variable.
DEFAULT_MODEL = os.getenv('OPENAI_MODEL', 'gpt-4o-mini')

# Bilingual recommendation texts, keyed first by language code ('ar' / 'en')
# and then by recommendation id. Both languages expose the same ids:
#   headings, density, entities, faq, ai_visibility  -> site-wide actions
#   h1_missing, short_paras                          -> per-page suggestions
#   thin_content                                     -> not referenced in the code visible here
# These strings are emitted at runtime by generate_recommendations.
RECS_CONTENT = {
    'ar': {
        'headings': 'تحسين تسلسل العناوين: تأكد من وجود H1 واحد فقط واستخدام H2 ثم H3 بشكل منطقي لتسهيل الفهرسة.',
        'density': 'زيادة عمق المحتوى: استهدف 40-120 كلمة في الفقرات الرئيسية مع تقديم إجابات مباشرة وسهلة القراءة.',
        'entities': 'إضافة الكيانات المسماة: اذكر أسماء المنظمة، الأشخاص، والمنتجات بوضوح واربطها ببيانات Schema.',
        'faq': 'إنشاء صفحات الأسئلة والأجوبة (FAQ): أضف قسم للأسئلة الشائعة باستخدام JSON-LD لزيادة فرص الظهور في المحركات التوليدية.',
        'ai_visibility': 'تحسين الظهور في الذكاء الاصطناعي: أنشئ محتوى تعريفياً قصيراً (Definitional Content) يسهل على النماذج مثل ChatGPT وPerplexity اقتباسه.',
        'h1_missing': 'إضافة عنوان H1: أضف عنواناً رئيسياً واضحاً يحتوي على الكلمة المفتاحية واسم العلامة التجارية.',
        'short_paras': 'تطوير الفقرات: اجعل الفقرة الأولى تبدأ بإجابة مباشرة ومختصرة لزيادة احتمالية الاقتباس.',
        'thin_content': 'محتوى ضيق: أضف فقرات تعريفية وبيانات منظمة للمؤسسة (Organization Schema).',
    },
    'en': {
        'headings': 'Fix heading hierarchy: Ensure one H1 per page and incremental H2 → H3 structure for better indexing.',
        'density': 'Increase content depth: Aim for 40–120 words in core paragraphs with direct, readable answers.',
        'entities': 'Add named entities: Clearly mention Organizations, People, and Products and link them via Schema.',
        'faq': 'Create FAQ sections: Use FAQPage JSON-LD to increase chances of being featured in AI search results.',
        'ai_visibility': 'Optimize for AI: Create short, authoritative definitions of your services to encourage LLM citations.',
        'h1_missing': 'Add H1 heading: Ensure a clear H1 containing your primary keyword and brand name.',
        'short_paras': 'Expand paragraphs: Lead with a one-sentence "direct answer" to improve AI extraction likelihood.',
        'thin_content': 'Thin content: Add a definitional paragraph and an Organization JSON-LD block.',
    }
}

def _is_arabic(text: str) -> bool:
    """Tell whether *text* contains at least one character in U+0600..U+06FF.

    Empty or otherwise falsy input is reported as not Arabic.
    """
    return bool(text) and re.search(r'[\u0600-\u06FF]', text) is not None

def _build_prompt(pages: List[dict]):
    """Render crawled pages as one plain-text prompt.

    Each page contributes a TITLE / URL / TEXT section terminated by ``---``.
    The title falls back to the URL when missing, and TEXT holds only the
    first paragraph (empty when the page has none). Sections are joined with
    newlines.
    """
    def _section(page: dict) -> str:
        url = page.get('url')
        heading = page.get('title') or url
        paragraphs = page.get('paragraphs') or [None]
        excerpt = paragraphs[0] or ''
        return "TITLE: {}\nURL: {}\nTEXT: {}\n---".format(heading, url, excerpt)

    return "\n".join(map(_section, pages))

def _cache_key_for_prompt(prompt: str, prefix: str = 'openai') -> str:
    """Build a cache key of the form ``<prefix>:<sha256 hex digest of prompt>``.

    The prompt is hashed as UTF-8 bytes.
    """
    hasher = hashlib.sha256()
    hasher.update(prompt.encode('utf-8'))
    return "{}:{}".format(prefix, hasher.hexdigest())

def analyze_with_openai(pages: List[dict], api_key: str = None):
    """Ask an OpenAI chat model for a JSON summary of the crawled pages.

    Args:
        pages: Crawled page dicts; they are rendered with ``_build_prompt``.
        api_key: Explicit API key. When omitted or empty, the
            ``OPENAI_API_KEY`` environment variable is used instead.

    Returns:
        A dict that always carries an ``enabled`` flag:

        * ``{'enabled': False, 'reason': ...}`` when no key is available or
          the optional ``openai`` package could not be imported.
        * ``{'enabled': True, 'result': parsed}`` when the reply is valid JSON.
        * ``{'enabled': True, 'raw': text}`` when the reply is not valid JSON.
        * ``{'enabled': False, 'error': str(exc)}`` when building the prompt
          or calling the API fails.
    """
    key = api_key or os.getenv('OPENAI_API_KEY')
    if not key:
        return {'enabled': False, 'reason': 'OPENAI_API_KEY not set'}

    # Mirror analyze_with_groq: report a missing client library explicitly
    # instead of surfacing an AttributeError message from `None.OpenAI`.
    if openai is None:
        return {'enabled': False, 'reason': 'openai client not installed'}

    system = (
        "You are an analytics assistant. Given crawled pages (title, url, text),"
        " produce a JSON object with keys: summary (one-paragraph), topics (array of top 6 topics),"
        " suggestions (array of action items). Return ONLY valid JSON."
    )

    try:
        messages = [
            {'role': 'system', 'content': system},
            {'role': 'user', 'content': _build_prompt(pages)}
        ]
        client = openai.OpenAI(api_key=key)
        resp = client.chat.completions.create(
            model=DEFAULT_MODEL,
            messages=messages,
            temperature=0.2,
            max_tokens=800
        )
        text = resp.choices[0].message.content
    except Exception as e:
        # Service boundary: any client/network failure is reported, not raised.
        return {'enabled': False, 'error': str(e)}

    try:
        parsed = json.loads(text)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers a None reply.
        return {'enabled': True, 'raw': text}
    return {'enabled': True, 'result': parsed}

def analyze_with_groq(pages: List[dict], api_key: str = None):
    """Ask a Groq-hosted chat model for a JSON analysis of the crawled pages.

    Args:
        pages: Crawled page dicts; they are rendered with ``_build_prompt``.
        api_key: Explicit API key. When omitted or empty, the
            ``GROQ_API_KEY`` environment variable is used instead.

    Returns:
        A dict that always carries an ``enabled`` flag:

        * ``{'enabled': False, 'reason': ...}`` when the optional ``groq``
          package is unavailable or no key is set.
        * ``{'enabled': True, 'result': parsed}`` when the reply is valid JSON.
        * ``{'enabled': True, 'raw': text}`` when the reply is not valid JSON.
        * ``{'enabled': False, 'error': str(exc)}`` when building the prompt
          or calling the API fails.

    The model name comes from the ``GROQ_MODEL`` environment variable and
    defaults to ``llama-3.1-8b-instant``.
    """
    if Groq is None:
        return {'enabled': False, 'reason': 'groq client not installed'}

    groq_key = api_key or os.getenv('GROQ_API_KEY')
    if not groq_key:
        return {'enabled': False, 'reason': 'GROQ_API_KEY not set'}

    try:
        client = Groq(api_key=groq_key)
        prompt = _build_prompt(pages)

        completion = client.chat.completions.create(
            model=os.getenv('GROQ_MODEL', 'llama-3.1-8b-instant'),
            messages=[
                {'role': 'system', 'content': 'You are an analytics assistant producing JSON output.'},
                {'role': 'user', 'content': prompt}
            ],
            temperature=0.2,
            max_completion_tokens=2048,
            stream=False
        )
        text = completion.choices[0].message.content
    except Exception as e:
        # Service boundary: any client/network failure is reported, not raised.
        return {'enabled': False, 'error': str(e)}

    try:
        parsed = json.loads(text)
    except (TypeError, ValueError):
        # ValueError covers json.JSONDecodeError; TypeError covers a None reply.
        return {'enabled': True, 'raw': text}
    return {'enabled': True, 'result': parsed}

def analyze_pages(pages: List[dict], api_keys: dict = None):
    """Run both LLM analyses over *pages* and collect them by provider name.

    ``api_keys`` may supply per-provider keys under ``'openai'`` and
    ``'groq'``; missing entries are passed on as ``None``. The returned dict
    has the keys ``'openai'`` and ``'groq'``, in that order.
    """
    keys = api_keys if api_keys else {}
    return {
        'openai': analyze_with_openai(pages, api_key=keys.get('openai')),
        'groq': analyze_with_groq(pages, api_key=keys.get('groq')),
    }

def compute_geo_score(pages: List[dict], audit: dict = None, ai_visibility: dict = None):
    """Compute a GEO visibility score (0-100) from crawled pages.

    The score is the sum of five components worth up to 20 points each:

    * headings      - share of pages that contain an ``h1`` heading
    * density       - per-page average words per paragraph, scaled so that
                      40 or more words earns full credit
    * entities      - 20 if any entities were counted, else 0
    * faq           - ``h3`` headings on pages that have paragraphs, capped at
                      one per page
    * ai_visibility - share of AI query results flagged ``mentioned``

    Args:
        pages: Page dicts. ``headings`` (list of dicts with a ``tag``) and
            ``paragraphs`` (list) are read; either may be missing or None.
        audit: Accepted for interface compatibility; not read here.
        ai_visibility: Optional dict; only used when its ``enabled`` value is
            truthy, in which case ``results`` is read.

    Returns:
        Dict with ``score``, ``status`` ('Elite' >= 85, 'Authority' >= 70,
        'Needs Work' >= 40, otherwise 'Critical'), a rounded ``breakdown``,
        page ``counts`` (critical / warnings / passed) and ``ai_radar_stats``.
    """
    pages = pages or []
    total_pages = max(1, len(pages))
    headings_ok_count = 0
    density_total = 0.0
    # NOTE(review): nothing in this function increments entity_counts, so the
    # entities component is always 0 - confirm where entities should come from.
    entity_counts = 0
    faq_count = 0
    critical_issues = 0
    warnings = 0
    passed = 0

    for p in pages:
        # `or []` also covers keys that are present but explicitly None.
        headings = p.get('headings') or []
        paras = p.get('paragraphs') or []
        avg = (sum(len(str(x).split()) for x in paras) / len(paras)) if paras else 0.0

        if any(h.get('tag', '') == 'h1' for h in headings):
            headings_ok_count += 1

        # Full credit from 40 words upward; proportional credit below that.
        density_total += min(1.0, avg / 40.0)

        if paras:
            faq_count += sum(1 for h in headings if h.get('tag') == 'h3')

        if not headings or avg < 20:
            critical_issues += 1
        elif avg < 40:
            warnings += 1
        else:
            passed += 1

    headings_score = float(headings_ok_count / total_pages) * 20.0
    density_score = float(density_total / total_pages) * 20.0
    entity_score = 20.0 if entity_counts > 0 else 0.0
    faq_score = float(min(faq_count, total_pages) / total_pages) * 20.0

    ai_score = 0.0
    mentions = 0
    total_q = 0
    if ai_visibility and ai_visibility.get('enabled'):
        res = ai_visibility.get('results') or []
        mentions = sum(1 for r in res if r.get('mentioned'))
        # Report the real number of queries; an empty result list is 0, not 1.
        total_q = len(res)
        if total_q:
            ai_score = (mentions / total_q) * 20

    raw_score = headings_score + density_score + entity_score + faq_score + ai_score
    score = int(round(min(raw_score, 100)))
    if score >= 85:
        status = 'Elite'
    elif score >= 70:
        status = 'Authority'
    elif score >= 40:
        status = 'Needs Work'
    else:
        status = 'Critical'

    return {
        'score': score,
        'status': status,
        'breakdown': {
            'headings': int(round(headings_score)),
            'density': int(round(density_score)),
            'entities': int(round(entity_score)),
            'faq': int(round(faq_score)),
            'ai_visibility': int(round(ai_score)),
        },
        'counts': {
            'critical': critical_issues,
            'warnings': warnings,
            'passed': passed
        },
        'ai_radar_stats': {
            'mentions': mentions,
            'total_queries': total_q,
            'visibility_percent': int((mentions / total_q) * 100) if total_q > 0 else 0
        }
    }

def infer_brand_name(pages: List[dict]) -> str:
    """Guess the site's brand name from crawled pages.

    The first five pages are inspected in order. For each page the
    candidates are tried in this order:

    1. ``meta['og:site_name']`` or ``meta['application-name']``
    2. the text of an ``h1`` heading shorter than 60 characters
    3. the leading segment of the page title (before ``|``, ``—``, ``»`` or a
       space-delimited ``-``), shorter than 60 characters

    Generic values such as "home" or "company" are skipped. If no page
    yields a candidate, the host of the first page's URL is used with a
    leading ``www.`` and the last dot-suffix removed, separators turned into
    spaces, and title-cased. Only the final label is stripped, so
    ``example.co.uk`` becomes ``Example.Co``.

    Args:
        pages: Crawled page dicts (``meta``, ``headings``, ``title``, ``url``
            are read; any of them may be missing or None).

    Returns:
        The inferred brand name, or ``"Company"`` when nothing usable exists.
    """
    if not pages:
        return "Company"

    for page in pages[:5]:
        meta = page.get('meta') or {}
        # Strip before comparing so " Company " or blank values are rejected.
        og_site = str(meta.get('og:site_name') or meta.get('application-name') or '').strip()
        if og_site and og_site.lower() not in ('company', 'website', 'home'):
            return og_site

        for h in page.get('headings') or []:
            if h.get('tag') == 'h1':
                txt = str(h.get('text') or '').strip()
                if txt and len(txt) < 60 and txt.lower() not in ('home', 'welcome', 'company'):
                    return txt

        title_str = str(page.get('title') or '').strip()
        if title_str:
            # A hyphen only separates when surrounded by whitespace, so
            # hyphenated names such as "Coca-Cola" stay intact.
            title = re.split(r'\s*[|—»]\s*|\s+-\s+', title_str)[0].strip()
            if title and len(title) < 60 and title.lower() not in ('home', 'welcome', 'company', 'homepage'):
                return title

    first_url = str(pages[0].get('url') or '').strip()
    if first_url:
        from urllib.parse import urlparse
        try:
            parsed = urlparse(first_url)
            # hostname drops any port/credentials; scheme-less URLs land in
            # path, where only the first segment can be the host.
            host = parsed.hostname or parsed.path.split('/')[0]
        except ValueError:
            host = ''
        domain_clean = re.sub(r'^www\.', '', host.lower())
        domain_clean = re.sub(r'\.[a-z]{2,}$', '', domain_clean)
        domain_clean = domain_clean.replace('-', ' ').replace('_', ' ').strip()
        if len(domain_clean) > 1:
            return domain_clean.title()

    return "Company"

def generate_recommendations(pages: List[dict], geo_score: dict = None, api_keys: dict = None, ai_analysis_results: dict = None, extra_context: dict = None):
    """Produce actionable recommendations based on pages and GEO score.

    The output language is Arabic when the title or first paragraph of any of
    the first three pages contains Arabic characters, otherwise English.

    Args:
        pages: Crawled page dicts (``title``, ``url``, ``headings`` and
            ``paragraphs`` are read; values may be missing or None).
        geo_score: Optional result whose ``breakdown`` drives the site-wide
            actions. A component below its threshold (12 for headings and
            density, 10 for entities, faq and ai_visibility) adds an action.
        api_keys: Accepted for interface compatibility; not read here.
        ai_analysis_results: Accepted for interface compatibility; not read here.
        extra_context: Accepted for interface compatibility; not read here.

    Returns:
        ``{'actions': [...], 'per_page': [...]}`` where each per-page entry
        has ``url``, ``title``, ``issues`` and ``suggestions``.
    """
    pages = pages or []
    recs = {
        'actions': [],
        'per_page': [],
    }

    # Detect language from a small sample. Values are coerced through an
    # f-string because a key may be present with a None or non-string value,
    # which would break plain string concatenation.
    is_ar = False
    for p in pages[:3]:
        paras = p.get('paragraphs') or []
        first_para = paras[0] if paras else ''
        sample_text = f"{p.get('title') or ''} {first_para or ''}"
        if _is_arabic(sample_text):
            is_ar = True
            break

    lang = 'ar' if is_ar else 'en'
    content = RECS_CONTENT[lang]

    # Site-wide actions: one per GEO component that falls below its threshold.
    if geo_score:
        breakdown = geo_score.get('breakdown') or {}
        thresholds = (
            ('headings', 12),
            ('density', 12),
            ('entities', 10),
            ('faq', 10),
            ('ai_visibility', 10),
        )
        for component, minimum in thresholds:
            if breakdown.get(component, 0) < minimum:
                recs['actions'].append(content[component])

    # Per-page recommendations
    for p in pages:
        page_rec = {
            'url': p.get('url'),
            'title': p.get('title'),
            'issues': [],
            'suggestions': []
        }

        tags = [h.get('tag', '') for h in (p.get('headings') or [])]
        if 'h1' not in tags:
            page_rec['issues'].append('Missing H1' if lang == 'en' else 'عنوان H1 مفقود')
            page_rec['suggestions'].append(content.get('h1_missing', 'Fix H1'))

        paras = p.get('paragraphs') or []
        avg = (sum(len(str(x).split()) for x in paras) / len(paras)) if paras else 0
        if avg < 30:
            page_rec['issues'].append('Short paragraphs' if lang == 'en' else 'فقرات قصيرة جداً')
            page_rec['suggestions'].append(content.get('short_paras', 'Expand paragraphs'))

        recs['per_page'].append(page_rec)

    return recs

def simulate_visibility(content: str, brand: str, api_keys: dict = None) -> dict:
    """Return a fixed placeholder AI-visibility prediction.

    None of the arguments are inspected; every call yields the same neutral
    result with a score of 50 and no suggested fixes.
    """
    prediction = dict(
        ai_visibility_score=50,
        sentiment='Neutral',
        entity_clarity='Medium',
        detailed_analysis='Content analysis available',
        suggested_fixes=[],
    )
    return prediction

def analyze_ai_visibility_deep(pages: List[dict], brand: str, api_keys: dict = None) -> dict:
    """Return a fixed placeholder for the deep sentiment/visibility report.

    None of the arguments are inspected. The result always contains the three
    sections ``sentiment_analysis``, ``shopping_visibility`` and
    ``context_analysis`` filled with neutral default values.
    """
    sentiment = {
        'sentiment_score': 0,
        'sentiment_label': 'Neutral',
        'recommendations': ['Improve content density to increase trust.'],
    }
    shopping = {
        'price_detected': False,
        'price_value': None,
        'rating_detected': False,
        'rating_value': 0,
    }
    context = {
        'scenario': 'General',
        'trigger': 'Unknown',
    }

    report = {}
    report['sentiment_analysis'] = sentiment
    report['shopping_visibility'] = shopping
    report['context_analysis'] = context
    return report