File size: 1,726 Bytes
a74b879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
import spacy
from collections import Counter

# Module-level cache for the loaded spaCy pipeline; filled lazily by load_nlp().
_nlp = None

def load_nlp():
    """Return the cached spaCy ``en_core_web_sm`` pipeline, loading it on first use.

    The pipeline is loaded at most once per process and stored in the
    module-level ``_nlp`` variable; later calls return the same object.

    Returns:
        The object returned by ``spacy.load("en_core_web_sm")``.

    Raises:
        RuntimeError: If the model cannot be found on this machine. The
            original ``OSError`` from spaCy is attached as ``__cause__``.
    """
    global _nlp
    if _nlp is None:
        try:
            _nlp = spacy.load("en_core_web_sm")
        except OSError as err:
            # spaCy signals a missing/uninstalled model with OSError; only that
            # case is translated into the "please download" hint. Any other
            # failure propagates unchanged so it is not misreported as a
            # missing model.
            raise RuntimeError("spaCy model 'en_core_web_sm' not found. Run: python -m spacy download en_core_web_sm") from err
    return _nlp

def heading_hierarchy_ok(headings):
    """Check that the heading sequence never skips a level going deeper.

    This is a simple heuristic: walking the headings in order, each heading
    may be at most one level deeper than the one before it (``h2`` -> ``h3``
    is fine, ``h2`` -> ``h4`` is a skip). Moving back up by any amount is
    allowed.

    Args:
        headings: Iterable of mappings, each with a ``'tag'`` string entry.
            Only the exact lowercase tags ``h1`` .. ``h6`` are treated as
            headings; every other tag (including ``hr``, ``header``,
            ``html``) is ignored.

    Returns:
        ``False`` if there are no headings or a level is skipped,
        ``True`` otherwise.
    """
    levels = []
    for h in headings:
        tag = h['tag']
        # Match exactly h1..h6. A bare startswith('h') test would also let
        # through tags such as 'hr' or 'header', whose second character is
        # not a digit and would make int() raise.
        if len(tag) == 2 and tag[0] == 'h' and tag[1] in '123456':
            levels.append(int(tag[1]))
    if not levels:
        return False
    # Compare each heading with its predecessor; a jump of more than one
    # level downward is a skipped level.
    return all(cur - prev <= 1 for prev, cur in zip(levels, levels[1:]))

def paragraph_density(paragraphs):
    """Summarise paragraph length in whitespace-separated words.

    Args:
        paragraphs: Iterable of paragraph strings.

    Returns:
        A dict with ``'avg_words'`` (mean words per paragraph, ``0`` when
        there are no paragraphs) and ``'paras'`` (number of paragraphs).
    """
    word_totals = list(map(lambda para: len(para.split()), paragraphs))
    n_paras = len(word_totals)
    # Guard the division: with no paragraphs the average is reported as 0.
    mean_words = sum(word_totals) / n_paras if n_paras else 0
    return {'avg_words': mean_words, 'paras': n_paras}

def extract_entities(text):
    """Run the shared NLP pipeline over *text* and collect its entities.

    Args:
        text: The text to analyse.

    Returns:
        A dict with ``'entities'`` (a list of ``{'text', 'label'}`` dicts, one
        per item in the processed document's ``ents``) and ``'summary'``
        (a plain dict mapping each label to how many times it occurred).
    """
    parsed = load_nlp()(text)
    found = []
    label_counts = Counter()
    for span in parsed.ents:
        label = span.label_
        found.append({'text': span.text, 'label': label})
        label_counts[label] += 1
    return {'entities': found, 'summary': dict(label_counts)}

def audit_page(page):
    """Build an audit report for a single page.

    Args:
        page: Mapping describing the page. ``'url'`` is required; ``'title'``,
            ``'headings'`` and ``'paragraphs'`` are optional and default to
            an empty string / empty lists.

    Returns:
        A dict with the page ``'url'`` and ``'title'``, the heading check
        result under ``'headings_ok'``, paragraph statistics under
        ``'density'`` and the entity extraction result under ``'entities'``.

    Raises:
        KeyError: If ``page`` has no ``'url'`` entry.
    """
    paras = page.get('paragraphs', [])
    hierarchy_valid = heading_hierarchy_ok(page.get('headings', []))
    para_stats = paragraph_density(paras)

    # Only the first 20000 characters of the joined text go to entity extraction.
    excerpt = "\n\n".join(paras)[:20000]
    if excerpt:
        ner = extract_entities(excerpt)
    else:
        # Nothing to analyse: skip the NLP pipeline and report an empty result.
        ner = {'entities': [], 'summary': {}}

    return dict(
        url=page['url'],
        title=page.get('title', ''),
        headings_ok=hierarchy_valid,
        density=para_stats,
        entities=ner,
    )