| import spacy |
| from collections import Counter |
|
|
# Module-level cache for the spaCy pipeline; populated lazily by load_nlp().
_nlp = None
|
|
def load_nlp():
    """Return the shared spaCy pipeline, loading it on first use.

    The loaded pipeline is cached in the module-level ``_nlp`` so the
    (expensive) model load happens at most once per process.

    Raises:
        RuntimeError: if the 'en_core_web_sm' model is not installed,
            with the original load error attached as ``__cause__``.
    """
    global _nlp
    if _nlp is None:
        try:
            _nlp = spacy.load("en_core_web_sm")
        except OSError as exc:
            # spacy.load raises OSError when the model package is missing.
            # Chain the original error so the real cause isn't lost.
            raise RuntimeError("spaCy model 'en_core_web_sm' not found. Run: python -m spacy download en_core_web_sm") from exc
    return _nlp
|
|
def heading_hierarchy_ok(headings):
    """Check that heading levels never skip downward by more than one step.

    Args:
        headings: iterable of dicts with a 'tag' key (e.g. {'tag': 'h2'}).

    Returns:
        True when every transition increases the level by at most 1
        (e.g. h1 -> h2 is fine, h1 -> h3 is not; moving back up by any
        amount is allowed). False when no valid heading tags are present.
    """
    # Bug fix: the previous filter used startswith('h'), which also matched
    # non-heading tags such as 'hr' or 'header' and then crashed on
    # int('r'). Accept exactly h1..h6.
    levels = [
        int(h['tag'][1])
        for h in headings
        if len(h['tag']) == 2 and h['tag'][0] == 'h' and h['tag'][1] in '123456'
    ]
    if not levels:
        return False
    prev = levels[0]
    for lv in levels[1:]:
        if lv - prev > 1:
            return False
        prev = lv
    return True
|
|
def paragraph_density(paragraphs):
    """Summarize paragraph sizes.

    Args:
        paragraphs: list of paragraph strings.

    Returns:
        dict with 'avg_words' (mean whitespace-separated word count,
        0 when there are no paragraphs) and 'paras' (paragraph count).
    """
    word_counts = [len(text.split()) for text in paragraphs]
    if word_counts:
        total = len(word_counts)
        return { 'avg_words': sum(word_counts) / total, 'paras': total }
    return { 'avg_words': 0, 'paras': 0 }
|
|
def extract_entities(text):
    """Run NER over *text* with the shared spaCy pipeline.

    Args:
        text: the text to analyze.

    Returns:
        dict with 'entities' (list of {'text', 'label'} dicts, one per
        recognized entity span) and 'summary' (label -> occurrence count).
    """
    pipeline = load_nlp()
    parsed = pipeline(text)
    entities = []
    for span in parsed.ents:
        entities.append({ 'text': span.text, 'label': span.label_ })
    label_counts = Counter(entity['label'] for entity in entities)
    return { 'entities': entities, 'summary': dict(label_counts) }
|
|
def audit_page(page):
    """Build a content-audit report for a scraped page.

    Args:
        page: dict with a required 'url' key and optional 'title',
            'headings' (list of {'tag': ...} dicts), and 'paragraphs'
            (list of strings). NOTE(review): a missing 'url' raises
            KeyError — presumably intentional, verify against callers.

    Returns:
        dict with 'url', 'title', 'headings_ok', 'density', and 'entities'.
    """
    paragraphs = page.get('paragraphs', [])
    # Cap the NER input at 20k chars to bound processing cost.
    blob = "\n\n".join(paragraphs)[:20000]
    if blob:
        entity_report = extract_entities(blob)
    else:
        entity_report = { 'entities': [], 'summary': {} }
    return {
        'url': page['url'],
        'title': page.get('title', ''),
        'headings_ok': heading_hierarchy_ok(page.get('headings', [])),
        'density': paragraph_density(paragraphs),
        'entities': entity_report,
    }
|
|