# Provenance: owenkaplinsky — "Clean initial commit for HuggingFace" (commit 363cda9).
# NOTE: the hosting-page chrome ("raw / history / blame", "3.93 kB") was plain text
# pasted above the module and has been folded into this comment so the file parses.
"""Section parsing and processing utilities."""
import json
import re
from collections import OrderedDict
from pathlib import Path
from typing import Dict, List
from ftfy import fix_text
from .text import normalize_bullets, tag_contacts
def _loads_or_none(candidate: str):
    """Parse *candidate* as JSON, returning None instead of raising on failure.

    json.loads raises json.JSONDecodeError (a ValueError subclass) on malformed
    text and TypeError on non-string input; anything else should propagate.
    """
    try:
        return json.loads(candidate)
    except (ValueError, TypeError):
        return None


def _coerce_items(data: list) -> List[Dict[str, str]]:
    """Convert a parsed JSON list into section dicts, skipping non-dict items."""
    return [
        {
            "title": str(item.get("title", "")).strip(),
            "body": str(item.get("body", "")).strip(),
        }
        for item in data
        if isinstance(item, dict)
    ]


def parse_sections_from_json_text(text: str) -> List[Dict[str, str]]:
    """
    Parse STRICT JSON from the API response.

    Attempts direct JSON parsing first, then falls back to
    extracting a JSON array from surrounding text (models often wrap
    the payload in prose or code fences).

    Args:
        text: Raw text that should contain a JSON array.

    Returns:
        List of section dicts with 'title' and 'body' keys; empty list
        when no JSON array of objects can be recovered.
    """
    # Direct parse: only a top-level list short-circuits; any other JSON
    # value (dict, string, ...) still gets the regex fallback below.
    data = _loads_or_none(text)
    if isinstance(data, list):
        return _coerce_items(data)

    # Fallback: grab the widest span that looks like a JSON array of objects.
    m = re.search(r"\[\s*\{[\s\S]*\}\s*\]", text)
    if m:
        data = _loads_or_none(m.group(0))
        if isinstance(data, list):
            return _coerce_items(data)
    return []
def normalize_sections(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Normalize text encoding with ftfy (fixes mojibake, etc.).

    Each section's title and body is stripped and run through
    ``ftfy.fix_text``; ``None`` values are treated as empty strings.
    """
    return [
        {
            "title": fix_text((entry.get("title") or "").strip()),
            "body": fix_text((entry.get("body") or "").strip()),
        }
        for entry in sections
    ]
def merge_duplicate_titles(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Merge sections with duplicate titles while preserving order.

    Later bodies for an already-seen title are appended with a blank line
    between them; empty bodies are ignored rather than adding separators.
    """
    buckets: "OrderedDict[str, str]" = OrderedDict()
    for entry in sections:
        key = entry.get("title", "").strip()
        text = (entry.get("body", "") or "").strip()
        if key not in buckets:
            # First occurrence wins the slot (even with an empty body).
            buckets[key] = text
            continue
        if not text:
            continue
        existing = buckets[key]
        buckets[key] = (f"{existing}\n\n{text}" if existing else text).strip()
    return [{"title": key, "body": body} for key, body in buckets.items()]
def build_contact_section_from_filename(pdf_file: Path) -> Dict[str, str]:
    """
    Create a simple 'Adresse' section based on the PDF filename.

    Useful as a fallback when contact info isn't parsed from the document.
    Underscores become spaces, and a leading single-letter token (e.g. an
    initial or an index letter) is dropped.
    """
    base = pdf_file.stem.replace("_", " ").strip()
    parts = base.split(maxsplit=1)
    if parts and parts[0].isalpha() and len(parts[0]) == 1:
        base = parts[1] if len(parts) > 1 else ""
    # Fall back to the full filename when nothing usable remains.
    display = base.strip() or pdf_file.name
    return {"title": "Adresse", "body": f"Name: {display}"}
def process_section(section: Dict[str, str]) -> Dict[str, str]:
    """Normalize bullets and tag contact info for a single section.

    Both the title and the body pass through ``normalize_bullets`` first,
    then ``tag_contacts``; missing keys default to empty strings.
    """
    return {
        field: tag_contacts(normalize_bullets(section.get(field, "")))
        for field in ("title", "body")
    }
def apply_postprocessing(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Apply bullet normalization and contact tagging to all sections."""
    return list(map(process_section, sections))