# Provenance: owenkaplinsky — "Clean initial commit for HuggingFace" (commit 363cda9).
# NOTE: the hosting-page chrome ("raw / history / blame", "3.93 kB") was plain text
# pasted above the module and has been folded into this comment so the file parses.
"""Section parsing and processing utilities."""
import json
import re
from collections import OrderedDict
from pathlib import Path
from typing import Dict, List
from ftfy import fix_text
from .text import normalize_bullets, tag_contacts
def _loads_or_none(candidate: str):
    """Parse *candidate* as JSON, returning None instead of raising on failure.

    json.loads raises json.JSONDecodeError (a ValueError subclass) on malformed
    text and TypeError on non-string input; anything else should propagate.
    """
    try:
        return json.loads(candidate)
    except (ValueError, TypeError):
        return None


def _coerce_items(data: list) -> List[Dict[str, str]]:
    """Convert a parsed JSON list into section dicts, skipping non-dict items."""
    return [
        {
            "title": str(item.get("title", "")).strip(),
            "body": str(item.get("body", "")).strip(),
        }
        for item in data
        if isinstance(item, dict)
    ]


def parse_sections_from_json_text(text: str) -> List[Dict[str, str]]:
    """
    Parse STRICT JSON from the API response.

    Attempts direct JSON parsing first, then falls back to
    extracting a JSON array from surrounding text (models often wrap
    the payload in prose or code fences).

    Args:
        text: Raw text that should contain a JSON array.

    Returns:
        List of section dicts with 'title' and 'body' keys; empty list
        when no JSON array of objects can be recovered.
    """
    # Direct parse: only a top-level list short-circuits; any other JSON
    # value (dict, string, ...) still gets the regex fallback below.
    data = _loads_or_none(text)
    if isinstance(data, list):
        return _coerce_items(data)

    # Fallback: grab the widest span that looks like a JSON array of objects.
    m = re.search(r"\[\s*\{[\s\S]*\}\s*\]", text)
    if m:
        data = _loads_or_none(m.group(0))
        if isinstance(data, list):
            return _coerce_items(data)
    return []
def normalize_sections(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Normalize text encoding with ftfy (fixes mojibake, etc.).

    Each section's title and body is stripped and run through
    ``ftfy.fix_text``; ``None`` values are treated as empty strings.
    """
    return [
        {
            "title": fix_text((entry.get("title") or "").strip()),
            "body": fix_text((entry.get("body") or "").strip()),
        }
        for entry in sections
    ]
def merge_duplicate_titles(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Merge sections with duplicate titles while preserving order.

    Later bodies for an already-seen title are appended with a blank line
    between them; empty bodies are ignored rather than adding separators.
    """
    buckets: "OrderedDict[str, str]" = OrderedDict()
    for entry in sections:
        key = entry.get("title", "").strip()
        text = (entry.get("body", "") or "").strip()
        if key not in buckets:
            # First occurrence wins the slot (even with an empty body).
            buckets[key] = text
            continue
        if not text:
            continue
        existing = buckets[key]
        buckets[key] = (f"{existing}\n\n{text}" if existing else text).strip()
    return [{"title": key, "body": body} for key, body in buckets.items()]
def build_contact_section_from_filename(pdf_file: Path) -> Dict[str, str]:
    """
    Create a simple 'Adresse' section based on the PDF filename.

    Useful as a fallback when contact info isn't parsed from the document.
    Underscores become spaces, and a leading single-letter token (e.g. an
    initial or an index letter) is dropped.
    """
    base = pdf_file.stem.replace("_", " ").strip()
    parts = base.split(maxsplit=1)
    if parts and parts[0].isalpha() and len(parts[0]) == 1:
        base = parts[1] if len(parts) > 1 else ""
    # Fall back to the full filename when nothing usable remains.
    display = base.strip() or pdf_file.name
    return {"title": "Adresse", "body": f"Name: {display}"}
def process_section(section: Dict[str, str]) -> Dict[str, str]:
    """Normalize bullets and tag contact info for a single section.

    Both the title and the body pass through ``normalize_bullets`` first,
    then ``tag_contacts``; missing keys default to empty strings.
    """
    return {
        field: tag_contacts(normalize_bullets(section.get(field, "")))
        for field in ("title", "body")
    }
def apply_postprocessing(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Apply bullet normalization and contact tagging to all sections."""
    return list(map(process_section, sections))