Spaces:
Sleeping
Sleeping
| """Section parsing and processing utilities.""" | |
| import json | |
| import re | |
| from collections import OrderedDict | |
| from pathlib import Path | |
| from typing import Dict, List | |
| from ftfy import fix_text | |
| from .text import normalize_bullets, tag_contacts | |
def _coerce_sections(data: object) -> List[Dict[str, str]]:
    """Coerce a decoded JSON value into a list of {'title', 'body'} dicts.

    Non-dict items are silently skipped; title/body are stringified and
    stripped of surrounding whitespace.
    """
    out: List[Dict[str, str]] = []
    if isinstance(data, list):
        for item in data:
            if isinstance(item, dict):
                out.append(
                    {
                        "title": str(item.get("title", "")).strip(),
                        "body": str(item.get("body", "")).strip(),
                    }
                )
    return out


def _json_candidates(text: str):
    """Yield strings that might decode to the expected JSON array.

    First the raw text itself, then (if present) the first bracketed
    object-array substring — handles responses that wrap the JSON in prose.
    """
    yield text
    m = re.search(r"\[\s*\{[\s\S]*\}\s*\]", text)
    if m:
        yield m.group(0)


def parse_sections_from_json_text(text: str) -> List[Dict[str, str]]:
    """
    Parse STRICT JSON from the API response.

    Attempts direct JSON parsing first, then falls back to
    extracting a JSON array from surrounding text.

    Args:
        text: Raw text that should contain a JSON array.

    Returns:
        List of section dicts with 'title' and 'body' keys; empty list
        when no parseable JSON array is found.
    """
    for candidate in _json_candidates(text):
        try:
            data = json.loads(candidate)
        except ValueError:
            # json.JSONDecodeError subclasses ValueError; anything else
            # (e.g. a non-str input) should surface, not be swallowed.
            continue
        if isinstance(data, list):
            return _coerce_sections(data)
    return []
def normalize_sections(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Normalize text encoding with ftfy (fixes mojibake, etc.)."""

    def _clean(value) -> str:
        # Strip first, then let ftfy repair encoding artifacts.
        return fix_text((value or "").strip())

    return [
        {"title": _clean(section.get("title")), "body": _clean(section.get("body"))}
        for section in sections
    ]
def merge_duplicate_titles(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Merge sections with duplicate titles while preserving order.

    Later bodies under an already-seen title are appended with a blank
    line between them; empty bodies never overwrite existing content.
    """
    combined: "OrderedDict[str, str]" = OrderedDict()
    for section in sections:
        key = section.get("title", "").strip()
        text = (section.get("body", "") or "").strip()
        if key not in combined:
            combined[key] = text
            continue
        if not text:
            continue
        existing = combined[key]
        combined[key] = f"{existing}\n\n{text}".strip() if existing else text
    return [{"title": key, "body": body} for key, body in combined.items()]
def build_contact_section_from_filename(pdf_file: Path) -> Dict[str, str]:
    """
    Create a simple 'Adresse' section based on the PDF filename.

    Useful as a fallback when contact info isn't parsed from the document.
    """
    raw = pdf_file.stem.replace("_", " ").strip()
    pieces = raw.split(maxsplit=1)
    # A lone single-letter first token looks like a filing prefix (e.g. an
    # initial) rather than part of the name — drop it.
    leading_initial = bool(pieces) and len(pieces[0]) == 1 and pieces[0].isalpha()
    if leading_initial:
        raw = pieces[1] if len(pieces) > 1 else ""
    display = raw.strip() or pdf_file.name
    return {"title": "Adresse", "body": f"Name: {display}"}
def process_section(section: Dict[str, str]) -> Dict[str, str]:
    """Normalize bullets and tag contact info for a single section."""
    # Same pipeline for both fields: bullets first, then contact tagging.
    return {
        field: tag_contacts(normalize_bullets(section.get(field, "")))
        for field in ("title", "body")
    }
def apply_postprocessing(sections: List[Dict[str, str]]) -> List[Dict[str, str]]:
    """Apply bullet normalization and contact tagging to all sections."""
    return list(map(process_section, sections))