Spaces:

aetherbase
/

llm-ready-data

Running

llm-ready-data / app /services /extraction_service.py

Soumik Bose

6c24b50 about 2 months ago

18.8 kB

	from __future__ import annotations

	import io
	import re
	import threading
	import warnings
	from pathlib import Path
	from typing import Any, Callable, Dict, List, Optional, Tuple, Union

	import pandas as pd

	from app.config import get_settings
	from app.core.constants import (
	MAX_CSV_ROWS,
	MAX_EXCEL_ROWS,
	MAX_MEMORY_CELLS,
	TABULAR_EXTENSIONS,
	)
	from app.core.logger import get_logger

	# Module-level logger shared by every helper in this file.
	_logger = get_logger(__name__)
	# Application settings; this module reads `spacy_model` and `max_upload_bytes` from it.
	_settings = get_settings()

	# Entity labels accepted in mapping rules, each mapped to a short human-readable
	# description. ExtractionService uses the keys to validate the `label` of
	# entity rules and returns a copy of this table from `get_spacy_labels`.
	VALID_SPACY_LABELS: Dict[str, str] = {
	"ORG": "Companies, agencies, institutions",
	"PERSON": "People, including fictional",
	"DATE": "Absolute or relative dates or periods",
	"MONEY": "Monetary values, including unit",
	"GPE": "Countries, cities, states",
	"LOC": "Non-GPE locations, mountain ranges, bodies of water",
	"PRODUCT": "Objects, vehicles, foods, etc.",
	"EVENT": "Named hurricanes, battles, wars, sports events",
	"CARDINAL": "Numerals that do not fall under another type",
	"PERCENT": "Percentage, including '%'",
	"QUANTITY": "Measurements, as of weight or distance",
	"TIME": "Times smaller than a day",
	"NORP": "Nationalities or religious or political groups",
	"FAC": "Buildings, airports, highways, bridges",
	"WORK_OF_ART": "Titles of books, songs, etc.",
	"LAW": "Named documents made into laws",
	"LANGUAGE": "Any named language",
	"ORDINAL": "'first', 'second', etc.",
	}

	# A schema node is a plain dict; the resolvers in this file read keys such as
	# "type", "fields", "items" and "source_type" from it.
	SchemaNode = Dict[str, Any]
	# A resolved value: a string, a list, a dict of named results, or None when
	# nothing could be extracted.
	ResultNode = Union[str, List[Any], Dict[str, Any], None]

	# Lock taken by `_apply_normalizers` around each lookup in `_NORMALIZERS`.
	_norm_lock = threading.Lock()
	# Named string clean-up steps that a rule can request through its "normalize" key.
	_NORMALIZERS: Dict[str, Callable[[str], str]] = {
	"strip": lambda s: s.strip(),
	"upper": lambda s: s.upper(),
	"lower": lambda s: s.lower(),
	"remove_commas": lambda s: s.replace(",", ""),
	"remove_spaces": lambda s: s.replace(" ", ""),
	# "\n" becomes a space, "\r" is dropped.
	"remove_newlines": lambda s: s.replace("\n", " ").replace("\r", ""),
	"collapse_whitespace": lambda s: re.sub(r"\s+", " ", s).strip(),
	# Strips the dollar, euro, pound, yen and rupee signs.
	"remove_currency": lambda s: re.sub(r"[$\u20ac\u00a3\u00a5\u20b9]", "", s),
	# Keeps only digits and "."; note that a leading minus sign is removed too.
	"remove_non_numeric": lambda s: re.sub(r"[^\d.]", "", s),
	# Rewrites "/" and "." separators as "-".
	"normalize_date_sep": lambda s: re.sub(r"[/.]", "-", s),
	}

	# Lock taken when `_RESOLVERS` is populated and when it is looked up.
	_resolver_lock = threading.Lock()
	# Registry of source_type name -> resolver called as fn(rule, doc, text);
	# filled by `_register_builtin_resolvers`.
	_RESOLVERS: Dict[str, Callable[[Dict[str, Any], Any, str], Optional[str]]] = {}

	# Lock and cache slot for the spaCy pipeline that `_get_nlp` loads on first use.
	_nlp_lock = threading.Lock()
	_nlp = None


	def _get_nlp():
	    """Return the shared spaCy pipeline, loading it on first use.

	    The pipeline is cached in the module-level ``_nlp``. The ``None`` check is
	    repeated while holding ``_nlp_lock`` so that concurrent first callers load
	    the model only once. The model name comes from ``_settings.spacy_model``;
	    the tagger, parser, lemmatizer and attribute_ruler components are excluded
	    at load time.
	    """
	    global _nlp
	    if _nlp is None:
	        with _nlp_lock:
	            # Re-check under the lock: another thread may have finished loading.
	            if _nlp is None:
	                import spacy

	                excluded = ["tagger", "parser", "lemmatizer", "attribute_ruler"]
	                _nlp = spacy.load(_settings.spacy_model, exclude=excluded)
	                _logger.info("spaCy %s loaded", _settings.spacy_model)
	    return _nlp


	def _validate_file_size(size: int) -> Optional[str]:
	    """Return an error message when ``size`` exceeds the configured upload limit.

	    Args:
	        size: Payload size in bytes.

	    Returns:
	        A human-readable error string if ``size`` is greater than
	        ``_settings.max_upload_bytes``, otherwise ``None``.
	    """
	    limit = _settings.max_upload_bytes
	    return (
	        f"File size {size} bytes exceeds limit of {limit} bytes"
	        if size > limit
	        else None
	    )


	def _check_memory_usage(rows: int, cols: int) -> Optional[str]:
	    """Return an error message when a table has more cells than allowed.

	    Args:
	        rows: Number of rows in the table.
	        cols: Number of columns in the table.

	    Returns:
	        ``None`` while ``rows * cols`` stays within ``MAX_MEMORY_CELLS``;
	        otherwise a message that includes a rough size estimate computed at
	        50 bytes per cell.
	    """
	    cell_count = rows * cols
	    if cell_count <= MAX_MEMORY_CELLS:
	        return None
	    approx_mb = (cell_count * 50) / (1024 * 1024)
	    return f"Data size too large (approx {approx_mb:.1f} MB). Too many cells: {rows}x{cols}"


	def _build_flags(rule: Dict[str, Any]) -> re.RegexFlag:
	flags = re.RegexFlag(0)
	for name in rule.get("flags", []):
	obj = getattr(re, name.upper(), None)
	if obj is None:
	_logger.warning("Unknown re flag %s", name)
	continue
	flags \|= obj
	return flags


	def _apply_normalizers(value: Optional[str], normalize: Any) -> Optional[str]:
	if not isinstance(value, str):
	return None
	if not normalize:
	return value
	if isinstance(normalize, str):
	normalize = [normalize]
	for key in normalize:
	with _norm_lock:
	fn = _NORMALIZERS.get(key)
	if fn is None:
	_logger.warning("Unknown normalizer %s", key)
	continue
	try:
	value = fn(value)
	except Exception as exc:
	_logger.error("Normalizer %s raised on value %s: %s", key, value, exc)
	return value if value else None


	def _try_group(match: re.Match, capture_group: Any) -> Tuple[bool, Optional[str]]:
	try:
	return True, match.group(capture_group)
	except (IndexError, re.error):
	_logger.warning("Group %s does not exist in pattern", capture_group)
	return False, None


	def _resolve_regex(rule: Dict[str, Any], text: str) -> Optional[str]:
	    """Extract a single value from ``text`` with the rule's pattern or fallbacks.

	    Rule keys read: ``pattern`` (required), ``fallback_patterns`` (a list, a
	    single pattern string, or missing), ``flags``, ``capture_group``
	    (default 0), ``match_index`` (default 0), ``normalize`` and ``strip_chars``.

	    Patterns are tried in order, primary first. For each pattern the match at
	    ``match_index`` is used, or the last match when the index is out of range.
	    A pattern is skipped in favour of the next one when it is invalid, does
	    not match, lacks the requested group, or produces a value that is empty
	    after normalizing and stripping.

	    Returns:
	        The first non-empty cleaned value, or ``None`` when no pattern yields one.
	    """
	    primary = rule.get("pattern", "")
	    if not primary:
	        _logger.warning("Regex rule missing pattern")
	        return None
	    flags = _build_flags(rule)
	    capture_group = rule.get("capture_group", 0)
	    match_index = rule.get("match_index", 0)
	    normalize = rule.get("normalize", "")
	    strip_chars = rule.get("strip_chars", "")
	    fallbacks = rule.get("fallback_patterns") or []
	    if isinstance(fallbacks, str):
	        # A lone string must not be unpacked into one pattern per character.
	        fallbacks = [fallbacks]

	    for pat in [primary, *fallbacks]:
	        try:
	            matches = list(re.finditer(pat, text, flags))
	        except re.error as exc:
	            _logger.error("Invalid regex %s: %s", pat, exc)
	            continue
	        if not matches:
	            continue
	        try:
	            chosen = matches[match_index]
	        except IndexError:
	            # Out-of-range index: fall back to the last match.
	            chosen = matches[-1]
	        exists, result = _try_group(chosen, capture_group)
	        if not exists or result is None:
	            continue
	        result = _apply_normalizers(result, normalize)
	        if result is None:
	            continue
	        result = result.strip(strip_chars) if strip_chars else result.strip()
	        if result:
	            return result
	        # Stripped down to nothing: treat it like any other unusable match and
	        # give the remaining fallback patterns a chance.
	    return None


	def _resolve_regex_all(rule: Dict[str, Any], text: str) -> List[Optional[str]]:
	    """Collect the cleaned value of every match of the rule's pattern in ``text``.

	    Rule keys read: ``pattern`` (required), ``flags``, ``capture_group``
	    (default 0), ``normalize``, ``strip_chars`` and ``max_items``. Fallback
	    patterns are not consulted here.

	    Returns:
	        Non-empty cleaned values in match order, at most ``max_items`` of them
	        when that key is set (a cap of zero or less yields an empty list).
	        An empty list is also returned when the pattern is missing or invalid.
	    """
	    primary = rule.get("pattern", "")
	    if not primary:
	        _logger.warning("Regex-array rule missing pattern")
	        return []
	    flags = _build_flags(rule)
	    capture_group = rule.get("capture_group", 0)
	    normalize = rule.get("normalize", "")
	    strip_chars = rule.get("strip_chars", "")
	    max_items = rule.get("max_items")

	    try:
	        # The pattern is compiled by this call, so a bad pattern raises here;
	        # the matches themselves are produced lazily as the loop consumes them.
	        matches = re.finditer(primary, text, flags)
	    except re.error as exc:
	        _logger.error("Invalid regex %s: %s", primary, exc)
	        return []

	    # The cap is checked after each append below, so a non-positive cap has to
	    # be handled up front or one item would still slip through.
	    if max_items is not None and max_items <= 0:
	        return []

	    results: List[Optional[str]] = []
	    for m in matches:
	        exists, result = _try_group(m, capture_group)
	        if not exists or result is None:
	            continue
	        result = _apply_normalizers(result, normalize)
	        if result is None:
	            continue
	        result = result.strip(strip_chars) if strip_chars else result.strip()
	        if not result:
	            continue
	        results.append(result)
	        if max_items is not None and len(results) >= max_items:
	            break
	    return results


	def _resolve_entity(rule: Dict[str, Any], doc: Any) -> Optional[str]:
	    """Pick one named entity from ``doc`` that satisfies the rule.

	    Rule keys read: ``label`` (one label or a collection of labels),
	    ``match_index`` (default 0), ``min_length`` (default 1),
	    ``exclude_pattern`` with ``exclude_flags``, and ``normalize``.

	    Returns:
	        The normalized text of the entity at ``match_index`` among the
	        qualifying entities (the last one when the index is out of range), or
	        ``None`` when ``doc`` is ``None`` or nothing qualifies.
	    """
	    if doc is None:
	        _logger.warning("Entity resolver received None doc")
	        return None

	    raw_labels = rule.get("label")
	    wanted = set([raw_labels] if isinstance(raw_labels, str) else (raw_labels or []))
	    position = rule.get("match_index", 0)
	    shortest = rule.get("min_length", 1)
	    exclude_pat = rule.get("exclude_pattern", "")
	    exclude_flags = _build_flags({"flags": rule.get("exclude_flags", [])})
	    normalize = rule.get("normalize", "")

	    hits = []
	    for ent in doc.ents:
	        if ent.label_ not in wanted:
	            continue
	        if len(ent.text) < shortest:
	            continue
	        if exclude_pat and re.search(exclude_pat, ent.text, exclude_flags):
	            continue
	        hits.append(ent.text)

	    if not hits:
	        return None
	    try:
	        chosen = hits[position]
	    except IndexError:
	        chosen = hits[-1]
	    return _apply_normalizers(chosen, normalize)


	def _resolve_entity_all(rule: Dict[str, Any], doc: Any) -> List[Optional[str]]:
	    """Collect every named entity in ``doc`` that satisfies the rule.

	    Rule keys read: ``label`` (one label or a collection of labels),
	    ``min_length`` (default 1), ``exclude_pattern`` with ``exclude_flags``,
	    ``normalize``, ``max_items`` and ``unique`` (default ``False``).

	    Returns:
	        Normalized entity texts in document order. With ``unique`` set,
	        repeated values (compared after normalization) are kept once. At most
	        ``max_items`` values are returned when that key is set; a cap of zero
	        or less yields an empty list. An empty list is returned when ``doc``
	        is ``None``.
	    """
	    if doc is None:
	        _logger.warning("Entity-array resolver received None doc")
	        return []
	    labels = rule.get("label")
	    if isinstance(labels, str):
	        labels = [labels]
	    labels = set(labels or [])
	    min_length = rule.get("min_length", 1)
	    exclude_pat = rule.get("exclude_pattern", "")
	    exclude_flags = _build_flags({"flags": rule.get("exclude_flags", [])})
	    normalize = rule.get("normalize", "")
	    max_items = rule.get("max_items")
	    unique = rule.get("unique", False)

	    # The cap is checked after each append below, so a non-positive cap has to
	    # be handled up front or one item would still slip through.
	    if max_items is not None and max_items <= 0:
	        return []

	    results: List[str] = []
	    seen: set = set()
	    for ent in doc.ents:
	        if ent.label_ not in labels:
	            continue
	        if len(ent.text) < min_length:
	            continue
	        if exclude_pat and re.search(exclude_pat, ent.text, exclude_flags):
	            continue
	        value = _apply_normalizers(ent.text, normalize)
	        if not value:
	            continue
	        if unique:
	            if value in seen:
	                continue
	            seen.add(value)
	        results.append(value)
	        if max_items is not None and len(results) >= max_items:
	            break
	    return results


	def _resolve_token_attr(rule: Dict[str, Any], doc: Any) -> Optional[str]:
	if doc is None:
	_logger.warning("Token-attr resolver received None doc")
	return None
	attr = rule.get("attr", "")
	match_index = rule.get("match_index", 0)
	normalize = rule.get("normalize", "")

	candidates = [t.text for t in doc if getattr(t, attr, False)]
	if not candidates:
	return None
	try:
	result = candidates[match_index]
	except IndexError:
	result = candidates[-1]
	return _apply_normalizers(result, normalize)


	def _register_builtin_resolvers() -> None:
	    """Install the built-in resolvers into ``_RESOLVERS``.

	    Every entry is called as ``fn(rule, doc, text)``. The regex resolvers use
	    only ``text``; the entity and token resolvers use only ``doc``.
	    """
	    builtin = {
	        "regex": lambda rule, doc, text: _resolve_regex(rule, text),
	        "entity": lambda rule, doc, text: _resolve_entity(rule, doc),
	        "token_attr": lambda rule, doc, text: _resolve_token_attr(rule, doc),
	        "regex_all": lambda rule, doc, text: _resolve_regex_all(rule, text),
	        "entity_all": lambda rule, doc, text: _resolve_entity_all(rule, doc),
	    }
	    with _resolver_lock:
	        _RESOLVERS.update(builtin)


	# Populate the registry once, at import time.
	_register_builtin_resolvers()


	def _resolve_scalar_field(rule: Dict[str, Any], doc: Any, text: str) -> Optional[str]:
	    """Resolve a leaf rule with the resolver registered for its ``source_type``.

	    Returns:
	        Whatever the registered resolver returns, or ``None`` (after a logged
	        warning) when no resolver is registered under that name.
	    """
	    source_type = rule.get("source_type")
	    with _resolver_lock:
	        resolver = _RESOLVERS.get(source_type)
	    if resolver is not None:
	        # Called outside the lock so a slow resolver does not block lookups.
	        return resolver(rule, doc, text)
	    _logger.warning("Unknown source_type %s", source_type)
	    return None


	def _resolve_node(node: SchemaNode, doc: Any, text: str) -> ResultNode:
	    """Resolve a schema node by dispatching on its ``type``.

	    ``"object"`` and ``"array"`` nodes go to their dedicated resolvers; any
	    other node, including one without a ``type``, is treated as a scalar rule.
	    """
	    kind = node.get("type")
	    if kind == "object":
	        handler = _resolve_object_node
	    elif kind == "array":
	        handler = _resolve_array_node
	    else:
	        handler = _resolve_scalar_field
	    return handler(node, doc, text)


	def _resolve_object_node(node: SchemaNode, doc: Any, text: str) -> Dict[str, ResultNode]:
	    """Resolve each child listed under ``node["fields"]`` into a dict of results.

	    A child whose resolution raises is logged and stored as ``None`` so that
	    the remaining fields are still resolved. A node without ``fields`` yields
	    an empty dict.
	    """
	    resolved: Dict[str, ResultNode] = {}
	    for name, schema in node.get("fields", {}).items():
	        try:
	            value = _resolve_node(schema, doc, text)
	        except Exception as exc:
	            _logger.error("Object field %s raised: %s", name, exc)
	            value = None
	        resolved[name] = value
	    return resolved


	def _resolve_array_node(node: SchemaNode, doc: Any, text: str) -> List[ResultNode]:
	    """Resolve an array node into a list of item results.

	    Node keys read: ``items`` (schema applied to each item), ``split_pattern``
	    with ``split_flags``, and ``max_items``.

	    With a ``split_pattern`` the text is split into segments, blank segments
	    are dropped, each remaining segment is parsed with spaCy and resolved
	    against ``items``; an item whose resolution raises is stored as ``None``.
	    Without one, ``items`` is resolved once against the whole text: a list
	    result is used as-is, any other non-``None`` result becomes a one-element
	    list.

	    Returns:
	        At most ``max_items`` results when that key is set (a cap of zero or
	        less yields an empty list). An empty list is returned when the split
	        pattern is invalid or the unsplit resolution raises.
	    """
	    item_schema: SchemaNode = node.get("items", {})
	    split_pat: Optional[str] = node.get("split_pattern")
	    max_items: Optional[int] = node.get("max_items")
	    # Clamp so that a zero or negative cap means "no items" on both paths.
	    cap: Optional[int] = None if max_items is None else max(max_items, 0)

	    if not split_pat:
	        try:
	            raw = _resolve_node(item_schema, doc, text)
	        except Exception as exc:
	            _logger.error("Array item resolve raised: %s", exc)
	            return []
	        if isinstance(raw, list):
	            results: List[ResultNode] = raw
	        elif raw is not None:
	            results = [raw]
	        else:
	            results = []
	        return results if cap is None else results[:cap]

	    try:
	        flags = _build_flags({"flags": node.get("split_flags", [])})
	        pieces = re.split(split_pat, text, flags=flags)
	    except re.error as exc:
	        _logger.error("Invalid split_pattern %s: %s", split_pat, exc)
	        return []

	    # re.split emits None for capturing groups that did not participate, and
	    # empty strings around adjacent delimiters. Drop both before parsing so
	    # spaCy never sees them, and stop at the cap so no segment is parsed in vain.
	    segments = [piece for piece in pieces if piece and piece.strip()]
	    if cap is not None:
	        segments = segments[:cap]
	    if not segments:
	        return []

	    nlp = _get_nlp()
	    try:
	        segment_docs = list(nlp.pipe(segments))
	    except Exception as exc:
	        _logger.error("spaCy pipe failed on array segments: %s", exc)
	        # Resolvers that need a doc will report None; regex resolvers still work.
	        segment_docs = [None] * len(segments)

	    results = []
	    for seg_doc, seg_text in zip(segment_docs, segments):
	        try:
	            item_result = _resolve_node(item_schema, seg_doc, seg_text)
	        except Exception as exc:
	            _logger.error("Array item resolve raised: %s", exc)
	            item_result = None
	        results.append(item_result)
	    return results


	def _safe_resolve_node(path: str, node: SchemaNode, doc: Any, text: str) -> ResultNode:
	    """Resolve ``node`` and turn any exception into a logged ``None``.

	    Args:
	        path: Name of the field being resolved; used only in the log message.
	        node: Schema node to resolve.
	        doc: Parsed document handed to the resolvers (may be ``None``).
	        text: Raw text handed to the resolvers.
	    """
	    try:
	        resolved = _resolve_node(node, doc, text)
	    except Exception as exc:
	        _logger.error("Schema path %s raised: %s", path, exc)
	        resolved = None
	    return resolved


	def _extract_spacy_fields(text: str, fields: Dict[str, SchemaNode]) -> Dict[str, ResultNode]:
	    """Parse ``text`` once and resolve every top-level field against it.

	    If the spaCy pipeline fails on the text, the failure is logged and the
	    fields are resolved with ``doc`` set to ``None``. Each field is resolved
	    through ``_safe_resolve_node``, so one failing field does not affect the
	    others.

	    Returns:
	        A dict mapping each field name to its resolved value.
	    """
	    pipeline = _get_nlp()
	    doc = None
	    try:
	        doc = next(iter(pipeline.pipe([text])))
	    except Exception as exc:
	        _logger.error("spaCy pipe failed: %s", exc)

	    extracted: Dict[str, ResultNode] = {}
	    for name, schema in fields.items():
	        extracted[name] = _safe_resolve_node(name, schema, doc, text)
	    return extracted


	def _read_tabular_frame(ext: str, source: Any) -> pd.DataFrame:
	    """Parse ``source`` (a path or a binary buffer) into a DataFrame based on ``ext``."""
	    if ext == ".csv":
	        # Read one row past the limit so the caller can tell the file is too long
	        # without loading all of it.
	        return pd.read_csv(source, nrows=MAX_CSV_ROWS + 1, low_memory=False)
	    return pd.read_excel(source, engine="openpyxl" if ext == ".xlsx" else "xlrd")


	def _extract_tabular(file_path: Union[str, Path], file_data: Optional[bytes] = None) -> Dict[str, Any]:
	    """Load a CSV or Excel file and return its contents as JSON-friendly data.

	    Args:
	        file_path: Path or file name; its suffix selects the parser, and the
	            file itself is read only when ``file_data`` is ``None``.
	        file_data: Raw file bytes. When given (even if empty) they are used
	            instead of reading ``file_path`` from disk.

	    Returns:
	        On success, a dict with ``success``, ``file_type`` and ``data``
	        (``columns``, ``rows`` as a list of records with missing values as
	        ``None``, ``shape`` and ``dtypes``). On failure, a dict with ``error``
	        and ``file_type``, plus ``row_count`` or ``exception_type`` where
	        relevant. This function does not raise for parsing problems.
	    """
	    path = Path(file_path)
	    ext = path.suffix.lower()

	    size_error: Optional[str] = None
	    if file_data is not None:
	        size_error = _validate_file_size(len(file_data))
	    elif path.exists():
	        size_error = _validate_file_size(path.stat().st_size)
	    if size_error:
	        return {"error": size_error, "file_type": ext}

	    try:
	        with warnings.catch_warnings():
	            warnings.simplefilter("ignore", UserWarning)
	            # `is not None` rather than truthiness: an empty upload must be
	            # parsed (and reported as empty), not silently replaced by a
	            # read of `file_path` from disk.
	            source = io.BytesIO(file_data) if file_data is not None else file_path
	            df = _read_tabular_frame(ext, source)

	            max_rows = MAX_EXCEL_ROWS if ext != ".csv" else MAX_CSV_ROWS
	            if len(df) > max_rows:
	                return {
	                    "error": f"File contains {len(df)} rows, exceeds limit of {max_rows}",
	                    "file_type": ext,
	                    "row_count": len(df),
	                }

	            mem_error = _check_memory_usage(len(df), len(df.columns))
	            if mem_error:
	                return {"error": mem_error, "file_type": ext}

	            # Cast to object first: on numeric columns `where(..., None)` would
	            # otherwise coerce the None back to NaN, which is not valid JSON.
	            rows = df.astype(object).where(pd.notnull(df), None).to_dict(orient="records")
	            result = {
	                "success": True,
	                "file_type": ext,
	                "data": {
	                    "columns": list(df.columns),
	                    "rows": rows,
	                    "shape": [len(df), len(df.columns)],
	                    "dtypes": {col: str(dtype) for col, dtype in df.dtypes.items()},
	                },
	            }
	            _logger.info("Extracted JSON from %s: %d rows, %d cols", ext, len(df), len(df.columns))
	            return result

	    except pd.errors.EmptyDataError:
	        return {"error": "File is empty or has no data", "file_type": ext}
	    except MemoryError:
	        return {"error": "Out of memory processing file", "file_type": ext}
	    except Exception as exc:
	        _logger.exception("JSON extraction failed for %s", ext)
	        return {
	            "error": f"Processing failed: {exc}",
	            "file_type": ext,
	            "exception_type": type(exc).__name__,
	        }


	class ExtractionService:
	    """Turn an uploaded document into structured data.

	    Tabular files (extensions in ``TABULAR_EXTENSIONS``) are parsed with
	    pandas. Every other file type is extracted from its markdown text with
	    spaCy, driven by caller-supplied field mappings.
	    """

	    # Source types whose ``label`` must be one of the known entity labels.
	    _ENTITY_SOURCE_TYPES = ("entity", "entity_all")

	    def __init__(self) -> None:
	        self._spacy_labels = VALID_SPACY_LABELS

	    def extract_structured(
	        self,
	        filename: Union[str, Path],
	        markdown_text: str,
	        mappings: Optional[Dict[str, Dict[str, Any]]] = None,
	        file_data: Optional[bytes] = None,
	    ) -> Dict[str, Any]:
	        """Extract structured data from one file.

	        Args:
	            filename: File name or path; its suffix selects the extractor.
	            markdown_text: Text of the document, used for non-tabular files.
	            mappings: Field name -> schema node; required for non-tabular files.
	            file_data: Raw bytes of the file, used for tabular files.

	        Returns:
	            A dict with ``success``, ``extractor``, ``file_type`` and ``data``
	            on success, or a dict with ``error`` and ``file_type`` (plus
	            details) on failure.
	        """
	        ext = Path(filename).suffix.lower()

	        if ext in TABULAR_EXTENSIONS:
	            result = _extract_tabular(filename, file_data)
	            if "error" not in result:
	                result["extractor"] = "pandas"
	            return result

	        if not mappings:
	            return {
	                "error": (
	                    f"Cannot extract JSON from '{ext}' files without field mappings. "
	                    "Provide a 'mappings' object with field extraction rules."
	                ),
	                "file_type": ext,
	            }

	        valid, mapper_error = self.validate_mappings(mappings)
	        if not valid:
	            return {
	                "error": "invalid_spacy_labels",
	                "label_mapper": mapper_error,
	                "file_type": ext,
	            }

	        try:
	            data = _extract_spacy_fields(markdown_text, mappings)
	            _logger.info("spaCy extraction completed for %s: %d fields", ext, len(data))
	            return {
	                "success": True,
	                "extractor": "spacy",
	                "file_type": ext,
	                "data": data,
	            }
	        except Exception as exc:
	            _logger.exception("spaCy extraction failed for %s", ext)
	            return {
	                "error": f"spaCy extraction failed: {exc}",
	                "file_type": ext,
	                "exception_type": type(exc).__name__,
	            }

	    def validate_mappings(self, mappings: Dict[str, Dict[str, Any]]) -> Tuple[bool, Optional[str]]:
	        """Check that every entity rule in ``mappings`` uses a known label.

	        Both ``entity`` and ``entity_all`` rules are checked, at the top level
	        and inside nested ``object`` fields and ``array`` items.

	        Returns:
	            ``(True, None)`` when all labels are known, otherwise
	            ``(False, message)`` describing the first offending label.
	        """
	        for key, rule in mappings.items():
	            error = self._validate_node(key, rule)
	            if error:
	                return False, error
	        return True, None

	    def _validate_node(self, path: str, node: Any) -> Optional[str]:
	        """Return an error message for the first invalid label under ``node``, if any."""
	        if not isinstance(node, dict):
	            # Not a rule mapping: nothing to validate here; resolution of this
	            # field will fail on its own and be reported as None.
	            return None

	        node_type = node.get("type")
	        if node_type == "object":
	            children = node.get("fields")
	            if isinstance(children, dict):
	                for name, child in children.items():
	                    error = self._validate_node(f"{path}.{name}", child)
	                    if error:
	                        return error
	            return None
	        if node_type == "array":
	            return self._validate_node(f"{path}[]", node.get("items"))

	        if node.get("source_type") not in self._ENTITY_SOURCE_TYPES:
	            return None
	        label = node.get("label")
	        labels = [label] if isinstance(label, str) else label
	        if not isinstance(labels, (list, tuple, set, frozenset)):
	            return None
	        for lbl in labels:
	            if not isinstance(lbl, str) or lbl not in self._spacy_labels:
	                return f"Invalid spaCy label '{lbl}' in field '{path}'"
	        return None

	    def get_spacy_labels(self) -> Dict[str, str]:
	        """Return a copy of the supported entity labels and their descriptions."""
	        return dict(self._spacy_labels)