Spaces:
Sleeping
Sleeping
| import spacy | |
| from huggingface_hub import snapshot_download | |
| from typing import Dict, Any | |
def extract_legal_entities(text, model_id=None, hf_token=None):
    """Extract named entities from legal text.

    Args:
        text: Input text to process.
        model_id: Optional Hugging Face model ID (defaults to en_core_web_sm).
        hf_token: Optional Hugging Face token.

    Returns:
        Dict with "entities" (list of {text, label, start, end} mentions),
        "entity_counts" (per-label unique entity texts and their count),
        "total_entities", and "unique_labels". On failure the dict instead
        carries an "error" message alongside empty results, so callers
        always receive the same shape.
    """
    if not text or not text.strip():
        return {
            "error": "Empty text provided",
            "entities": [],
            "entity_counts": {},
            "total_entities": 0,
        }

    # Load model (custom HF model or the bundled English pipeline).
    nlp = _load_ner_model(model_id, hf_token)
    if not nlp:
        return {
            "error": "Failed to load NER model",
            "entities": [],
            "entity_counts": {},
            "total_entities": 0,
        }

    try:
        # Very large inputs are processed chunk-by-chunk to bound memory use.
        if len(text) > 4000000:
            return _process_large_text(text, nlp)

        doc = nlp(text)
        entities = []
        entity_counts = {}
        for ent in doc.ents:
            # A single span like "A and B" may expand to several (text, label) pairs.
            for entity_text, entity_label in _process_entity(ent):
                entities.append({
                    "text": entity_text,
                    "label": entity_label,
                    # NOTE: split parts all keep the span of the original entity.
                    "start": ent.start_char,
                    "end": ent.end_char,
                })
                entity_counts.setdefault(entity_label, []).append(entity_text)

        # Replace raw mention lists with de-duplicated summaries.
        # dict.fromkeys preserves first-seen order, so the output is
        # deterministic (set() iteration order varies run-to-run under
        # string hash randomization).
        for label, mentions in entity_counts.items():
            unique_entities = list(dict.fromkeys(mentions))
            entity_counts[label] = {
                "entities": unique_entities,
                "count": len(unique_entities),
            }

        return {
            "entities": entities,
            "entity_counts": entity_counts,
            "total_entities": len(entities),
            "unique_labels": list(entity_counts.keys()),
        }
    except Exception as e:
        # Report the failure in-band rather than raising, preserving the
        # function's uniform return shape.
        return {
            "error": str(e),
            "entities": [],
            "entity_counts": {},
            "total_entities": 0,
        }
def _load_ner_model(model_id, hf_token):
    """Load a spaCy NER pipeline, falling back to en_core_web_sm.

    A custom ``model_id`` is fetched from the Hugging Face Hub first;
    any failure falls back to the bundled small English model. Returns
    the loaded pipeline, or None if nothing could be loaded.
    """
    model_id = model_id or "en_core_web_sm"
    try:
        if model_id == "en_core_web_sm":
            return spacy.load("en_core_web_sm")
        # Custom model: download a local snapshot from the Hub, then load it.
        local_dir = snapshot_download(
            repo_id=model_id,
            token=hf_token if hf_token else None,
        )
        return spacy.load(local_dir)
    except Exception:
        pass
    # Last resort: the standard English model; give up with None on failure.
    try:
        return spacy.load("en_core_web_sm")
    except Exception:
        return None
| def _process_large_text(text, nlp, chunk_size=3000000): | |
| """Process large text by chunking""" | |
| chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)] | |
| all_entities = [] | |
| all_entity_counts = {} | |
| for i, chunk in enumerate(chunks): | |
| try: | |
| doc = nlp(chunk) | |
| for ent in doc.ents: | |
| processed_entities = _process_entity(ent) | |
| for entity_text, entity_label in processed_entities: | |
| entity_info = { | |
| "text": entity_text, | |
| "label": entity_label, | |
| "start": ent.start_char + (i * chunk_size), | |
| "end": ent.end_char + (i * chunk_size) | |
| } | |
| all_entities.append(entity_info) | |
| if entity_label not in all_entity_counts: | |
| all_entity_counts[entity_label] = [] | |
| all_entity_counts[entity_label].append(entity_text) | |
| except Exception: | |
| continue | |
| # Process counts | |
| for label in all_entity_counts: | |
| unique_entities = list(set(all_entity_counts[label])) | |
| all_entity_counts[label] = { | |
| "entities": unique_entities, | |
| "count": len(unique_entities) | |
| } | |
| return { | |
| "entities": all_entities, | |
| "entity_counts": all_entity_counts, | |
| "total_entities": len(all_entities), | |
| "unique_labels": list(all_entity_counts.keys()), | |
| "processed_in_chunks": True, | |
| "num_chunks": len(chunks) | |
| } | |
| def _process_entity(ent): | |
| """Process individual entity (handle special cases like 'X and Y')""" | |
| if ent.label_ in ["PRECEDENT", "ORG"] and " and " in ent.text: | |
| parts = ent.text.split(" and ") | |
| return [(p.strip(), "ORG") for p in parts] | |
| return [(ent.text, ent.label_)] |