"""Named-entity extraction for legal text using spaCy NER pipelines."""

from collections import defaultdict
from typing import Any, Dict, List, Tuple

import spacy
from huggingface_hub import snapshot_download


def _error_result(message: str) -> Dict[str, Any]:
    """Build the standard empty result payload carrying an error message."""
    return {
        "error": message,
        "entities": [],
        "entity_counts": {},
        "total_entities": 0,
    }


def _summarize_counts(label_to_texts: Dict[str, List[str]]) -> Dict[str, Any]:
    """Collapse per-label entity-text lists into unique entities plus counts.

    Unique entities are sorted so output is deterministic across runs
    (``list(set(...))`` ordering would vary with hash randomization).
    """
    summary: Dict[str, Any] = {}
    for label, texts in label_to_texts.items():
        unique_entities = sorted(set(texts))
        summary[label] = {
            "entities": unique_entities,
            "count": len(unique_entities),
        }
    return summary


def _accumulate_entities(doc, char_offset: int,
                         entities: List[Dict[str, Any]],
                         label_to_texts: Dict[str, List[str]]) -> None:
    """Append processed entities from *doc* into the shared accumulators.

    Args:
        doc: A processed spaCy ``Doc``.
        char_offset: Added to each entity's start/end (used when the text
            was chunked, so offsets refer to the full original text).
        entities: Output list of entity-info dicts (mutated in place).
        label_to_texts: Output mapping label -> entity texts (mutated in place).
    """
    for ent in doc.ents:
        for entity_text, entity_label in _process_entity(ent):
            entities.append({
                "text": entity_text,
                "label": entity_label,
                # NOTE(review): when a span is split on " and ", each part
                # inherits the full span's offsets (original behavior kept).
                "start": ent.start_char + char_offset,
                "end": ent.end_char + char_offset,
            })
            label_to_texts[entity_label].append(entity_text)


def extract_legal_entities(text, model_id=None, hf_token=None):
    """Extract named entities from legal text.

    Args:
        text: Input text to process.
        model_id: Optional Hugging Face model ID (defaults to en_core_web_sm).
        hf_token: Optional Hugging Face token.

    Returns:
        Dictionary with entities and counts; on failure, a dictionary with
        an "error" key and empty entity fields.
    """
    if not text or not text.strip():
        return _error_result("Empty text provided")

    nlp = _load_ner_model(model_id, hf_token)
    if not nlp:
        return _error_result("Failed to load NER model")

    try:
        # Very large inputs are processed in chunks to bound memory use.
        if len(text) > 4_000_000:
            return _process_large_text(text, nlp)

        entities: List[Dict[str, Any]] = []
        label_to_texts: Dict[str, List[str]] = defaultdict(list)
        _accumulate_entities(nlp(text), 0, entities, label_to_texts)

        return {
            "entities": entities,
            "entity_counts": _summarize_counts(label_to_texts),
            "total_entities": len(entities),
            "unique_labels": list(label_to_texts.keys()),
        }
    except Exception as e:  # boundary: surface any pipeline failure to caller
        return _error_result(str(e))


def _load_ner_model(model_id, hf_token):
    """Load a spaCy NER pipeline.

    Downloads *model_id* from the Hugging Face Hub when given; otherwise
    (and as a fallback on any failure) loads the standard ``en_core_web_sm``
    model. Returns ``None`` only if every attempt fails.
    """
    if not model_id:
        model_id = 'en_core_web_sm'
    try:
        if model_id != 'en_core_web_sm':
            local_dir = snapshot_download(
                repo_id=model_id,
                token=hf_token or None,
            )
            return spacy.load(local_dir)
        # Load standard model
        return spacy.load("en_core_web_sm")
    except Exception:
        # Fallback to standard English model
        try:
            return spacy.load("en_core_web_sm")
        except Exception:
            return None


def _process_large_text(text, nlp, chunk_size=3_000_000):
    """Process large text by running the pipeline over fixed-size chunks.

    NOTE(review): a chunk boundary can split an entity in two; consider
    overlapping chunks if boundary-straddling entities matter.
    """
    chunks = [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
    all_entities: List[Dict[str, Any]] = []
    label_to_texts: Dict[str, List[str]] = defaultdict(list)

    for index, chunk in enumerate(chunks):
        try:
            _accumulate_entities(
                nlp(chunk), index * chunk_size, all_entities, label_to_texts,
            )
        except Exception:
            # Best-effort: skip chunks the pipeline cannot process.
            continue

    return {
        "entities": all_entities,
        "entity_counts": _summarize_counts(label_to_texts),
        "total_entities": len(all_entities),
        "unique_labels": list(label_to_texts.keys()),
        "processed_in_chunks": True,
        "num_chunks": len(chunks),
    }


def _process_entity(ent) -> List[Tuple[str, str]]:
    """Split conjoined spans like "X and Y" into separate ORG entities.

    Applies only to PRECEDENT/ORG spans; every other entity passes through
    unchanged as a single ``(text, label)`` pair.
    """
    if ent.label_ in ("PRECEDENT", "ORG") and " and " in ent.text:
        parts = ent.text.split(" and ")
        return [(part.strip(), "ORG") for part in parts]
    return [(ent.text, ent.label_)]