"""Named-entity recognition helpers built on a spaCy pipeline.

The pipeline is downloaded from the Hugging Face Hub on first use and cached
in a module-level global; if that fails, the stock ``en_core_web_sm`` model is
used instead.
"""

import logging
import os
from typing import Any, Dict, List, Optional, Tuple

import spacy
from huggingface_hub import snapshot_download

HF_MODEL_ID = "kn29/my-ner-model"

# Texts longer than this many characters are always processed in chunks.
_LARGE_TEXT_THRESHOLD = 4000000

# Characters at which a chunk boundary may be moved to avoid cutting a word.
_BREAK_CHARS = (" ", "\n", "\t", "\r")

logger = logging.getLogger(__name__)

# Global variable to store the loaded model
_nlp_model = None
# Model id the cached pipeline was requested for (None until first load).
_nlp_model_id = None


def _initialize_model(model_id: Optional[str] = None):
    """Load the NER pipeline once and return the cached instance afterwards.

    Args:
        model_id: Hugging Face repo id of the spaCy pipeline. Defaults to
            ``HF_MODEL_ID``.

    Returns:
        The loaded spaCy pipeline. Only ONE pipeline is cached per process:
        once a model is loaded, later calls return it even if they ask for a
        different ``model_id`` (a warning is logged in that case).

    Raises:
        RuntimeError: If neither the requested model nor the
            ``en_core_web_sm`` fallback can be loaded.
    """
    global _nlp_model, _nlp_model_id

    requested_id = model_id if model_id is not None else HF_MODEL_ID

    if _nlp_model is not None:
        if _nlp_model_id is not None and requested_id != _nlp_model_id:
            # The cache holds a single pipeline; make the mismatch visible
            # instead of silently answering with a different model.
            logger.warning(
                "NER model already initialised for %s; ignoring request for %s",
                _nlp_model_id,
                requested_id,
            )
        return _nlp_model

    try:
        logger.info("Loading NER model from Hugging Face: %s", requested_id)
        token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
        local_dir = snapshot_download(repo_id=requested_id, token=token or None)
        _nlp_model = spacy.load(local_dir)
        logger.info(
            "Successfully loaded NER model from %s (token=%s)",
            requested_id,
            "yes" if token else "no",
        )
    except Exception as e:
        logger.error("Failed to load NER model from %s: %s", requested_id, e)
        # Fallback to standard English model
        try:
            logger.info("Falling back to standard English model")
            _nlp_model = spacy.load("en_core_web_sm")
        except Exception as fallback_error:
            logger.error("Fallback model also failed: %s", fallback_error)
            raise RuntimeError(
                f"No spaCy model available: {e}"
            ) from fallback_error

    _nlp_model_id = requested_id
    return _nlp_model


def _pipeline_max_length(nlp) -> Optional[int]:
    """Return ``nlp.max_length`` if it is a positive int, else ``None``."""
    max_length = getattr(nlp, "max_length", None)
    if isinstance(max_length, int) and max_length > 0:
        return max_length
    return None


def _entity_records(doc, offset: int = 0) -> List[Dict[str, Any]]:
    """Convert the entities of one parsed doc into plain result dicts.

    ``offset`` is added to the character positions so that entities found in
    a chunk are reported relative to the full text.

    NOTE: when ``_process_entity`` splits one entity into several parts, every
    part carries the start/end of the WHOLE original entity.
    """
    records = []
    for ent in doc.ents:
        for entity_text, entity_label in _process_entity(ent):
            records.append({
                "text": entity_text,
                "label": entity_label,
                "start": ent.start_char + offset,
                "end": ent.end_char + offset,
            })
    return records


def _build_result(entities: List[Dict[str, Any]]) -> Dict[str, Any]:
    """Assemble the public result dict from a flat list of entity records."""
    grouped: Dict[str, List[str]] = {}
    for record in entities:
        grouped.setdefault(record["label"], []).append(record["text"])

    entity_counts = {}
    for label, texts in grouped.items():
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # output is deterministic (a set would not be).
        unique_entities = list(dict.fromkeys(texts))
        entity_counts[label] = {
            "entities": unique_entities,
            "count": len(unique_entities),
        }

    return {
        "entities": entities,
        "entity_counts": entity_counts,
        "total_entities": len(entities),
        "unique_labels": list(entity_counts),
    }


def process_text(text: str, model_id: Optional[str] = None) -> Dict[str, Any]:
    """Run NER over ``text`` and return the entities found.

    Args:
        text: The text to analyse.
        model_id: Optional Hugging Face repo id, passed to
            ``_initialize_model``.

    Returns:
        A dict with ``entities`` (list of text/label/start/end dicts),
        ``entity_counts`` (per label: unique entity texts and their count),
        ``total_entities`` and ``unique_labels``. Long texts additionally
        carry the keys added by ``_process_large_text``. This function does
        not raise: on any failure it returns the same keys with empty values
        plus an ``error`` message.
    """
    try:
        nlp = _initialize_model(model_id)

        # spaCy rejects texts longer than nlp.max_length, so the single-pass
        # limit must never exceed it.
        single_pass_limit = _LARGE_TEXT_THRESHOLD
        max_length = _pipeline_max_length(nlp)
        if max_length is not None:
            single_pass_limit = min(single_pass_limit, max_length)

        if len(text) > single_pass_limit:
            logger.info(f"Text too large ({len(text)} chars), processing in chunks")
            return _process_large_text(text, nlp)

        return _build_result(_entity_records(nlp(text)))

    except Exception as e:
        logger.error("Error processing text with NER: %s", e)
        return {
            "error": str(e),
            "entities": [],
            "entity_counts": {},
            "total_entities": 0,
            "unique_labels": [],
        }


def _soften_boundary(text: str, start: int, end: int) -> int:
    """Move a chunk end (``start < end < len(text)``) off the middle of a word.

    Preference order: the last newline in the second half of the window, the
    proposed cut if it already touches whitespace, the last whitespace in the
    window, and finally the hard cut at ``end``. The returned position is
    always greater than ``start`` and never greater than ``end``.
    """
    midpoint = start + (end - start) // 2
    newline = text.rfind("\n", midpoint, end)
    if newline != -1:
        return newline + 1
    if text[end - 1].isspace() or text[end].isspace():
        return end
    last_break = max(text.rfind(ch, start, end) for ch in _BREAK_CHARS)
    return last_break + 1 if last_break != -1 else end


def _split_into_chunks(text: str, chunk_size: int) -> List[Tuple[int, str]]:
    """Split ``text`` into ``(start_offset, chunk)`` pairs of at most ``chunk_size`` chars."""
    chunks = []
    total = len(text)
    start = 0
    while start < total:
        end = min(start + chunk_size, total)
        if end < total:
            end = _soften_boundary(text, start, end)
        chunks.append((start, text[start:end]))
        start = end
    return chunks


def _process_large_text(text: str, nlp, chunk_size: int = 3000000) -> Dict[str, Any]:
    """Process large text in chunks.

    Args:
        text: The full text.
        nlp: Callable pipeline returning a doc with ``.ents``.
        chunk_size: Maximum chunk length in characters. It is reduced to
            ``nlp.max_length`` when the pipeline exposes a smaller limit,
            because a longer chunk would be rejected by the pipeline.

    Returns:
        The same structure as ``process_text`` plus ``processed_in_chunks``,
        ``num_chunks`` and ``failed_chunks`` (chunks whose processing raised
        and were skipped). Entity offsets are relative to the full text.
        Entities that span a chunk boundary may still be missed or truncated.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")

    max_length = _pipeline_max_length(nlp)
    if max_length is not None:
        chunk_size = min(chunk_size, max_length)

    chunks = _split_into_chunks(text, chunk_size)
    all_entities: List[Dict[str, Any]] = []
    failed_chunks = 0

    for i, (offset, chunk) in enumerate(chunks):
        logger.info(f"Processing chunk {i+1}/{len(chunks)}")
        try:
            all_entities.extend(_entity_records(nlp(chunk), offset))
        except Exception as e:
            # Best effort: one bad chunk must not discard the others.
            failed_chunks += 1
            logger.error("Error processing chunk %d: %s", i + 1, e)

    result = _build_result(all_entities)
    result["processed_in_chunks"] = True
    result["num_chunks"] = len(chunks)
    result["failed_chunks"] = failed_chunks
    return result


def _process_entity(ent) -> List[tuple]:
    """Process individual entity, handling special cases.

    A ``PRECEDENT`` or ``ORG`` entity whose text contains ``" and "`` is split
    on that separator and every non-empty part is returned labelled ``ORG``.
    Any other entity is returned unchanged as a single ``(text, label)`` pair.
    """
    if ent.label_ in ("PRECEDENT", "ORG") and " and " in ent.text:
        parts = [part.strip() for part in ent.text.split(" and ")]
        parts = [part for part in parts if part]
        if parts:
            return [(part, "ORG") for part in parts]
    return [(ent.text, ent.label_)]