Spaces:

kn29
/

doc-processor

Runtime error

App Files Community

kn29 committed on Sep 22, 2025

Commit

e611cc9

verified ·

1 Parent(s): 3d5f4f7

Update simple/ner.py

Browse files

Files changed (1) hide show

simple/ner.py +55 -61

simple/ner.py CHANGED Viewed

@@ -1,40 +1,57 @@
 import spacy
 from huggingface_hub import snapshot_download
-from typing import Dict, Any
-def extract_legal_entities(text, model_id=None, hf_token=None):
-    """
-    Extract named entities from legal text
-    Args:
-        text: Input text to process
-        model_id: Optional Hugging Face model ID (defaults to en_core_web_sm)
-        hf_token: Optional Hugging Face token
-    Returns:
-        Dictionary with entities and counts
-    """
-    if not text or not text.strip():
-        return {
-            "error": "Empty text provided",
-            "entities": [],
-            "entity_counts": {},
-            "total_entities": 0
-        }
-    # Load model
-    nlp = _load_ner_model(model_id, hf_token)
-    if not nlp:
-        return {
-            "error": "Failed to load NER model",
-            "entities": [],
-            "entity_counts": {},
-            "total_entities": 0
-        }
     try:
-        # Process text (handle large texts by chunking)
         if len(text) > 4000000:
             return _process_large_text(text, nlp)
         doc = nlp(text)
@@ -58,7 +75,6 @@ def extract_legal_entities(text, model_id=None, hf_token=None):
                     entity_counts[entity_label] = []
                 entity_counts[entity_label].append(entity_text)
-        # Process counts
         for label in entity_counts:
             unique_entities = list(set(entity_counts[label]))
             entity_counts[label] = {
@@ -74,6 +90,7 @@ def extract_legal_entities(text, model_id=None, hf_token=None):
         }
     except Exception as e:
         return {
             "error": str(e),
             "entities": [],
@@ -81,37 +98,14 @@ def extract_legal_entities(text, model_id=None, hf_token=None):
             "total_entities": 0
         }
-def _load_ner_model(model_id, hf_token):
-    """Load spaCy NER model"""
-    if not model_id:
-        model_id = 'en_core_web_sm'
-    try:
-        # Try loading from Hugging Face
-        if model_id != 'en_core_web_sm':
-            local_dir = snapshot_download(
-                repo_id=model_id,
-                token=hf_token if hf_token else None
-            )
-            return spacy.load(local_dir)
-        else:
-            # Load standard model
-            return spacy.load("en_core_web_sm")
-    except Exception:
-        # Fallback to standard English model
-        try:
-            return spacy.load("en_core_web_sm")
-        except Exception:
-            return None
-def _process_large_text(text, nlp, chunk_size=3000000):
-    """Process large text by chunking"""
     chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
     all_entities = []
     all_entity_counts = {}
     for i, chunk in enumerate(chunks):
         try:
             doc = nlp(chunk)
@@ -131,10 +125,10 @@ def _process_large_text(text, nlp, chunk_size=3000000):
                         all_entity_counts[entity_label] = []
                     all_entity_counts[entity_label].append(entity_text)
-        except Exception:
             continue
-    # Process counts
     for label in all_entity_counts:
         unique_entities = list(set(all_entity_counts[label]))
         all_entity_counts[label] = {
@@ -151,8 +145,8 @@ def _process_large_text(text, nlp, chunk_size=3000000):
         "num_chunks": len(chunks)
     }
-def _process_entity(ent):
-    """Process individual entity (handle special cases like 'X and Y')"""
     if ent.label_ in ["PRECEDENT", "ORG"] and " and " in ent.text:
         parts = ent.text.split(" and ")
         return [(p.strip(), "ORG") for p in parts]

+import os
 import spacy
 from huggingface_hub import snapshot_download
+from typing import List, Dict, Any
+import logging
+HF_MODEL_ID = "kn29/my-ner-model"
+logger = logging.getLogger(__name__)
+# Global variable to store the loaded model
+_nlp_model = None
+def _initialize_model(model_id: str = None):
+    """Load the spaCy NER pipeline once and cache it in the module-level ``_nlp_model``.
+
+    Args:
+        model_id: Hugging Face repo ID to download the pipeline from. When
+            ``None``, ``HF_MODEL_ID`` is used. Ignored entirely when a model
+            is already cached.
+
+    Returns:
+        The object returned by ``spacy.load`` (the cached one on repeat calls).
+
+    Raises:
+        Exception: If both the Hugging Face download/load and the
+            ``en_core_web_sm`` fallback fail. The message carries the
+            original (Hugging Face) error, not the fallback error.
+    """
+    global _nlp_model
+    # The cache is not keyed by model_id: once any model has been loaded,
+    # every later call returns it, even if a different model_id is passed.
+    if _nlp_model is not None:
+        return _nlp_model
+    if model_id is None:
+        model_id = HF_MODEL_ID
+    try:
+        logger.info(f"Loading NER model from Hugging Face: {model_id}")
+        # Either environment variable may supply the token; if neither is set
+        # (or the value is empty) the download is attempted with token=None.
+        token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
+        local_dir = snapshot_download(
+            repo_id=model_id,
+            token=token if token else None
+        )
+        _nlp_model = spacy.load(local_dir)
+        # Only whether a token was present is logged, never the token itself.
+        logger.info(
+            f"Successfully loaded NER model from {model_id} (token={'yes' if token else 'no'})"
+        )
+    except Exception as e:
+        # Any failure in the download or the load above lands here.
+        logger.error(f"Failed to load NER model from {model_id}: {str(e)}")
+        # Fallback to standard English model
+        try:
+            logger.info("Falling back to standard English model")
+            # A successful fallback is cached as well, so later calls will
+            # not retry the Hugging Face download.
+            _nlp_model = spacy.load("en_core_web_sm")
+        except Exception as fallback_error:
+            logger.error(f"Fallback model also failed: {str(fallback_error)}")
+            # The raised message reports the first error (e); the fallback
+            # error is only written to the log above.
+            raise Exception(f"No spaCy model available: {str(e)}")
+    return _nlp_model
+def process_text(text: str, model_id: str = None) -> Dict[str, Any]:
+    """Process text with NER model"""
     try:
+        nlp = _initialize_model(model_id)
         if len(text) > 4000000:
+            logger.info(f"Text too large ({len(text)} chars), processing in chunks")
             return _process_large_text(text, nlp)
         doc = nlp(text)
                     entity_counts[entity_label] = []
                 entity_counts[entity_label].append(entity_text)
         for label in entity_counts:
             unique_entities = list(set(entity_counts[label]))
             entity_counts[label] = {
         }
     except Exception as e:
+        logger.error(f"Error processing text with NER: {str(e)}")
         return {
             "error": str(e),
             "entities": [],
             "total_entities": 0
         }
+def _process_large_text(text: str, nlp, chunk_size: int = 3000000) -> Dict[str, Any]:
+    """Process large text in chunks"""
     chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
     all_entities = []
     all_entity_counts = {}
     for i, chunk in enumerate(chunks):
+        logger.info(f"Processing chunk {i+1}/{len(chunks)}")
         try:
             doc = nlp(chunk)
                         all_entity_counts[entity_label] = []
                     all_entity_counts[entity_label].append(entity_text)
+        except Exception as e:
+            logger.error(f"Error processing chunk {i+1}: {str(e)}")
             continue
     for label in all_entity_counts:
         unique_entities = list(set(all_entity_counts[label]))
         all_entity_counts[label] = {
         "num_chunks": len(chunks)
     }
+def _process_entity(ent) -> List[tuple]:
+    """Process individual entity, handling special cases"""
     if ent.label_ in ["PRECEDENT", "ORG"] and " and " in ent.text:
         parts = ent.text.split(" and ")
         return [(p.strip(), "ORG") for p in parts]