kn29 committed on
Commit
e611cc9
·
verified ·
1 Parent(s): 3d5f4f7

Update simple/ner.py

Browse files
Files changed (1) hide show
  1. simple/ner.py +55 -61
simple/ner.py CHANGED
@@ -1,40 +1,57 @@
 
1
  import spacy
2
  from huggingface_hub import snapshot_download
3
- from typing import Dict, Any
 
4
 
5
- def extract_legal_entities(text, model_id=None, hf_token=None):
6
- """
7
- Extract named entities from legal text
8
-
9
- Args:
10
- text: Input text to process
11
- model_id: Optional Hugging Face model ID (defaults to en_core_web_sm)
12
- hf_token: Optional Hugging Face token
13
-
14
- Returns:
15
- Dictionary with entities and counts
16
- """
17
- if not text or not text.strip():
18
- return {
19
- "error": "Empty text provided",
20
- "entities": [],
21
- "entity_counts": {},
22
- "total_entities": 0
23
- }
24
 
25
- # Load model
26
- nlp = _load_ner_model(model_id, hf_token)
27
- if not nlp:
28
- return {
29
- "error": "Failed to load NER model",
30
- "entities": [],
31
- "entity_counts": {},
32
- "total_entities": 0
33
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
 
 
 
 
 
35
  try:
36
- # Process text (handle large texts by chunking)
 
37
  if len(text) > 4000000:
 
38
  return _process_large_text(text, nlp)
39
 
40
  doc = nlp(text)
@@ -58,7 +75,6 @@ def extract_legal_entities(text, model_id=None, hf_token=None):
58
  entity_counts[entity_label] = []
59
  entity_counts[entity_label].append(entity_text)
60
 
61
- # Process counts
62
  for label in entity_counts:
63
  unique_entities = list(set(entity_counts[label]))
64
  entity_counts[label] = {
@@ -74,6 +90,7 @@ def extract_legal_entities(text, model_id=None, hf_token=None):
74
  }
75
 
76
  except Exception as e:
 
77
  return {
78
  "error": str(e),
79
  "entities": [],
@@ -81,37 +98,14 @@ def extract_legal_entities(text, model_id=None, hf_token=None):
81
  "total_entities": 0
82
  }
83
 
84
- def _load_ner_model(model_id, hf_token):
85
- """Load spaCy NER model"""
86
- if not model_id:
87
- model_id = 'en_core_web_sm'
88
-
89
- try:
90
- # Try loading from Hugging Face
91
- if model_id != 'en_core_web_sm':
92
- local_dir = snapshot_download(
93
- repo_id=model_id,
94
- token=hf_token if hf_token else None
95
- )
96
- return spacy.load(local_dir)
97
- else:
98
- # Load standard model
99
- return spacy.load("en_core_web_sm")
100
-
101
- except Exception:
102
- # Fallback to standard English model
103
- try:
104
- return spacy.load("en_core_web_sm")
105
- except Exception:
106
- return None
107
-
108
- def _process_large_text(text, nlp, chunk_size=3000000):
109
- """Process large text by chunking"""
110
  chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
111
  all_entities = []
112
  all_entity_counts = {}
113
 
114
  for i, chunk in enumerate(chunks):
 
115
  try:
116
  doc = nlp(chunk)
117
 
@@ -131,10 +125,10 @@ def _process_large_text(text, nlp, chunk_size=3000000):
131
  all_entity_counts[entity_label] = []
132
  all_entity_counts[entity_label].append(entity_text)
133
 
134
- except Exception:
 
135
  continue
136
 
137
- # Process counts
138
  for label in all_entity_counts:
139
  unique_entities = list(set(all_entity_counts[label]))
140
  all_entity_counts[label] = {
@@ -151,8 +145,8 @@ def _process_large_text(text, nlp, chunk_size=3000000):
151
  "num_chunks": len(chunks)
152
  }
153
 
154
- def _process_entity(ent):
155
- """Process individual entity (handle special cases like 'X and Y')"""
156
  if ent.label_ in ["PRECEDENT", "ORG"] and " and " in ent.text:
157
  parts = ent.text.split(" and ")
158
  return [(p.strip(), "ORG") for p in parts]
 
1
+ import os
2
  import spacy
3
  from huggingface_hub import snapshot_download
4
+ from typing import List, Dict, Any
5
+ import logging
6
 
7
+ HF_MODEL_ID = "kn29/my-ner-model"
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+ # Global variable to store the loaded model
12
+ _nlp_model = None
13
+
14
def _initialize_model(model_id: str = None):
    """Load and cache the spaCy NER model.

    The loaded pipeline is cached in the module-level ``_nlp_model``
    global, so the download/load cost is paid once per process.

    Args:
        model_id: Hugging Face repo id to download the model from.
            Defaults to ``HF_MODEL_ID``. NOTE(review): once a model is
            cached, the cached pipeline is returned even if a different
            ``model_id`` is passed later — confirm this is intended.

    Returns:
        The loaded spaCy pipeline.

    Raises:
        RuntimeError: if neither the Hugging Face model nor the
            ``en_core_web_sm`` fallback can be loaded.
    """
    global _nlp_model

    # Fast path: reuse the pipeline loaded by a previous call.
    if _nlp_model is not None:
        return _nlp_model

    if model_id is None:
        model_id = HF_MODEL_ID

    try:
        # Lazy %-style args so formatting only happens if the level is enabled.
        logger.info("Loading NER model from Hugging Face: %s", model_id)
        # Accept either env var name; an empty string is treated as "no token".
        token = os.getenv("HUGGINGFACE_TOKEN") or os.getenv("HF_TOKEN")
        local_dir = snapshot_download(
            repo_id=model_id,
            token=token if token else None,
        )
        _nlp_model = spacy.load(local_dir)
        logger.info(
            "Successfully loaded NER model from %s (token=%s)",
            model_id,
            "yes" if token else "no",
        )

    except Exception as e:
        logger.error("Failed to load NER model from %s: %s", model_id, e)
        # Fallback to the standard small English model.
        try:
            logger.info("Falling back to standard English model")
            _nlp_model = spacy.load("en_core_web_sm")
        except Exception as fallback_error:
            logger.error("Fallback model also failed: %s", fallback_error)
            # Chain the fallback failure so both root causes appear in the
            # traceback; RuntimeError is a subclass of Exception, so existing
            # `except Exception` callers still catch it.
            raise RuntimeError(
                f"No spaCy model available: {str(e)}"
            ) from fallback_error

    return _nlp_model
47
+
48
+ def process_text(text: str, model_id: str = None) -> Dict[str, Any]:
49
+ """Process text with NER model"""
50
  try:
51
+ nlp = _initialize_model(model_id)
52
+
53
  if len(text) > 4000000:
54
+ logger.info(f"Text too large ({len(text)} chars), processing in chunks")
55
  return _process_large_text(text, nlp)
56
 
57
  doc = nlp(text)
 
75
  entity_counts[entity_label] = []
76
  entity_counts[entity_label].append(entity_text)
77
 
 
78
  for label in entity_counts:
79
  unique_entities = list(set(entity_counts[label]))
80
  entity_counts[label] = {
 
90
  }
91
 
92
  except Exception as e:
93
+ logger.error(f"Error processing text with NER: {str(e)}")
94
  return {
95
  "error": str(e),
96
  "entities": [],
 
98
  "total_entities": 0
99
  }
100
 
101
+ def _process_large_text(text: str, nlp, chunk_size: int = 3000000) -> Dict[str, Any]:
102
+ """Process large text in chunks"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  chunks = [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
104
  all_entities = []
105
  all_entity_counts = {}
106
 
107
  for i, chunk in enumerate(chunks):
108
+ logger.info(f"Processing chunk {i+1}/{len(chunks)}")
109
  try:
110
  doc = nlp(chunk)
111
 
 
125
  all_entity_counts[entity_label] = []
126
  all_entity_counts[entity_label].append(entity_text)
127
 
128
+ except Exception as e:
129
+ logger.error(f"Error processing chunk {i+1}: {str(e)}")
130
  continue
131
 
 
132
  for label in all_entity_counts:
133
  unique_entities = list(set(all_entity_counts[label]))
134
  all_entity_counts[label] = {
 
145
  "num_chunks": len(chunks)
146
  }
147
 
148
+ def _process_entity(ent) -> List[tuple]:
149
+ """Process individual entity, handling special cases"""
150
  if ent.label_ in ["PRECEDENT", "ORG"] and " and " in ent.text:
151
  parts = ent.text.split(" and ")
152
  return [(p.strip(), "ORG") for p in parts]