SCBconsulting committed on
Commit
722c74f
·
verified ·
1 Parent(s): 8d0f5c4

Update utils/metadata.py

Browse files
Files changed (1) hide show
  1. utils/metadata.py +13 -10
utils/metadata.py CHANGED
@@ -18,14 +18,15 @@ def clean_text(text):
18
 
19
  def extract_metadata(text):
20
  """
21
- Extract entities: DATE, PERSON, ORGANIZATION, LOCATION
22
  """
23
  if not text.strip():
24
  return {"error": "No input provided."}
25
 
26
- text = clean_text(text)[:4000] # Allow a broader context
27
-
28
- ner_results = ner_pipeline(text)
 
29
 
30
  metadata = {
31
  "DATE": [],
@@ -34,11 +35,13 @@ def extract_metadata(text):
34
  "LOCATION": []
35
  }
36
 
37
- for ent in ner_results:
38
- label = ent["entity_group"]
39
- word = ent["word"]
40
-
41
- if label in metadata and word not in metadata[label]:
42
- metadata[label].append(word)
 
43
 
44
  return metadata
 
 
18
 
19
  def extract_metadata(text):
20
  """
21
+ Extract named entities from long documents using chunked NER.
22
  """
23
  if not text.strip():
24
  return {"error": "No input provided."}
25
 
26
+ text = clean_text(text)
27
+ max_chunk_length = 512 # safe for transformer models
28
+ words = text.split()
29
+ chunks = [" ".join(words[i:i + max_chunk_length]) for i in range(0, len(words), max_chunk_length)]
30
 
31
  metadata = {
32
  "DATE": [],
 
35
  "LOCATION": []
36
  }
37
 
38
+ for chunk in chunks:
39
+ ner_results = ner_pipeline(chunk)
40
+ for ent in ner_results:
41
+ label = ent["entity_group"]
42
+ word = ent["word"]
43
+ if label in metadata and word not in metadata[label]:
44
+ metadata[label].append(word)
45
 
46
  return metadata
47
+