Statistical-Impossibility committed on
Commit
3ebde54
·
verified ·
1 Parent(s): 36e71f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -8
app.py CHANGED
@@ -95,26 +95,33 @@ def ner_predict(text):
95
  if not sentences:
96
  return "<p>No sentences detected</p>", ""
97
 
98
- # Chunking with overlap
 
 
 
 
 
 
99
  max_tokens = 450
100
  chunks = []
101
-
102
  i = 0
 
103
  while i < len(sentences):
104
  chunk_sents = []
105
- chunk_text = ""
106
 
107
  for j in range(i, len(sentences)):
108
- candidate = chunk_text + " " + sentences[j]["text"] if chunk_text else sentences[j]["text"]
109
- tokens = ner_pipeline.tokenizer.tokenize(candidate)
110
 
111
- if len(tokens) > max_tokens and chunk_sents:
 
112
  break
113
 
114
  chunk_sents.append(sentences[j])
115
- chunk_text = candidate
116
 
117
  if chunk_sents:
 
118
  chunks.append({
119
  "text": chunk_text,
120
  "offset": chunk_sents[0]["start"],
@@ -124,7 +131,7 @@ def ner_predict(text):
124
  sentences_to_skip = max(1, len(chunk_sents) - 2)
125
  i += sentences_to_skip
126
 
127
- # Predict on chunks
128
  all_entities = []
129
 
130
  for chunk in chunks:
@@ -148,6 +155,7 @@ def ner_predict(text):
148
  except Exception as e:
149
  print(f"Chunk processing error: {e}")
150
  continue
 
151
 
152
  # Sort and deduplicate
153
  all_entities = sorted(all_entities, key=lambda x: (x['start'], -x['score']))
 
95
  if not sentences:
96
  return "<p>No sentences detected</p>", ""
97
 
98
+ # Pre-tokenize sentences ONCE (cache token counts)
99
+ sentence_token_counts = []
100
+ for sent in sentences:
101
+ tokens = ner_pipeline.tokenizer.tokenize(sent["text"])
102
+ sentence_token_counts.append(len(tokens))
103
+
104
+ # Chunking with cached token counts
105
  max_tokens = 450
106
  chunks = []
 
107
  i = 0
108
+
109
  while i < len(sentences):
110
  chunk_sents = []
111
+ token_count = 0
112
 
113
  for j in range(i, len(sentences)):
114
+ sent_token_count = sentence_token_counts[j]
 
115
 
116
+ # Check if adding this sentence exceeds limit
117
+ if token_count + sent_token_count > max_tokens and chunk_sents:
118
  break
119
 
120
  chunk_sents.append(sentences[j])
121
+ token_count += sent_token_count
122
 
123
  if chunk_sents:
124
+ chunk_text = " ".join([s["text"] for s in chunk_sents])
125
  chunks.append({
126
  "text": chunk_text,
127
  "offset": chunk_sents[0]["start"],
 
131
  sentences_to_skip = max(1, len(chunk_sents) - 2)
132
  i += sentences_to_skip
133
 
134
+ # Predict on chunks (NO CHANGES HERE)
135
  all_entities = []
136
 
137
  for chunk in chunks:
 
155
  except Exception as e:
156
  print(f"Chunk processing error: {e}")
157
  continue
158
+
159
 
160
  # Sort and deduplicate
161
  all_entities = sorted(all_entities, key=lambda x: (x['start'], -x['score']))