Update app.py
Browse files
app.py
CHANGED
|
@@ -95,26 +95,33 @@ def ner_predict(text):
|
|
| 95 |
if not sentences:
|
| 96 |
return "<p>No sentences detected</p>", ""
|
| 97 |
|
| 98 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 99 |
max_tokens = 450
|
| 100 |
chunks = []
|
| 101 |
-
|
| 102 |
i = 0
|
|
|
|
| 103 |
while i < len(sentences):
|
| 104 |
chunk_sents = []
|
| 105 |
-
|
| 106 |
|
| 107 |
for j in range(i, len(sentences)):
|
| 108 |
-
|
| 109 |
-
tokens = ner_pipeline.tokenizer.tokenize(candidate)
|
| 110 |
|
| 111 |
-
if
|
|
|
|
| 112 |
break
|
| 113 |
|
| 114 |
chunk_sents.append(sentences[j])
|
| 115 |
-
|
| 116 |
|
| 117 |
if chunk_sents:
|
|
|
|
| 118 |
chunks.append({
|
| 119 |
"text": chunk_text,
|
| 120 |
"offset": chunk_sents[0]["start"],
|
|
@@ -124,7 +131,7 @@ def ner_predict(text):
|
|
| 124 |
sentences_to_skip = max(1, len(chunk_sents) - 2)
|
| 125 |
i += sentences_to_skip
|
| 126 |
|
| 127 |
-
# Predict on chunks
|
| 128 |
all_entities = []
|
| 129 |
|
| 130 |
for chunk in chunks:
|
|
@@ -148,6 +155,7 @@ def ner_predict(text):
|
|
| 148 |
except Exception as e:
|
| 149 |
print(f"Chunk processing error: {e}")
|
| 150 |
continue
|
|
|
|
| 151 |
|
| 152 |
# Sort and deduplicate
|
| 153 |
all_entities = sorted(all_entities, key=lambda x: (x['start'], -x['score']))
|
|
|
|
| 95 |
if not sentences:
|
| 96 |
return "<p>No sentences detected</p>", ""
|
| 97 |
|
| 98 |
+
# Pre-tokenize sentences ONCE (cache token counts)
|
| 99 |
+
sentence_token_counts = []
|
| 100 |
+
for sent in sentences:
|
| 101 |
+
tokens = ner_pipeline.tokenizer.tokenize(sent["text"])
|
| 102 |
+
sentence_token_counts.append(len(tokens))
|
| 103 |
+
|
| 104 |
+
# Chunking with cached token counts
|
| 105 |
max_tokens = 450
|
| 106 |
chunks = []
|
|
|
|
| 107 |
i = 0
|
| 108 |
+
|
| 109 |
while i < len(sentences):
|
| 110 |
chunk_sents = []
|
| 111 |
+
token_count = 0
|
| 112 |
|
| 113 |
for j in range(i, len(sentences)):
|
| 114 |
+
sent_token_count = sentence_token_counts[j]
|
|
|
|
| 115 |
|
| 116 |
+
# Check if adding this sentence exceeds limit
|
| 117 |
+
if token_count + sent_token_count > max_tokens and chunk_sents:
|
| 118 |
break
|
| 119 |
|
| 120 |
chunk_sents.append(sentences[j])
|
| 121 |
+
token_count += sent_token_count
|
| 122 |
|
| 123 |
if chunk_sents:
|
| 124 |
+
chunk_text = " ".join([s["text"] for s in chunk_sents])
|
| 125 |
chunks.append({
|
| 126 |
"text": chunk_text,
|
| 127 |
"offset": chunk_sents[0]["start"],
|
|
|
|
| 131 |
sentences_to_skip = max(1, len(chunk_sents) - 2)
|
| 132 |
i += sentences_to_skip
|
| 133 |
|
| 134 |
+
# Predict on chunks (NO CHANGES HERE)
|
| 135 |
all_entities = []
|
| 136 |
|
| 137 |
for chunk in chunks:
|
|
|
|
| 155 |
except Exception as e:
|
| 156 |
print(f"Chunk processing error: {e}")
|
| 157 |
continue
|
| 158 |
+
|
| 159 |
|
| 160 |
# Sort and deduplicate
|
| 161 |
all_entities = sorted(all_entities, key=lambda x: (x['start'], -x['score']))
|