Statistical-Impossibility committed on
Commit
3ebde54
·
verified ·
1 Parent(s): 36e71f4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -8
app.py CHANGED
@@ -95,26 +95,33 @@ def ner_predict(text):
95
  if not sentences:
96
  return "<p>No sentences detected</p>", ""
97
 
98
- # Chunking with overlap
 
 
 
 
 
 
99
  max_tokens = 450
100
  chunks = []
101
-
102
  i = 0
 
103
  while i < len(sentences):
104
  chunk_sents = []
105
- chunk_text = ""
106
 
107
  for j in range(i, len(sentences)):
108
- candidate = chunk_text + " " + sentences[j]["text"] if chunk_text else sentences[j]["text"]
109
- tokens = ner_pipeline.tokenizer.tokenize(candidate)
110
 
111
- if len(tokens) > max_tokens and chunk_sents:
 
112
  break
113
 
114
  chunk_sents.append(sentences[j])
115
- chunk_text = candidate
116
 
117
  if chunk_sents:
 
118
  chunks.append({
119
  "text": chunk_text,
120
  "offset": chunk_sents[0]["start"],
@@ -124,7 +131,7 @@ def ner_predict(text):
124
  sentences_to_skip = max(1, len(chunk_sents) - 2)
125
  i += sentences_to_skip
126
 
127
- # Predict on chunks
128
  all_entities = []
129
 
130
  for chunk in chunks:
@@ -148,6 +155,7 @@ def ner_predict(text):
148
  except Exception as e:
149
  print(f"Chunk processing error: {e}")
150
  continue
 
151
 
152
  # Sort and deduplicate
153
  all_entities = sorted(all_entities, key=lambda x: (x['start'], -x['score']))
 
95
  if not sentences:
96
  return "<p>No sentences detected</p>", ""
97
 
98
+ # Pre-tokenize sentences ONCE (cache token counts)
99
+ sentence_token_counts = []
100
+ for sent in sentences:
101
+ tokens = ner_pipeline.tokenizer.tokenize(sent["text"])
102
+ sentence_token_counts.append(len(tokens))
103
+
104
+ # Chunking with cached token counts
105
  max_tokens = 450
106
  chunks = []
 
107
  i = 0
108
+
109
  while i < len(sentences):
110
  chunk_sents = []
111
+ token_count = 0
112
 
113
  for j in range(i, len(sentences)):
114
+ sent_token_count = sentence_token_counts[j]
 
115
 
116
+ # Check if adding this sentence exceeds limit
117
+ if token_count + sent_token_count > max_tokens and chunk_sents:
118
  break
119
 
120
  chunk_sents.append(sentences[j])
121
+ token_count += sent_token_count
122
 
123
  if chunk_sents:
124
+ chunk_text = " ".join([s["text"] for s in chunk_sents])
125
  chunks.append({
126
  "text": chunk_text,
127
  "offset": chunk_sents[0]["start"],
 
131
  sentences_to_skip = max(1, len(chunk_sents) - 2)
132
  i += sentences_to_skip
133
 
134
+ # Predict on chunks (NO CHANGES HERE)
135
  all_entities = []
136
 
137
  for chunk in chunks:
 
155
  except Exception as e:
156
  print(f"Chunk processing error: {e}")
157
  continue
158
+
159
 
160
  # Sort and deduplicate
161
  all_entities = sorted(all_entities, key=lambda x: (x['start'], -x['score']))