Spaces:

hs-knowledge
/

ner_app

Sleeping

App Files Files Community

finiteautomata committed on Jun 14, 2023

Commit

3f556fb

1 Parent(s): 8739181

EL demo

Browse files

Files changed (1) hide show

app.py +39 -61

app.py CHANGED Viewed

@@ -5,78 +5,56 @@ from datasets import load_dataset
 from annotated_text import annotated_text
 # Load data
-ds = load_dataset("hs-knowledge/hateval_ner")
-ds_2 = load_dataset("hs-knowledge/hateval_ner_2")
 # Show highlighted ner entities in a tweet
 def display_text(example):
     # Use annotated_text to show entities
-    ner_output = example["ner_output"]
     chunks = []
-    current_chunk = ""
-    current_type = None
-    # Check if there are two labels repeated
-    previous_label = None
-    for label in ner_output["labels"]:
-        if label and previous_label and previous_label == label and label != "O" and not label.startswith("I-") and not label.startswith("B-"):
-            pass
-        previous_label = label
-    for token, label in zip(ner_output["tokens"], ner_output["labels"]):
-        if label is None:
-            # Perhaps it is too long
-            continue
-        if label == "O":
-            if current_type is not None:
-                # Add previous entity
-                chunks.append((current_chunk.strip(), current_type))
-                current_chunk = token + " "
-                current_type = None
-            else:
-                current_chunk += token + " "
-                current_type = None
-        elif label.startswith("B-"):
-            if current_chunk:
-                chunks.append((current_chunk.strip(), current_type))
-            current_chunk = token + " "
-            current_type = label[2:]
-        elif label.startswith('I-'):
-            current_chunk += token + " "
-            current_type = label[2:]
-        else:
-            # It doesn't start with B- or I- => add single token
-            if label != current_type:
-                chunks.append((current_chunk.strip(), current_type))
-                current_chunk = token + " "
-                current_type = label
-            else:
-                current_chunk += token + " "
-                current_type = label
-    if current_chunk:
-        chunks.append((current_chunk.strip(), current_type))
-    # remove nones
     chunks = [(c, t) if t is not None else c for c, t in chunks]
     annotated_text(*chunks)
-# Get first 1000 examples
-elements = random.choices(range(len(ds["train"])), k=300)
 ds["train"] = ds["train"].select(elements)
-ds_2["train"] = ds_2["train"].select(elements)
-for ex1, ex2 in zip(ds["train"], ds_2["train"]):
-    st.write("====================================")
-    st.write("NER model: robertuito", "\n")
-    display_text(ex1)
-    st.write("NER model: roberta-large", "\n")
-    display_text(ex2)
-    st.write("\n")
-    st.write(f"Original text: {ex1['text']}")

 from annotated_text import annotated_text
 # Load data
+ds = load_dataset("hs-knowledge/hateval_enriched")
 # Show highlighted ner entities in a tweet
 def display_text(example):
     """Render a tweet with its entities highlighted through ``annotated_text``.

     Args:
         example: Mapping with a ``"text"`` string and an ``"entities"`` list.
             Each entity is read for ``"start"``, ``"end"``, ``"text"`` and
             ``"type"``. The offsets are assumed to be character indices into
             ``example["text"]`` -- TODO confirm against the dataset schema.

     Returns:
         None. The highlighted text is emitted by calling ``annotated_text``.
     """
     text = example["text"]
     # Process entities left to right so the plain text between them can be
     # sliced out of the tweet in order.
     entities = sorted(example["entities"], key=lambda ent: ent["start"])
     if not entities:
         annotated_text(text)
         return

     chunks = []
     last_index = 0
     for entity in entities:
         start, end = entity["start"], entity["end"]
         if last_index < start:
             # Plain (unlabelled) text sitting between two entities.
             chunks.append(text[last_index:start])
         label = entity["type"]
         # An entity without a type is shown as plain text, not as a tuple.
         chunks.append((entity["text"], label) if label is not None else entity["text"])
         # Never move backwards: an entity overlapping or nested inside an
         # earlier one must not cause already-emitted text to be emitted again.
         last_index = max(last_index, end)

     # Emit the text that follows the last entity; without this the tail of
     # the tweet would be silently dropped.
     if last_index < len(text):
         chunks.append(text[last_index:])
     annotated_text(*chunks)
# Show a random sample of at most 50 distinct training examples.
# random.sample draws without replacement, so no tweet is displayed twice,
# and the size is capped so small datasets do not raise ValueError.
train_split = ds["train"]
sample_size = min(50, len(train_split))
elements = random.sample(range(len(train_split)), k=sample_size)
ds["train"] = train_split.select(elements)

for ex in ds["train"]:
    st.write("=" * 80)
    # display_text(ex)
    st.write(ex["text"])
    for ent in ex["entities"]:
        entity_name = ent["text"]
        entity_type = ent["type"]
        # The knowledge-graph lookup may be absent or only partially filled
        # (presumably None when no match was found -- TODO confirm), so walk
        # the nested structure defensively instead of chaining subscripts.
        kg_result = ent["kg_result"] or {}
        detailed_description = kg_result.get("detailedDescription") or {}
        entity_description = detailed_description.get("articleBody")

        parts = [(entity_name, "entity"), (f"({entity_type})", "type")]
        if entity_description:
            parts.append(entity_description)
        annotated_text(*parts)