File size: 6,731 Bytes
d55d8da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d7be9c6
d55d8da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c477151
d55d8da
 
d7be9c6
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import csv
import tempfile

import gradio as gr
from transformers import pipeline

# Model identifier handed to the Transformers pipeline below.
MODEL_NAME = "WpnSta/lner-xlm-roberta"

# Shared token-classification pipeline, built once when the module is imported
# and reused for every request. The rest of this file reads the
# "entity_group", "start" and "end" keys from each result it returns.
ner_pipeline = pipeline(
    "token-classification",
    model=MODEL_NAME,
    aggregation_strategy="simple",
)

def _word_spans(text):
    """Yield ``(word, start, end)`` for every whitespace-delimited run in *text*.

    ``start``/``end`` are character offsets into *text* (end exclusive), so
    they can be compared directly with entity offsets.
    """
    begin = None
    for idx, char in enumerate(text):
        if char.isspace():
            if begin is not None:
                yield text[begin:idx], begin, idx
                begin = None
        elif begin is None:
            begin = idx
    # Flush the trailing word when the text does not end in whitespace.
    if begin is not None:
        yield text[begin:], begin, len(text)


def build_csv(text, entities):
    """Write a two-column CSV labelling every word of *text* and return its path.

    Args:
        text: The analysed text. Words are maximal runs of non-whitespace
            characters; punctuation attached to a word stays part of it.
        entities: Iterable of dicts with ``"entity_group"``, ``"start"`` and
            ``"end"`` keys (character offsets into *text*, end exclusive).
            Entities whose offsets are missing or ``None`` are ignored.

    Returns:
        Path of a newly created temporary ``.csv`` file (UTF-8) with a
        ``Word,Label`` header followed by one row per word. A word receives
        the ``entity_group`` of the first entity it overlaps, otherwise ``"O"``.
        The file is not deleted automatically; the caller owns it.
    """
    # Offsets can be absent when the tokenizer does not provide them; such
    # entities cannot be aligned with words, so drop them instead of failing
    # on an int/None comparison.
    located = [
        ent for ent in entities
        if ent.get("start") is not None and ent.get("end") is not None
    ]

    rows = []
    for word, start, end in _word_spans(text):
        # Half-open interval overlap. This also covers the case where the word
        # lies completely inside the entity, so no separate containment check
        # is needed.
        label = next(
            (
                ent["entity_group"]
                for ent in located
                if start < ent["end"] and end > ent["start"]
            ),
            "O",
        )
        rows.append((word, label))

    # delete=False keeps the file on disk after the handle is closed so it can
    # be served for download; the context manager guarantees the handle is
    # closed even if writing fails.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as tmp:
        writer = csv.writer(tmp)
        writer.writerow(["Word", "Label"])
        writer.writerows(rows)
    return tmp.name


def run_ner(text: str):
    """Run the NER pipeline on *text* and build the two UI updates.

    Returns a pair of ``gr.update`` objects: the first for the highlighted-text
    view, the second for the CSV download. For empty or whitespace-only input
    both components are hidden; otherwise both are shown, the view carrying the
    text with its entity spans and the download pointing at the CSV produced
    by ``build_csv``.
    """
    if not (text and text.strip()):
        hidden_view = gr.update(value={"text": "", "entities": []}, visible=False)
        hidden_file = gr.update(value=None, visible=False)
        return hidden_view, hidden_file

    detected = ner_pipeline(text)

    # Reshape pipeline results into the span dicts used by the highlight view.
    spans = [
        {
            "entity": item["entity_group"],
            "start": item["start"],
            "end": item["end"],
        }
        for item in detected
    ]

    view_value = {"text": text, "entities": spans}
    csv_file = build_csv(text, detected)
    view_update = gr.update(value=view_value, visible=True)
    file_update = gr.update(value=csv_file, visible=True)
    return view_update, file_update


def process_file(file):
    """Read an uploaded text file and run NER on its contents.

    Args:
        file: Filesystem path of the uploaded file, or ``None`` when nothing
            was uploaded.

    Returns:
        A pair of ``gr.update`` objects (highlighted-text view, CSV download).
        When no file was given, the file is not valid UTF-8, or it contains
        only whitespace, the view shows an explanatory message and the
        download is hidden. Otherwise the result of ``run_ner`` is returned.
    """

    def _message(msg):
        # Show a plain message in the view and hide the CSV download.
        return (
            gr.update(value={"text": msg, "entities": []}, visible=True),
            gr.update(value=None, visible=False),
        )

    if file is None:
        return _message("Please upload a .txt file.")

    try:
        # "utf-8-sig" decodes ordinary UTF-8 and additionally strips a leading
        # byte-order mark, which would otherwise end up glued to the first word.
        with open(file, "r", encoding="utf-8-sig") as f:
            text = f.read()
    except UnicodeDecodeError:
        # Report undecodable uploads to the user instead of raising.
        return _message("The uploaded file could not be read as UTF-8 text.")

    if not text.strip():
        return _message("The uploaded file is empty.")
    return run_ner(text)


# Highlight colour (hex) for each entity tag; passed as `color_map` to both
# HighlightedText components in the UI below. The tags are the ones listed in
# the "Entity Types" table rendered at the bottom of the page.
COLOR_MAP = {
    "PER": "#4A90D9",   # Blue
    "GPE": "#9B59B6",   # Purple
    "LOC": "#D94A4A",   # Red
    "ORG": "#4AD97A",   # Green
    "FAC": "#D9A34A",   # Orange
    "VEH": "#1ABC9C",   # Teal
    "TIME": "#E74C8B",  # Pink
}


# Visual theme for the app: Gradio's default theme with custom hues and the
# Raleway Google font. It is handed to demo.launch() in the __main__ guard.
theme = gr.themes.Default(
    primary_hue="stone",
    secondary_hue="neutral",
    neutral_hue="gray",
    font=gr.themes.GoogleFont("Raleway"),
)

# UI definition. Components are declared in the order they should appear:
# a title and description, two tabs (pasted text / uploaded file), and a
# reference table of entity tags.
with gr.Blocks(title="NER Literary Texts") as demo:
    gr.Markdown("# Named Entity Recognition for Literary Texts")
    gr.Markdown("Detect persons, places, organizations, and more in **English**, **French**, and **Italian** text. The texts will be analysed using a fine-tuned XLM-RoBERTa model that was trained with literary texts in these languages dating from the 19th to the 20th century. For more technical information see https://github.com/WpnSta/CAS_Mod4_NER.")

    with gr.Tabs():
        # Tab 1: analyse text typed or pasted into a textbox.
        with gr.Tab("Text Input"):
            text_input = gr.Textbox(label="Enter or paste your text", lines=5)
            # One sample passage per language; the entries of `example_labels`
            # line up one-to-one with `examples`, and the examples are wired
            # to the textbox via `inputs`.
            gr.Examples(
                examples=[
                    ["Although they had but that moment left the school behind them, they were now in the busy thoroughfares of a city, where shadowy passengers passed and re-passed; where shadowy carts and coaches battled for the way, and all the strife and tumult of a real city were. It was made plain enough, by the dressing of the shops, that here, too, it was Christmas-time again; but it was evening, and the streets were lighted up. The Ghost stopped at a certain warehouse door, and asked Scrooge if he knew it."],
                    ["Les cabines roulantes, attelées d'un cheval, remontaient aussi; et sur les planches de la promenade, qui borde la plage d'un bout à l'autre, c'était maintenant une coulée continue, épaisse et lente, de foule élégante, formant deux courants contraires qui se coudoyaient et se mêlaient. Pierre, nerveux, exaspéré par ce frôlement, s'enfuit, s'enfonça dans la ville et s'arrêta pour déjeuner chez un simple marchand de vins, à l'entrée des champs."],
                    ["Detto fatto traversarono la città, e, usciti fuori delle mura, si fermarono in un campo solitario che, su per giù, somigliava a tutti gli altri campi. Pinocchio è derubato delle sue monete d'oro, e per gastigo si busca quattro mesi di prigione. Il burattino, ritornato in città, cominciò a contare i minuti a uno a uno: e quando gli parve che fosse l'ora, riprese subito la strada che menava al Campo dei miracoli."],
                ],
                example_labels=[
                    "English - A Christmas Carol, Charles Dickens",
                    "French - Pierre et Jean, Guy de Maupassant",
                    "Italian - Le avventure di Pinocchio, Carlo Collodi",
                ],
                inputs=text_input,
            )
            text_button = gr.Button("Analyze", variant="primary")
            # Both outputs start hidden; run_ner returns updates that set
            # their visibility together with their values.
            text_output = gr.HighlightedText(label="Detected Entities", color_map=COLOR_MAP, show_legend=True, visible=False)
            text_csv = gr.File(label="Download CSV", visible=False)
            text_button.click(fn=run_ner, inputs=text_input, outputs=[text_output, text_csv])

        # Tab 2: analyse the contents of an uploaded .txt file.
        with gr.Tab("File Upload"):
            # type="filepath" means process_file receives a path string.
            file_input = gr.File(label="Upload a .txt file", file_types=[".txt"], type="filepath")
            file_button = gr.Button("Analyze File", variant="primary")
            file_output = gr.HighlightedText(label="Detected Entities", color_map=COLOR_MAP, show_legend=True, visible=False)
            file_csv = gr.File(label="Download CSV", visible=False)
            file_button.click(fn=process_file, inputs=file_input, outputs=[file_output, file_csv])

    # Legend explaining each entity tag; the tags match the keys of COLOR_MAP.
    gr.Markdown("### Entity Types\n| Tag | Meaning |\n|---|---|\n| PER | Person, character, also animals with active role in the narrative | \n| GPE | Geo-political entity, e.g. London, the city |\n| LOC | Location, e.g. the sea, the beach |\n| ORG | Organization, e.g. the army, the court |\n| FAC | Facility, e.g. the kitchen, the street |\n| VEH | Vehicle, e.g. the coach, the boats |\n| TIME | Temporal or historical reference, e.g. in the morning |")

# Start the web app only when this file is executed as a script, applying the
# theme defined above.
# NOTE(review): ssr_mode=False presumably turns off server-side rendering —
# confirm against the installed Gradio version.
if __name__ == "__main__":
    demo.launch(theme=theme, ssr_mode=False)