WpnSta committed on
Commit
d55d8da
·
0 Parent(s):

initial commit

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +137 -0
  3. readme.md +1 -0
  4. requirements.txt +3 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ venv/
app.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import tempfile
3
+
4
+ import gradio as gr
5
+ from transformers import pipeline
6
+
7
# Identifier of the token-classification checkpoint handed to `pipeline`.
MODEL_NAME = "WpnSta/lner-xlm-roberta"

# Built once at import time and shared by every request handler in this file.
# The rest of the file reads "entity_group", "start" and "end" from each
# result, which is the shape produced with an aggregation strategy set.
ner_pipeline = pipeline(
    task="token-classification", model=MODEL_NAME, aggregation_strategy="simple"
)
14
+
15
def build_csv(text, entities):
    """Write a CSV file mapping every word of *text* to an entity label.

    The text is split on whitespace (as defined by ``str.isspace``), so
    punctuation stays attached to the word it touches. Each word receives the
    ``entity_group`` of the first entry in *entities* whose ``[start, end)``
    character span overlaps the word, or ``"O"`` when no entity overlaps it.

    Args:
        text: The analysed text.
        entities: List of mappings providing ``"start"`` and ``"end"``
            character offsets into *text* and an ``"entity_group"`` label.

    Returns:
        The path of a temporary UTF-8 ``.csv`` file with a ``Word,Label``
        header row followed by one row per word. The file is created with
        ``delete=False`` and is therefore left on disk for the caller.
    """
    # Split text into words while tracking character positions.
    words = []
    length = len(text)
    pos = 0
    while pos < length:
        if text[pos].isspace():
            pos += 1
            continue
        stop = pos
        while stop < length and not text[stop].isspace():
            stop += 1
        words.append((text[pos:stop], pos, stop))
        pos = stop

    # A word fully contained in an entity span also overlaps it, so a single
    # half-open interval overlap test covers both containment and partial
    # overlap; the first matching entity wins.
    rows = [
        (
            word,
            next(
                (
                    ent["entity_group"]
                    for ent in entities
                    if start < ent["end"] and end > ent["start"]
                ),
                "O",
            ),
        )
        for word, start, end in words
    ]

    # The context manager closes the handle even if writing fails;
    # delete=False keeps the file on disk so its path can be handed back.
    with tempfile.NamedTemporaryFile(
        mode="w", suffix=".csv", delete=False, newline="", encoding="utf-8"
    ) as tmp:
        writer = csv.writer(tmp)
        writer.writerow(["Word", "Label"])
        writer.writerows(rows)
    return tmp.name
52
+
53
+
54
def run_ner(text: str):
    """Run the NER pipeline on *text* and build the two UI updates.

    Returns a pair of ``gr.update`` objects: the first carries the
    highlighted-text value, the second the path of the generated CSV.
    When *text* is empty or only whitespace, both updates hide their
    component and the pipeline is not called.
    """
    if not (text and text.strip()):
        hide_highlight = gr.update(value={"text": "", "entities": []}, visible=False)
        hide_csv = gr.update(value=None, visible=False)
        return hide_highlight, hide_csv

    spans = ner_pipeline(text)

    # Re-key each pipeline result into the dict layout passed to the
    # HighlightedText component.
    highlight_value = {
        "text": text,
        "entities": [
            {
                "entity": span["entity_group"],
                "start": span["start"],
                "end": span["end"],
            }
            for span in spans
        ],
    }
    csv_file = build_csv(text, spans)

    show_highlight = gr.update(value=highlight_value, visible=True)
    show_csv = gr.update(value=csv_file, visible=True)
    return show_highlight, show_csv
72
+
73
+
74
def process_file(file):
    """Read an uploaded text file and run NER on its contents.

    Args:
        file: Path of the uploaded file, or ``None`` when nothing was
            uploaded.

    Returns:
        A pair of ``gr.update`` objects for the highlighted-text component
        and the CSV download component. When no file was given, the file is
        not valid UTF-8, or it contains only whitespace, the first update
        shows an explanatory message and the second hides the download.
    """
    if file is None:
        return gr.update(value={"text": "Please upload a .txt file.", "entities": []}, visible=True), gr.update(value=None, visible=False)
    try:
        # "utf-8-sig" drops a leading byte-order mark if one is present.
        # With plain "utf-8" the BOM stays in the text, glues itself to the
        # first word, and keeps a BOM-only file from counting as empty.
        with open(file, "r", encoding="utf-8-sig") as f:
            text = f.read()
    except UnicodeDecodeError:
        # Report undecodable uploads the same way as the other input
        # problems instead of letting the exception escape the handler.
        return gr.update(value={"text": "The uploaded file could not be read as UTF-8 text.", "entities": []}, visible=True), gr.update(value=None, visible=False)
    if not text.strip():
        return gr.update(value={"text": "The uploaded file is empty.", "entities": []}, visible=True), gr.update(value=None, visible=False)
    return run_ner(text)
82
+
83
+
84
# Highlight colour (hex) for each entity tag, passed to the HighlightedText
# components as their ``color_map``.
COLOR_MAP = dict(
    PER="#4A90D9",   # blue
    GPE="#9B59B6",   # purple
    LOC="#D94A4A",   # red
    ORG="#4AD97A",   # green
    FAC="#D9A34A",   # orange
    VEH="#1ABC9C",   # teal
    TIME="#E74C8B",  # pink
)
93
+
94
+
95
# Visual theme for the app: the default Gradio theme with custom hues and
# the "Raleway" Google font.
_theme_font = gr.themes.GoogleFont("Raleway")
theme = gr.themes.Default(
    primary_hue="stone", secondary_hue="neutral", neutral_hue="gray", font=_theme_font
)
101
+
102
# Sample passages offered under the textbox; each inner list is one example
# row for the single ``text_input`` component.
_EXAMPLE_TEXTS = [
    ["Although they had but that moment left the school behind them, they were now in the busy thoroughfares of a city, where shadowy passengers passed and re-passed; where shadowy carts and coaches battled for the way, and all the strife and tumult of a real city were. It was made plain enough, by the dressing of the shops, that here, too, it was Christmas-time again; but it was evening, and the streets were lighted up. The Ghost stopped at a certain warehouse door, and asked Scrooge if he knew it."],
    ["Les cabines roulantes, attelées d'un cheval, remontaient aussi; et sur les planches de la promenade, qui borde la plage d'un bout à l'autre, c'était maintenant une coulée continue, épaisse et lente, de foule élégante, formant deux courants contraires qui se coudoyaient et se mêlaient. Pierre, nerveux, exaspéré par ce frôlement, s'enfuit, s'enfonça dans la ville et s'arrêta pour déjeuner chez un simple marchand de vins, à l'entrée des champs."],
    ["Detto fatto traversarono la città, e, usciti fuori delle mura, si fermarono in un campo solitario che, su per giù, somigliava a tutti gli altri campi. Pinocchio è derubato delle sue monete d'oro, e per gastigo si busca quattro mesi di prigione. Il burattino, ritornato in città, cominciò a contare i minuti a uno a uno: e quando gli parve che fosse l'ora, riprese subito la strada che menava al Campo dei miracoli."],
]
_EXAMPLE_LABELS = [
    "English - A Christmas Carol, Charles Dickens",
    "French - Pierre et Jean, Guy de Maupassant",
    "Italian - Le avventure di Pinocchio, Carlo Collodi",
]

# Both tabs show their results with identically configured components.
_highlight_options = dict(
    label="Detected Entities", color_map=COLOR_MAP, show_legend=True, visible=False
)

# NOTE(review): the nesting below was reconstructed from a flattened diff
# view; the entity-type legend is assumed to sit under the tabs at the top
# level of the Blocks layout -- confirm against the original file.
with gr.Blocks(title="NER Literary Texts", theme=theme) as demo:
    gr.Markdown("# Named Entity Recognition for Literary Texts")
    gr.Markdown(
        "Detect persons, places, organizations, and more in **English**, **French**, and **Italian** text. "
        "The texts will be analysed using a fine-tuned XLM-RoBERTa model that was trained with literary texts in these languages dating from the 19th to the 20th century. "
        "For more technical information see https://github.com/WpnSta/CAS_Mod4_NER."
    )

    with gr.Tabs():
        # Tab 1: free text typed or pasted by the user.
        with gr.Tab("Text Input"):
            text_input = gr.Textbox(label="Enter or paste your text", lines=5)
            gr.Examples(
                examples=_EXAMPLE_TEXTS,
                example_labels=_EXAMPLE_LABELS,
                inputs=text_input,
            )
            text_button = gr.Button("Analyze", variant="primary")
            text_output = gr.HighlightedText(**_highlight_options)
            text_csv = gr.File(label="Download CSV", visible=False)
            text_button.click(
                fn=run_ner,
                inputs=text_input,
                outputs=[text_output, text_csv],
            )

        # Tab 2: text read from an uploaded .txt file.
        with gr.Tab("File Upload"):
            file_input = gr.File(
                label="Upload a .txt file", file_types=[".txt"], type="filepath"
            )
            file_button = gr.Button("Analyze File", variant="primary")
            file_output = gr.HighlightedText(**_highlight_options)
            file_csv = gr.File(label="Download CSV", visible=False)
            file_button.click(
                fn=process_file,
                inputs=file_input,
                outputs=[file_output, file_csv],
            )

    # Legend explaining the entity tags used in the highlights and the CSV.
    gr.Markdown(
        "### Entity Types\n"
        "| Tag | Meaning |\n"
        "|---|---|\n"
        "| PER | Person | \n"
        "| GPE | Geo-political entity |\n"
        "| LOC | Location |\n"
        "| ORG | Organization |\n"
        "| FAC | Facility |\n"
        "| VEH | Vehicle |\n"
        "| TIME | Temporal expression |"
    )

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()
readme.md ADDED
@@ -0,0 +1 @@
 
 
1
+ # Multilingual NER tagger for literary texts
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ transformers
2
+ torch
3
+ gradio