ArabovMK committed on
Commit
e55178d
·
verified ·
1 Parent(s): 6b19291

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +227 -0
app.py CHANGED
@@ -0,0 +1,227 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import html
import zlib

import pandas as pd
import streamlit as st
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification

# ----------------------------------------------------------------------
# Page configuration
# ----------------------------------------------------------------------
# Configure the browser tab (title/icon) and the overall layout.
# The Streamlit API for this is `st.set_page_config`; the previous call to
# `st.set_page_page` does not exist and raised AttributeError at startup.
st.set_page_config(
    page_title="Tatar Morphological Analyzer",
    page_icon="🔤",
    layout="wide",
    initial_sidebar_state="expanded",
)

# ----------------------------------------------------------------------
# Header and description
# ----------------------------------------------------------------------
st.title("🔤 Tatar Morphological Analyzer")
st.markdown(
    """
    Interactive demo of models for morphological analysis of the Tatar language,
    developed by the [TatarNLPWorld](https://huggingface.co/TatarNLPWorld) community.
    Choose a model, enter a Tatar sentence, and get token‑level predictions with full
    morphological tags.
    """
)

# ----------------------------------------------------------------------
# Sidebar: model selection and performance info
# ----------------------------------------------------------------------
# Label shown in the selectbox -> Hugging Face Hub repository ID.
MODEL_OPTIONS = {
    "mBERT (multilingual BERT)": "TatarNLPWorld/tatar-morph-mbert",
    "RuBERT (Russian BERT)": "TatarNLPWorld/tatar-morph-rubert",
    "DistilBERT (multilingual)": "TatarNLPWorld/tatar-morph-distilbert",
    "XLM-RoBERTa (base)": "TatarNLPWorld/tatar-morph-xlmr",
    "Turkish BERT": "TatarNLPWorld/tatar-morph-turkish-bert",
}

# Metrics keyed by Hub ID. They are hard-coded from the experiment
# (they could also be loaded from a file).
model_info = {
    "TatarNLPWorld/tatar-morph-mbert": {
        "accuracy": 0.9868,
        "f1_micro": 0.9868,
        "f1_macro": 0.5094,
        "description": "Best overall accuracy.",
    },
    "TatarNLPWorld/tatar-morph-rubert": {
        "accuracy": 0.9813,
        "f1_micro": 0.9813,
        "f1_macro": 0.4737,
        "description": "Excellent performance due to Russian–Tatar language proximity.",
    },
    "TatarNLPWorld/tatar-morph-distilbert": {
        "accuracy": 0.9798,
        "f1_micro": 0.9798,
        "f1_macro": 0.4402,
        "description": "Lightweight and fast, almost no quality loss.",
    },
    "TatarNLPWorld/tatar-morph-xlmr": {
        "accuracy": 0.9767,
        "f1_micro": 0.9767,
        "f1_macro": 0.4061,
        "description": "Powerful multilingual model.",
    },
    "TatarNLPWorld/tatar-morph-turkish-bert": {
        "accuracy": 0.8684,
        "f1_micro": 0.8684,
        "f1_macro": 0.3334,
        "description": "Solid baseline thanks to Turkic language relatedness.",
    },
}

with st.sidebar:
    st.header("⚙️ Model Settings")

    # The first entry of MODEL_OPTIONS is pre-selected.
    selected_model_name = st.selectbox(
        "Select model for analysis:",
        list(MODEL_OPTIONS),
        index=0,
    )
    model_id = MODEL_OPTIONS[selected_model_name]
    info = model_info[model_id]

    st.markdown("---")
    st.subheader("📊 Model Metrics (test set)")
    # Two columns: accuracy and micro-F1 on the left, macro-F1 on the right.
    col1, col2 = st.columns(2)
    col1.metric("Token Accuracy", f"{info['accuracy']:.2%}")
    col1.metric("F1 (micro)", f"{info['f1_micro']:.2%}")
    col2.metric("F1 (macro)", f"{info['f1_macro']:.2%}")
    st.caption(info["description"])

    st.markdown("---")
    # The model link points at the currently selected Hub repository.
    st.markdown(
        f"""
        **Links:**
        - [Model repository](https://huggingface.co/{model_id})
        - [Dataset](https://huggingface.co/datasets/TatarNLPWorld/tatar-morphological-corpus)
        - [TatarNLPWorld organization](https://huggingface.co/TatarNLPWorld)
        """
    )

# ----------------------------------------------------------------------
# Cache model loading (so it's not reloaded on every interaction)
# ----------------------------------------------------------------------
@st.cache_resource(show_spinner="Loading model... (may take up to a minute)")
def load_model(model_id: str):
    """Build a token-classification pipeline for the given Hub model.

    Args:
        model_id: Hugging Face Hub repository ID of the model to load.

    Returns:
        A ``transformers`` token-classification pipeline created with
        ``aggregation_strategy="simple"``. It is placed on GPU 0 when CUDA
        is available and on the CPU otherwise.

    Note:
        With an aggregation strategy other than ``"none"``, the pipeline
        reports the predicted label under the key ``"entity_group"`` rather
        than ``"entity"``.
        NOTE(review): "simple" groups adjacent tokens by predicted tag, so
        it may not yield exactly one row per word - confirm this is the
        intended behaviour for morphological tags.
    """
    hub_tokenizer = AutoTokenizer.from_pretrained(model_id)
    hub_model = AutoModelForTokenClassification.from_pretrained(model_id)

    # The pipeline API takes a CUDA device index, or -1 for CPU.
    if torch.cuda.is_available():
        device_index = 0
    else:
        device_index = -1

    return pipeline(
        "token-classification",
        model=hub_model,
        tokenizer=hub_tokenizer,
        aggregation_strategy="simple",
        device=device_index,
    )

# ----------------------------------------------------------------------
# Main area: text input and analysis button
# ----------------------------------------------------------------------
# Session-state key that backs the text area. The example buttons write to
# this key so that the chosen sentence actually appears in the text box and
# survives the rerun triggered by a later click on "Analyze".
_INPUT_KEY = "input_text"

# Button label -> example sentence inserted into the text area.
_EXAMPLE_SENTENCES = {
    "Simple sentence": "Min tatarça söyläşäm.",
    "Complex sentence": "Kiçä min duslarım belän parkka bardım.",
    "Definition": "Tatarstan – Rossiya Federatsiäse sostavındağı respublika.",
}


def _use_example(sentence: str) -> None:
    """Put ``sentence`` into the text area (used as a button callback).

    Callbacks run before the script is re-executed, which is the point at
    which Streamlit allows the value of a keyed widget to be changed.
    """
    st.session_state[_INPUT_KEY] = sentence


# Seed the default text once per session. The widget below is keyed, so its
# initial value comes from session state rather than from a `value=` argument.
if _INPUT_KEY not in st.session_state:
    st.session_state[_INPUT_KEY] = "Min tatarça söyläşäm."

col_input, col_examples = st.columns([3, 1])

with col_input:
    input_text = st.text_area(
        "✏️ Enter a Tatar sentence:",
        key=_INPUT_KEY,
        height=100,
        placeholder="Example: Kiçä min duslarım belän parkka bardım.",
    )
    analyze_clicked = st.button("🔍 Analyze", type="primary", use_container_width=True)

with col_examples:
    st.markdown("##### 📋 Examples")
    # Previously these buttons only rebound the local `input_text` for a
    # single rerun in which "Analyze" could not also be pressed, so they had
    # no visible effect. They now update the text area itself.
    for example_label, example_sentence in _EXAMPLE_SENTENCES.items():
        st.button(example_label, on_click=_use_example, args=(example_sentence,))

# ----------------------------------------------------------------------
# Perform analysis when button is clicked
# ----------------------------------------------------------------------
def _tag_color(tag: str) -> str:
    """Return a pastel HSL colour derived from ``tag``.

    CRC32 is used instead of the built-in ``hash`` because string hashes are
    salted per interpreter process, which made the colours change on every
    application restart.
    """
    hue = zlib.crc32(tag.encode("utf-8")) % 360
    return f"hsl({hue}, 70%, 80%)"


def _badge_html(word: str, tag: str) -> str:
    """Return the HTML for one badge showing ``word`` above its ``tag``.

    Both values are HTML-escaped: the word comes from user input and the
    result is rendered with ``unsafe_allow_html=True``.
    """
    return (
        f"<span style='background-color: {_tag_color(tag)}; padding: 0.3rem 0.6rem; "
        "margin: 0.2rem; border-radius: 12px; display: inline-block; font-size: 1rem;'>"
        f"{html.escape(word)}<br><small>{html.escape(tag)}</small></span>"
    )


if analyze_clicked and not input_text.strip():
    st.warning("Please enter some text to analyze.")
elif analyze_clicked:
    # Top-level UI boundary: any failure (download, inference, rendering) is
    # reported in the page instead of crashing the script run.
    try:
        with st.spinner("Analyzing..."):
            nlp = load_model(model_id)
            results = nlp(input_text)

        if not results:
            st.warning("No results returned. The sentence may be too short or contain unrecognized characters.")
        else:
            # Convert to DataFrame for better display
            df = pd.DataFrame(results)

            # A pipeline that aggregates tokens reports the label under
            # "entity_group"; without aggregation the key is "entity".
            # Accept either so the table does not fail with a KeyError.
            tag_column = "entity_group" if "entity_group" in df.columns else "entity"
            df = df.rename(columns={
                "word": "Word",
                tag_column: "Morphological Tag",
                "score": "Confidence",
                "start": "Start",
                "end": "End",
            })
            df["Confidence"] = df["Confidence"].map("{:.3f}".format)

            st.subheader("📋 Analysis Results")
            st.dataframe(df[["Word", "Morphological Tag", "Confidence"]], use_container_width=True)

            # Visualize as colored badges
            st.subheader("🏷️ Tag Visualization")
            html_spans = [
                _badge_html(str(word), str(tag))
                for word, tag in zip(df["Word"], df["Morphological Tag"])
            ]
            st.markdown(
                f"<div style='display: flex; flex-wrap: wrap; gap: 0.5rem;'>{' '.join(html_spans)}</div>",
                unsafe_allow_html=True,
            )

    except Exception as e:
        st.error(f"❌ An error occurred during analysis: {e}")
        st.exception(e)  # for debugging; you may remove it in production

# ----------------------------------------------------------------------
# Information about tags
# ----------------------------------------------------------------------
# Collapsible help section describing the tag format, with a few examples.
_tag_help = st.expander("ℹ️ About morphological tags")
_tag_help.markdown("""
    The models predict **full morphological tags** in the format used in the
    [TatarNLPWorld/tatar-morphological-corpus](https://huggingface.co/datasets/TatarNLPWorld/tatar-morphological-corpus).
    Tags are sequences of grammatical features separated by `+`.

    **Examples:**
    - `N+Sg+Nom` — noun, singular, nominative case
    - `V+Past+3` — verb, past tense, 3rd person
    - `PUNCT` — punctuation
    - `Adj` — adjective without additional features

    The complete list of tags is available in the `tag2id.json` file inside each model repository.
    """)

# ----------------------------------------------------------------------
# Footer
# ----------------------------------------------------------------------
# Centered grey credit line. It is raw HTML, so unsafe_allow_html is needed.
_FOOTER_HTML = """
    <div style='text-align: center; color: gray;'>
    Developed by <a href='https://huggingface.co/ArabovMK'>Arabov Mullosharaf Kurbonovich</a>
    for the <a href='https://huggingface.co/TatarNLPWorld'>TatarNLPWorld</a> community.
    </div>
    """

st.markdown("---")
st.markdown(_FOOTER_HTML, unsafe_allow_html=True)