Spaces:

TatarNLPWorld
/

TatarMorphAnalyzer

Runtime error

App Files Files Community

ArabovMK committed on Mar 19

Commit

e55178d

verified ·

1 Parent(s): 6b19291

Update app.py

Browse files

Files changed (1) hide show

app.py +227 -0

app.py CHANGED Viewed

	@@ -0,0 +1,227 @@

+import html
+import zlib
+
+import pandas as pd
+import streamlit as st
+import torch
+from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
+# ----------------------------------------------------------------------
+# Page configuration
+# ----------------------------------------------------------------------
+st.set_page_page(
+    page_title="Tatar Morphological Analyzer",
+    page_icon="🔤",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# ----------------------------------------------------------------------
+# Header and description
+# ----------------------------------------------------------------------
+st.title("🔤 Tatar Morphological Analyzer")
+st.markdown(
+    """
+    Interactive demo of models for morphological analysis of the Tatar language,
+    developed by the [TatarNLPWorld](https://huggingface.co/TatarNLPWorld) community.
+    Choose a model, enter a Tatar sentence, and get token‑level predictions with full
+    morphological tags.
+    """
+)
+# ----------------------------------------------------------------------
+# Sidebar: model selection and performance info
+# ----------------------------------------------------------------------
+with st.sidebar:
+    st.header("⚙️ Model Settings")
+    # Available models: display name -> Hugging Face Hub ID
+    MODEL_OPTIONS = {
+        "mBERT (multilingual BERT)": "TatarNLPWorld/tatar-morph-mbert",
+        "RuBERT (Russian BERT)": "TatarNLPWorld/tatar-morph-rubert",
+        "DistilBERT (multilingual)": "TatarNLPWorld/tatar-morph-distilbert",
+        "XLM-RoBERTa (base)": "TatarNLPWorld/tatar-morph-xlmr",
+        "Turkish BERT": "TatarNLPWorld/tatar-morph-turkish-bert",
+    }
+    selected_model_name = st.selectbox(
+        "Select model for analysis:",
+        list(MODEL_OPTIONS.keys()),
+        index=0
+    )
+    model_id = MODEL_OPTIONS[selected_model_name]
+    # Hard-coded metrics from the experiment (you can also load them from a file)
+    model_info = {
+        "TatarNLPWorld/tatar-morph-mbert": {
+            "accuracy": 0.9868,
+            "f1_micro": 0.9868,
+            "f1_macro": 0.5094,
+            "description": "Best overall accuracy.",
+        },
+        "TatarNLPWorld/tatar-morph-rubert": {
+            "accuracy": 0.9813,
+            "f1_micro": 0.9813,
+            "f1_macro": 0.4737,
+            "description": "Excellent performance due to Russian–Tatar language proximity.",
+        },
+        "TatarNLPWorld/tatar-morph-distilbert": {
+            "accuracy": 0.9798,
+            "f1_micro": 0.9798,
+            "f1_macro": 0.4402,
+            "description": "Lightweight and fast, almost no quality loss.",
+        },
+        "TatarNLPWorld/tatar-morph-xlmr": {
+            "accuracy": 0.9767,
+            "f1_micro": 0.9767,
+            "f1_macro": 0.4061,
+            "description": "Powerful multilingual model.",
+        },
+        "TatarNLPWorld/tatar-morph-turkish-bert": {
+            "accuracy": 0.8684,
+            "f1_micro": 0.8684,
+            "f1_macro": 0.3334,
+            "description": "Solid baseline thanks to Turkic language relatedness.",
+        },
+    }
+    info = model_info[model_id]
+    st.markdown("---")
+    st.subheader("📊 Model Metrics (test set)")
+    col1, col2 = st.columns(2)
+    with col1:
+        st.metric("Token Accuracy", f"{info['accuracy']:.2%}")
+        st.metric("F1 (micro)", f"{info['f1_micro']:.2%}")
+    with col2:
+        st.metric("F1 (macro)", f"{info['f1_macro']:.2%}")
+    st.caption(info["description"])
+    st.markdown("---")
+    st.markdown(
+        """
+        **Links:**
+        - [Model repository](https://huggingface.co/{})
+        - [Dataset](https://huggingface.co/datasets/TatarNLPWorld/tatar-morphological-corpus)
+        - [TatarNLPWorld organization](https://huggingface.co/TatarNLPWorld)
+        """.format(model_id)
+    )
+# ----------------------------------------------------------------------
+# Cache model loading (so it's not reloaded on every interaction)
+# ----------------------------------------------------------------------
+@st.cache_resource(show_spinner="Loading model... (may take up to a minute)")
+def load_model(model_id: str):
+    """Load tokenizer, model, and return a token-classification pipeline."""
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    model = AutoModelForTokenClassification.from_pretrained(model_id)
+    # Use pipeline with aggregation_strategy="simple" to merge subwords into words
+    nlp = pipeline(
+        "token-classification",
+        model=model,
+        tokenizer=tokenizer,
+        aggregation_strategy="simple",   # merges subword tokens
+        device=0 if torch.cuda.is_available() else -1
+    )
+    return nlp
+# ----------------------------------------------------------------------
+# Main area: text input and analysis button
+# ----------------------------------------------------------------------
+col_input, col_examples = st.columns([3, 1])
+with col_input:
+    input_text = st.text_area(
+        "✏️ Enter a Tatar sentence:",
+        value="Min tatarça söyläşäm.",
+        height=100,
+        placeholder="Example: Kiçä min duslarım belän parkka bardım."
+    )
+    analyze_clicked = st.button("🔍 Analyze", type="primary", use_container_width=True)
+with col_examples:
+    st.markdown("##### 📋 Examples")
+    if st.button("Simple sentence"):
+        input_text = "Min tatarça söyläşäm."
+    if st.button("Complex sentence"):
+        input_text = "Kiçä min duslarım belän parkka bardım."
+    if st.button("Definition"):
+        input_text = "Tatarstan – Rossiya Federatsiäse sostavındağı respublika."
+# ----------------------------------------------------------------------
+# Perform analysis when button is clicked
+# ----------------------------------------------------------------------
+if analyze_clicked and input_text.strip():
+    try:
+        with st.spinner("Analyzing..."):
+            nlp = load_model(model_id)
+            results = nlp(input_text)
+        if not results:
+            st.warning("No results returned. The sentence may be too short or contain unrecognized characters.")
+        else:
+            # Convert to DataFrame for better display
+            df = pd.DataFrame(results)
+            # Rename columns for readability
+            df.rename(columns={
+                "word": "Word",
+                "entity": "Morphological Tag",
+                "score": "Confidence",
+                "start": "Start",
+                "end": "End"
+            }, inplace=True)
+            df["Confidence"] = df["Confidence"].apply(lambda x: f"{x:.3f}")
+            st.subheader("📋 Analysis Results")
+            st.dataframe(df[["Word", "Morphological Tag", "Confidence"]], use_container_width=True)
+            # Visualize as colored badges
+            st.subheader("🏷️ Tag Visualization")
+            html_spans = []
+            for _, row in df.iterrows():
+                # Generate a color based on the tag (simple hash)
+                tag = row["Morphological Tag"]
+                color = f"hsl({hash(tag) % 360}, 70%, 80%)"
+                span = f"<span style='background-color: {color}; padding: 0.3rem 0.6rem; margin: 0.2rem; border-radius: 12px; display: inline-block; font-size: 1rem;'>{row['Word']}<br><small>{tag}</small></span>"
+                html_spans.append(span)
+            st.markdown(
+                f"<div style='display: flex; flex-wrap: wrap; gap: 0.5rem;'>{' '.join(html_spans)}</div>",
+                unsafe_allow_html=True
+            )
+    except Exception as e:
+        st.error(f"❌ An error occurred during analysis: {e}")
+        st.exception(e)   # for debugging; you may remove it in production
+else:
+    if analyze_clicked and not input_text.strip():
+        st.warning("Please enter some text to analyze.")
+# ----------------------------------------------------------------------
+# Information about tags
+# ----------------------------------------------------------------------
+with st.expander("ℹ️ About morphological tags"):
+    st.markdown("""
+    The models predict **full morphological tags** in the format used in the
+    [TatarNLPWorld/tatar-morphological-corpus](https://huggingface.co/datasets/TatarNLPWorld/tatar-morphological-corpus).
+    Tags are sequences of grammatical features separated by `+`.
+    **Examples:**
+    - `N+Sg+Nom` — noun, singular, nominative case
+    - `V+Past+3` — verb, past tense, 3rd person
+    - `PUNCT` — punctuation
+    - `Adj` — adjective without additional features
+    The complete list of tags is available in the `tag2id.json` file inside each model repository.
+    """)
+# ----------------------------------------------------------------------
+# Footer
+# ----------------------------------------------------------------------
+st.markdown("---")
+st.markdown(
+    """
+    <div style='text-align: center; color: gray;'>
+        Developed by <a href='https://huggingface.co/ArabovMK'>Arabov Mullosharaf Kurbonovich</a>
+        for the <a href='https://huggingface.co/TatarNLPWorld'>TatarNLPWorld</a> community.
+    </div>
+    """,
+    unsafe_allow_html=True
+)