Spaces:

UppsalaNLP
/

swedish-causality-detection

Sleeping

File size: 5,226 Bytes

0bcc156

#!/usr/bin/env python3
"""Swedish Causality Detection - HuggingFace Space"""

import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
import pickle
import os

# Module-level model handles. Both start out as None and are assigned by
# load_models(); detect_causality() reads them on every call.
classifier, embedder = None, None

def load_models():
    """Populate the module-level ``embedder`` and ``classifier``.

    The sentence-embedding model is always instantiated. The classifier is
    unpickled from ``causality_classifier.pkl`` when that file exists;
    otherwise a logistic-regression model is fitted on embeddings of the
    ``train`` split of ``UppsalaNLP/swedish-causality-binary`` and then
    pickled to that same path for later runs.
    """
    global classifier, embedder

    # The embedder is needed both for training and for inference.
    embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    cache_file = "causality_classifier.pkl"

    # Fast path: reuse a previously pickled classifier.
    # NOTE(review): pickle.load executes arbitrary code from the file, so this
    # is only safe while the .pkl is produced by this app itself.
    if os.path.exists(cache_file):
        with open(cache_file, 'rb') as fh:
            classifier = pickle.load(fh)
        return

    # Slow path: no cached classifier, so fit one from the dataset.
    print("Training classifier...")
    train_split = load_dataset("UppsalaNLP/swedish-causality-binary")['train']
    sentences = train_split['target_sentence']
    targets = train_split['label']

    # One embedding vector per training sentence.
    features = embedder.encode(sentences, show_progress_bar=True)

    # Fixed random_state keeps the fitted model reproducible across runs.
    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(features, targets)

    # Persist the fitted classifier so the next startup takes the fast path.
    with open(cache_file, 'wb') as fh:
        pickle.dump(classifier, fh)

    print("Classifier trained and saved!")

def detect_causality(text: str) -> dict:
    """Score a Swedish sentence for the presence of a causal relation.

    Args:
        text: Sentence to classify. ``None``, an empty string and
            whitespace-only input are all treated as "nothing to classify".

    Returns:
        A dict with the keys ``"Non-causal"`` and ``"Causal"`` mapped to the
        classifier's predicted probabilities as plain floats. For blank
        input both scores are ``0.0`` and neither model is invoked.
    """
    # Guard against a missing value first: calling .strip() on None would
    # raise AttributeError instead of yielding the "blank input" result.
    if text is None or not text.strip():
        return {"Causal": 0.0, "Non-causal": 0.0}

    # The embedder works on batches, so wrap the single sentence in a list.
    embedding = embedder.encode([text])

    # predict_proba returns one row per input; take the only row.
    probs = classifier.predict_proba(embedding)[0]

    # NOTE(review): assumes column 0 is the non-causal class and column 1 the
    # causal class (i.e. dataset label 1 == causal) - confirm against
    # classifier.classes_ and the dataset card.
    return {
        "Non-causal": float(probs[0]),
        "Causal": float(probs[1])
    }

def analyze_text(text: str) -> tuple:
    """Classify *text* and build the outputs shown in the UI.

    Args:
        text: Sentence entered by the user. ``None``, an empty string and
            whitespace-only input are all treated as missing input.

    Returns:
        A ``(scores, message)`` tuple. ``scores`` is the dict produced by
        ``detect_causality`` (or ``{}`` when there is no input) and
        ``message`` is a Markdown string with the verdict and its confidence
        (or a prompt asking for text when there is no input).
    """
    # Guard against a missing value first: calling .strip() on None would
    # raise AttributeError instead of showing the prompt.
    if text is None or not text.strip():
        return {}, "Please enter some text to analyze."

    scores = detect_causality(text)

    # Strict comparison: an exact tie is reported as "no causal relation".
    is_causal = scores["Causal"] > scores["Non-causal"]
    # Confidence is the probability of whichever class won.
    confidence = max(scores.values())

    if is_causal:
        headline = "**Causal relation detected**"
        explanation = "This sentence appears to express a cause-effect relationship."
    else:
        headline = "**No causal relation detected**"
        explanation = "This sentence does not appear to express a cause-effect relationship."

    result = f"{headline} (confidence: {confidence:.1%})\n\n{explanation}"

    return scores, result

# Sample sentences offered in the UI. Each entry is a one-element row because
# the examples widget is bound to a single input component.
EXAMPLES = [
    [sentence]
    for sentence in (
        "Den lägre produktiviteten kan bero på att kvinnor har kortare arbetslivserfarenhet än män.",
        "Klimatförändringarna leder till ökade havsnivåer och extrema väderhändelser.",
        "Sverige är ett land i Nordeuropa med ungefär 10 miljoner invånare.",
        "Regeringen presenterade sin budget för nästa år.",
        "Bristen på utbildning orsakar hög arbetslöshet bland unga.",
        "Stockholm är Sveriges huvudstad och största stad.",
    )
]

# Load models at startup
# This runs at import time, so the module-level embedder and classifier are
# assigned before the UI below is built and before analyze_text can be called.
print("Loading models...")
load_models()
print("Models loaded!")

# Assemble the Gradio UI: intro text, an input column and a score column side
# by side, the textual verdict, clickable examples, and an "About" footer.
with gr.Blocks(title="Swedish Causality Detection", theme=gr.themes.Soft()) as demo:
    # Page header and short description.
    gr.Markdown("""
    # Swedish Causality Detection

    Detect causal relations in Swedish text using machine learning.

    **Author:** Birger Moëll, Uppsala NLP

    This tool classifies whether a Swedish sentence expresses a cause-effect relationship.
    The model is trained on the [Swedish Causality Binary Dataset](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary).
    """)

    with gr.Row():
        # Left (wider) column: sentence input and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                lines=3,
                label="Swedish Text",
                placeholder="Enter a Swedish sentence to analyze...",
            )
            analyze_btn = gr.Button("Analyze", variant="primary")

        # Right (narrower) column: per-class scores.
        with gr.Column(scale=1):
            label_output = gr.Label(label="Causality Score")

    # Markdown verdict rendered below the two columns.
    result_output = gr.Markdown(label="Analysis Result")

    gr.Examples(examples=EXAMPLES, inputs=text_input, label="Example Sentences (click to try)")

    # Clicking the button and submitting the textbox run the same analysis,
    # so both events are wired with identical arguments (click first).
    for _bind in (analyze_btn.click, text_input.submit):
        _bind(
            fn=analyze_text,
            inputs=text_input,
            outputs=[label_output, result_output],
        )

    # Footer with background, dataset, citation and links.
    gr.Markdown("""
    ---

    ## About

    This classifier uses sentence embeddings from `paraphrase-multilingual-MiniLM-L12-v2`
    and logistic regression trained on Swedish government reports annotated for causality.

    **Dataset:** [UppsalaNLP/swedish-causality-binary](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary)

    **Citation:**
    ```
    Dürlich et al. (2022). Cause and Effect in Governmental Reports:
    Two Data Sets for Causality Detection in Swedish.
    ```

    **Links:**
    - [Uppsala NLP](https://huggingface.co/UppsalaNLP)
    - [GitHub Repository](https://github.com/UppsalaNLP/Swedish-Causality-Datasets)
    """)

# Launch the app only when this file is executed as a script; importing the
# module still builds `demo` but does not call launch().
if __name__ == "__main__":
    demo.launch()