File size: 5,226 Bytes
0bcc156
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#!/usr/bin/env python3
"""Swedish Causality Detection - HuggingFace Space"""

import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
import pickle
import os

# Shared model state. Both names start out as None and are rebound by
# load_models(); the prediction helpers read them as module globals.
embedder = None    # sentence-embedding model
classifier = None  # causality classifier

def _load_cached_classifier(model_path):
    """Return the classifier pickled at *model_path*, or None if unusable."""
    if not os.path.exists(model_path):
        return None

    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. Only ship a cache file that this application produced itself.
    try:
        with open(model_path, 'rb') as f:
            return pickle.load(f)
    except (pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError, OSError) as exc:
        # A truncated or incompatible cache must not prevent startup; the
        # caller falls back to retraining.
        print(f"Could not load cached classifier ({exc!r}); retraining...")
        return None


def _train_classifier(embedding_model):
    """Fit a logistic-regression classifier on the dataset's train split."""
    dataset = load_dataset("UppsalaNLP/swedish-causality-binary")
    train_split = dataset['train']

    # Materialise the columns as plain lists so encode() and fit() receive
    # ordinary sequences whatever type the column accessor returns.
    train_texts = list(train_split['target_sentence'])
    train_labels = list(train_split['label'])

    # Generate embeddings
    train_embeddings = embedding_model.encode(train_texts, show_progress_bar=True)

    # Train logistic regression
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(train_embeddings, train_labels)
    return model


def _save_classifier(model, model_path):
    """Pickle *model* to *model_path* atomically; return True on success."""
    tmp_path = f"{model_path}.tmp"
    try:
        with open(tmp_path, 'wb') as f:
            pickle.dump(model, f)
        # Rename only after a complete write, so an interrupted dump can
        # never leave a half-written cache at model_path.
        os.replace(tmp_path, model_path)
    except OSError as exc:
        print(f"Could not save classifier to {model_path} ({exc!r})")
        try:
            os.remove(tmp_path)
        except OSError:
            pass  # nothing to clean up, or cleanup is not possible
        return False
    return True


def load_models() -> None:
    """Load the sentence embedder and load or train the causality classifier.

    Rebinds the module-level ``embedder`` and ``classifier`` globals.

    The classifier is read from ``causality_classifier.pkl`` (relative to the
    current working directory) when that file exists and can be unpickled.
    Otherwise it is trained on the ``train`` split of
    ``UppsalaNLP/swedish-causality-binary`` and cached to that path. A failed
    cache write is reported but is not fatal, because the freshly trained
    model is already usable in memory.
    """
    global classifier, embedder

    # Load embedding model
    embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    model_path = "causality_classifier.pkl"

    cached = _load_cached_classifier(model_path)
    if cached is not None:
        classifier = cached
        return

    # Train classifier on the dataset
    print("Training classifier...")
    classifier = _train_classifier(embedder)

    # Save model
    if _save_classifier(classifier, model_path):
        print("Classifier trained and saved!")
    else:
        print("Classifier trained (not cached).")

def detect_causality(text: str) -> dict:
    """Score a piece of Swedish text as causal versus non-causal.

    Args:
        text: The sentence to classify.

    Returns:
        A dict with the keys ``"Non-causal"`` and ``"Causal"`` mapped to
        floats. Text that is empty or whitespace-only yields 0.0 for both
        keys without consulting the models.
    """
    has_content = bool(text.strip())

    if has_content:
        # The embedder takes a batch, so wrap the single sentence in a list
        # and read back the first (only) row of probabilities.
        sentence_vector = embedder.encode([text])
        class_probs = classifier.predict_proba(sentence_vector)[0]

        # NOTE(review): index 0 is reported as non-causal and index 1 as
        # causal; this presumes the classifier's class order matches.
        non_causal_prob = float(class_probs[0])
        causal_prob = float(class_probs[1])
        return {"Non-causal": non_causal_prob, "Causal": causal_prob}

    return {"Causal": 0.0, "Non-causal": 0.0}

def analyze_text(text: str) -> tuple:
    """Classify *text* and build a Markdown summary of the verdict.

    Args:
        text: The sentence to analyze.

    Returns:
        A ``(scores, message)`` pair. ``scores`` is the dict produced by
        ``detect_causality``; ``message`` is a Markdown string giving the
        verdict and the higher of the two scores as the confidence. For
        empty or whitespace-only text the pair is an empty dict plus a
        prompt asking for input.
    """
    if text.strip():
        label_scores = detect_causality(text)
        causal_score = label_scores["Causal"]
        non_causal_score = label_scores["Non-causal"]
        top_score = max(label_scores.values())

        # Causal wins only on a strictly higher score; a tie reads as
        # "no causal relation".
        if causal_score > non_causal_score:
            parts = (
                f"**Causal relation detected** (confidence: {top_score:.1%})\n\n",
                "This sentence appears to express a cause-effect relationship.",
            )
        else:
            parts = (
                f"**No causal relation detected** (confidence: {top_score:.1%})\n\n",
                "This sentence does not appear to express a cause-effect relationship.",
            )

        return label_scores, "".join(parts)

    return {}, "Please enter some text to analyze."

# Sample Swedish sentences offered as clickable inputs in the UI. Each entry
# is wrapped in its own single-element list (one row per example).
EXAMPLES = [
    [sentence]
    for sentence in (
        "Den lägre produktiviteten kan bero på att kvinnor har kortare arbetslivserfarenhet än män.",
        "Klimatförändringarna leder till ökade havsnivåer och extrema väderhändelser.",
        "Sverige är ett land i Nordeuropa med ungefär 10 miljoner invånare.",
        "Regeringen presenterade sin budget för nästa år.",
        "Bristen på utbildning orsakar hög arbetslöshet bland unga.",
        "Stockholm är Sveriges huvudstad och största stad.",
    )
]

# Load models at startup.
# This runs at import time, before the UI below is built: load_models()
# rebinds the module-level `embedder` and `classifier` that the prediction
# functions read. The prints bracket the call as progress markers.
print("Loading models...")
load_models()
print("Models loaded!")

# Create Gradio interface.
# Components are declared inside nested context managers; the resulting
# Blocks app is bound to `demo`, which the __main__ guard launches.
with gr.Blocks(title="Swedish Causality Detection", theme=gr.themes.Soft()) as demo:
    # Header: title, author credit and a link to the training dataset.
    gr.Markdown("""
    # Swedish Causality Detection

    Detect causal relations in Swedish text using machine learning.

    **Author:** Birger Moëll, Uppsala NLP

    This tool classifies whether a Swedish sentence expresses a cause-effect relationship.
    The model is trained on the [Swedish Causality Binary Dataset](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary).
    """)

    # Main row: text entry and button in the wider column (scale=2),
    # score label in the narrower one (scale=1).
    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Swedish Text",
                placeholder="Enter a Swedish sentence to analyze...",
                lines=3
            )
            analyze_btn = gr.Button("Analyze", variant="primary")

        with gr.Column(scale=1):
            # Receives the scores dict, the first value analyze_text returns.
            label_output = gr.Label(label="Causality Score")

    # Receives the Markdown message, the second value analyze_text returns.
    result_output = gr.Markdown(label="Analysis Result")

    # Example sentences are wired to the text box as their input.
    gr.Examples(
        examples=EXAMPLES,
        inputs=text_input,
        label="Example Sentences (click to try)"
    )

    # The button click and the text box submit event both call the same
    # handler with the same input and outputs.
    analyze_btn.click(
        fn=analyze_text,
        inputs=text_input,
        outputs=[label_output, result_output]
    )

    text_input.submit(
        fn=analyze_text,
        inputs=text_input,
        outputs=[label_output, result_output]
    )

    # Footer: method description, dataset link, citation and project links.
    gr.Markdown("""
    ---

    ## About

    This classifier uses sentence embeddings from `paraphrase-multilingual-MiniLM-L12-v2`
    and logistic regression trained on Swedish government reports annotated for causality.

    **Dataset:** [UppsalaNLP/swedish-causality-binary](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary)

    **Citation:**
    ```
    Dürlich et al. (2022). Cause and Effect in Governmental Reports:
    Two Data Sets for Causality Detection in Swedish.
    ```

    **Links:**
    - [Uppsala NLP](https://huggingface.co/UppsalaNLP)
    - [GitHub Repository](https://github.com/UppsalaNLP/Swedish-Causality-Datasets)
    """)

# Launch the app only when this file is run as a script; importing the
# module builds `demo` but does not launch it.
if __name__ == "__main__":
    demo.launch()