|
|
|
|
|
"""Swedish Causality Detection - HuggingFace Space""" |
|
|
|
|
|
import gradio as gr |
|
|
import numpy as np |
|
|
from sentence_transformers import SentenceTransformer |
|
|
from sklearn.linear_model import LogisticRegression |
|
|
from datasets import load_dataset |
|
|
import pickle |
|
|
import os |
|
|
|
|
|
|
|
|
# Shared model handles. Both start out as None and are assigned by
# load_models(): `embedder` gets the SentenceTransformer instance and
# `classifier` gets the LogisticRegression model.
embedder = classifier = None
|
|
|
|
|
def load_models():
    """Load or train the causality classifier.

    Sets the module-level ``embedder`` (a ``SentenceTransformer``) and
    ``classifier`` (a ``LogisticRegression``) globals.

    The classifier is read from ``causality_classifier.pkl`` when that file
    exists and can be unpickled. If the file is missing, truncated, or
    otherwise cannot be unpickled, the classifier is trained from the
    ``UppsalaNLP/swedish-causality-binary`` dataset instead.
    """
    global classifier, embedder

    embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    model_path = "causality_classifier.pkl"

    if os.path.exists(model_path):
        try:
            # NOTE(security): unpickling can execute arbitrary code. Only a
            # cache file written by this application (or otherwise trusted)
            # should ever be placed at `model_path`.
            with open(model_path, 'rb') as f:
                classifier = pickle.load(f)
        except (pickle.UnpicklingError, EOFError, AttributeError, ImportError) as exc:
            # A half-written or incompatible cache should not take the app
            # down; fall through and rebuild the classifier.
            print(f"Could not load {model_path} ({exc}); retraining.")
        else:
            return

    classifier = _train_classifier(embedder, model_path)


def _train_classifier(encoder, model_path):
    """Train the classifier on the dataset's train split and try to cache it.

    Args:
        encoder: Object whose ``encode`` method turns a list of sentences
            into feature vectors.
        model_path: File path the fitted classifier is pickled to.

    Returns:
        The fitted ``LogisticRegression`` instance.
    """
    print("Training classifier...")
    dataset = load_dataset("UppsalaNLP/swedish-causality-binary")

    # Materialise the columns as plain lists so the encoder and scikit-learn
    # receive ordinary sequences however `datasets` represents a column.
    train_texts = list(dataset['train']['target_sentence'])
    train_labels = list(dataset['train']['label'])

    train_embeddings = encoder.encode(train_texts, show_progress_bar=True)

    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(train_embeddings, train_labels)

    # Caching is best-effort: a failed write must not discard a classifier
    # that was just trained successfully.
    try:
        with open(model_path, 'wb') as f:
            pickle.dump(model, f)
    except OSError as exc:
        print(f"Classifier trained, but could not be saved to {model_path}: {exc}")
    else:
        print("Classifier trained and saved!")

    return model
|
|
|
|
|
def detect_causality(text: str) -> dict:
    """Detect causality in Swedish text.

    Args:
        text: The sentence to classify. ``None``, an empty string, or a
            whitespace-only string is treated as "nothing to classify".

    Returns:
        A dict with the keys ``"Non-causal"`` and ``"Causal"``. For blank
        input both values are ``0.0``; otherwise they are the two values
        returned by ``classifier.predict_proba`` for the embedded text.
    """
    # The UI may hand over None instead of "" for an empty textbox; guard
    # before calling .strip() so that case does not raise AttributeError.
    if text is None or not text.strip():
        return {"Causal": 0.0, "Non-causal": 0.0}

    # encode() takes a batch, so wrap the single sentence in a list.
    embedding = embedder.encode([text])

    # First (and only) row of the batch result.
    probs = classifier.predict_proba(embedding)[0]

    # NOTE(review): assumes column 0 is the non-causal class and column 1
    # the causal class (i.e. the order of classifier.classes_) — confirm
    # against the dataset's label encoding.
    return {
        "Non-causal": float(probs[0]),
        "Causal": float(probs[1]),
    }
|
|
|
|
|
def analyze_text(text: str) -> tuple:
    """Analyze text and return results.

    Args:
        text: The sentence entered by the user. ``None``, empty, or
            whitespace-only input yields a prompt instead of a prediction.

    Returns:
        A ``(scores, message)`` tuple. ``scores`` is the dict produced by
        ``detect_causality`` (or ``{}`` for blank input) and ``message`` is
        a Markdown string describing the verdict.
    """
    # Guard against None as well as blank strings so an empty textbox value
    # never reaches .strip() as None.
    if text is None or not text.strip():
        return {}, "Please enter some text to analyze."

    scores = detect_causality(text)

    is_causal = scores["Causal"] > scores["Non-causal"]
    # The winning class's score doubles as the reported confidence.
    confidence = max(scores.values())

    if is_causal:
        result = f"**Causal relation detected** (confidence: {confidence:.1%})\n\n"
        result += "This sentence appears to express a cause-effect relationship."
    else:
        result = f"**No causal relation detected** (confidence: {confidence:.1%})\n\n"
        result += "This sentence does not appear to express a cause-effect relationship."

    return scores, result
|
|
|
|
|
|
|
|
# Swedish sample sentences offered as clickable examples in the UI.
_EXAMPLE_SENTENCES = (
    "Den lägre produktiviteten kan bero på att kvinnor har kortare arbetslivserfarenhet än män.",
    "Klimatförändringarna leder till ökade havsnivåer och extrema väderhändelser.",
    "Sverige är ett land i Nordeuropa med ungefär 10 miljoner invånare.",
    "Regeringen presenterade sin budget för nästa år.",
    "Bristen på utbildning orsakar hög arbetslöshet bland unga.",
    "Stockholm är Sveriges huvudstad och största stad.",
)

# Each example is a one-element row: one value for the single text input.
EXAMPLES = [[sentence] for sentence in _EXAMPLE_SENTENCES]
|
|
|
|
|
|
|
|
# Load (or train) the models once, at module level, before the UI is built:
# the request handlers read the `embedder` and `classifier` globals that
# load_models() assigns.
print("Loading models...")
load_models()
print("Models loaded!")
|
|
|
|
|
|
|
|
# Gradio UI: header text, input/score row, result text, examples, and an
# "About" footer. The Blocks app is bound to the module-level name `demo`.
with gr.Blocks(title="Swedish Causality Detection", theme=gr.themes.Soft()) as demo:
    # Page header and short description.
    gr.Markdown("""
    # Swedish Causality Detection

    Detect causal relations in Swedish text using machine learning.

    **Author:** Birger Moëll, Uppsala NLP

    This tool classifies whether a Swedish sentence expresses a cause-effect relationship.
    The model is trained on the [Swedish Causality Binary Dataset](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary).
    """)

    with gr.Row():
        # Wider column: the text input and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Swedish Text",
                placeholder="Enter a Swedish sentence to analyze...",
                lines=3,
            )
            analyze_btn = gr.Button("Analyze", variant="primary")

        # Narrower column: per-class scores.
        with gr.Column(scale=1):
            label_output = gr.Label(label="Causality Score")

    # NOTE(review): placed below the row, at Blocks level — confirm this
    # matches the intended layout.
    result_output = gr.Markdown(label="Analysis Result")

    gr.Examples(
        examples=EXAMPLES,
        inputs=text_input,
        label="Example Sentences (click to try)",
    )

    # Clicking the button and submitting the textbox run the same handler
    # with the same input and outputs, so register both in one place.
    for register_event in (analyze_btn.click, text_input.submit):
        register_event(
            fn=analyze_text,
            inputs=text_input,
            outputs=[label_output, result_output],
        )

    # Footer with background information, citation, and links.
    gr.Markdown("""
    ---

    ## About

    This classifier uses sentence embeddings from `paraphrase-multilingual-MiniLM-L12-v2`
    and logistic regression trained on Swedish government reports annotated for causality.

    **Dataset:** [UppsalaNLP/swedish-causality-binary](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary)

    **Citation:**
    ```
    Dürlich et al. (2022). Cause and Effect in Governmental Reports:
    Two Data Sets for Causality Detection in Swedish.
    ```

    **Links:**
    - [Uppsala NLP](https://huggingface.co/UppsalaNLP)
    - [GitHub Repository](https://github.com/UppsalaNLP/Swedish-Causality-Datasets)
    """)
|
|
|
|
|
# Launch the Gradio app only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    demo.launch()
|
|
|