birgermoell's picture
Upload app.py with huggingface_hub
0bcc156 verified
#!/usr/bin/env python3
"""Swedish Causality Detection - HuggingFace Space"""
import gradio as gr
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from datasets import load_dataset
import pickle
import os
# Module-level model handles. Both stay None until load_models() assigns them.
embedder = None
classifier = None
def load_models():
    """Populate the module-level ``embedder`` and ``classifier``.

    The sentence-embedding model is always instantiated. The classifier is
    read from ``causality_classifier.pkl`` when that file exists; otherwise
    a logistic regression is fitted on the training split of the
    ``UppsalaNLP/swedish-causality-binary`` dataset and pickled to that
    path for the next start.
    """
    global classifier, embedder

    embedder = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
    model_path = "causality_classifier.pkl"

    # Fast path: reuse a previously pickled classifier.
    # NOTE: pickle.load can execute arbitrary code while deserializing, so
    # only a .pkl file produced by this application should sit at this path.
    if os.path.exists(model_path):
        with open(model_path, 'rb') as cached:
            classifier = pickle.load(cached)
        return

    # Slow path: no cached classifier, so fit one from the dataset.
    print("Training classifier...")
    train_split = load_dataset("UppsalaNLP/swedish-causality-binary")['train']
    sentences = train_split['target_sentence']
    labels = train_split['label']

    # Sentence embeddings are the classifier's input features.
    features = embedder.encode(sentences, show_progress_bar=True)

    classifier = LogisticRegression(max_iter=1000, random_state=42)
    classifier.fit(features, labels)

    # Persist the fitted classifier so later starts take the fast path.
    with open(model_path, 'wb') as out:
        pickle.dump(classifier, out)
    print("Classifier trained and saved!")
def detect_causality(text: str) -> dict:
    """Score a Swedish sentence for the presence of a causal relation.

    Args:
        text: Sentence to classify. ``None``, an empty string and
            whitespace-only input are all treated as "nothing to classify".

    Returns:
        A dict with the keys ``"Non-causal"`` and ``"Causal"`` mapped to
        float probabilities read from ``classifier.predict_proba``. For
        blank input both values are ``0.0`` and no model is called.
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError before any useful result is produced.
    if not text or not text.strip():
        return {"Causal": 0.0, "Non-causal": 0.0}

    # encode() is given a one-element batch; predict_proba returns one row
    # per input, so row 0 holds the probabilities for this sentence.
    embedding = embedder.encode([text])
    probs = classifier.predict_proba(embedding)[0]

    # Column 0 is reported as non-causal and column 1 as causal.
    # NOTE(review): this assumes the classifier's class order is
    # [non-causal, causal] -- confirm against the dataset's label encoding.
    return {
        "Non-causal": float(probs[0]),
        "Causal": float(probs[1]),
    }
def analyze_text(text: str) -> tuple:
    """Classify *text* and build the values shown in the UI.

    Args:
        text: Sentence entered by the user. ``None``, an empty string and
            whitespace-only input are all treated as missing input.

    Returns:
        A ``(scores, message)`` tuple. ``scores`` is the dict returned by
        ``detect_causality``; ``message`` is a Markdown summary of the
        verdict. For missing input the tuple is an empty dict plus a prompt
        asking the user to enter text.
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError instead of showing the prompt.
    if not text or not text.strip():
        return {}, "Please enter some text to analyze."

    scores = detect_causality(text)

    # A tie is reported as "no causal relation" because the comparison is
    # strict; the confidence is the larger of the two scores.
    is_causal = scores["Causal"] > scores["Non-causal"]
    confidence = max(scores.values())

    if is_causal:
        result = f"**Causal relation detected** (confidence: {confidence:.1%})\n\n"
        result += "This sentence appears to express a cause-effect relationship."
    else:
        result = f"**No causal relation detected** (confidence: {confidence:.1%})\n\n"
        result += "This sentence does not appear to express a cause-effect relationship."
    return scores, result
# Sample sentences passed to gr.Examples in the UI. Each sentence is wrapped
# in its own one-element list when EXAMPLES is built.
EXAMPLES = [
    [sentence]
    for sentence in (
        "Den lägre produktiviteten kan bero på att kvinnor har kortare arbetslivserfarenhet än män.",
        "Klimatförändringarna leder till ökade havsnivåer och extrema väderhändelser.",
        "Sverige är ett land i Nordeuropa med ungefär 10 miljoner invånare.",
        "Regeringen presenterade sin budget för nästa år.",
        "Bristen på utbildning orsakar hög arbetslöshet bland unga.",
        "Stockholm är Sveriges huvudstad och största stad.",
    )
]
# Load models at startup
# This runs at import time, before the UI below is built: load_models()
# assigns the module-level `embedder` and `classifier` that
# detect_causality() reads on every request.
print("Loading models...")
load_models()
print("Models loaded!")
# Build the Gradio UI. Components and event bindings are created inside the
# Blocks context, which is bound to the module-level name `demo`.
# NOTE(review): the nesting below was reconstructed from a flattened source;
# confirm that both outputs belong in the right-hand column.
with gr.Blocks(title="Swedish Causality Detection", theme=gr.themes.Soft()) as demo:
    # Page header. The text lines are flush-left because they are part of
    # the string literal and must not gain leading whitespace.
    gr.Markdown("""
# Swedish Causality Detection
Detect causal relations in Swedish text using machine learning.
**Author:** Birger Moëll, Uppsala NLP
This tool classifies whether a Swedish sentence expresses a cause-effect relationship.
The model is trained on the [Swedish Causality Binary Dataset](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary).
""")

    with gr.Row():
        # Left column: input box and trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Swedish Text",
                placeholder="Enter a Swedish sentence to analyze...",
                lines=3,
            )
            analyze_btn = gr.Button("Analyze", variant="primary")

        # Right column: score label and Markdown verdict.
        with gr.Column(scale=1):
            label_output = gr.Label(label="Causality Score")
            result_output = gr.Markdown(label="Analysis Result")

    gr.Examples(
        examples=EXAMPLES,
        inputs=text_input,
        label="Example Sentences (click to try)",
    )

    # The button click and pressing Enter in the textbox run the same
    # handler with the same inputs and outputs; click is registered first.
    for register in (analyze_btn.click, text_input.submit):
        register(
            fn=analyze_text,
            inputs=text_input,
            outputs=[label_output, result_output],
        )

    # Footer with background, dataset, citation and links.
    gr.Markdown("""
---
## About
This classifier uses sentence embeddings from `paraphrase-multilingual-MiniLM-L12-v2`
and logistic regression trained on Swedish government reports annotated for causality.
**Dataset:** [UppsalaNLP/swedish-causality-binary](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary)
**Citation:**
```
Dürlich et al. (2022). Cause and Effect in Governmental Reports:
Two Data Sets for Causality Detection in Swedish.
```
**Links:**
- [Uppsala NLP](https://huggingface.co/UppsalaNLP)
- [GitHub Repository](https://github.com/UppsalaNLP/Swedish-Causality-Datasets)
""")
# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    demo.launch()