#!/usr/bin/env python3
"""Swedish Causality Detection - HuggingFace Space"""

import os
import pickle

import gradio as gr
import numpy as np
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression

# Names of the external resources the app depends on.
EMBEDDING_MODEL_NAME = 'paraphrase-multilingual-MiniLM-L12-v2'
DATASET_NAME = "UppsalaNLP/swedish-causality-binary"
CLASSIFIER_PATH = "causality_classifier.pkl"

# Global model variables, populated by load_models().
classifier = None
embedder = None


def _train_classifier(model_path):
    """Train a logistic-regression classifier on the dataset and cache it.

    Encodes the training sentences with the global ``embedder``, fits the
    classifier on those embeddings, pickles it to *model_path*, and
    returns the fitted classifier.
    """
    print("Training classifier...")
    dataset = load_dataset(DATASET_NAME)

    # Materialise the columns as plain lists so the encoder and
    # scikit-learn receive standard sequences.
    train_texts = list(dataset['train']['target_sentence'])
    train_labels = list(dataset['train']['label'])

    # Generate embeddings
    train_embeddings = embedder.encode(train_texts, show_progress_bar=True)

    # Train logistic regression
    model = LogisticRegression(max_iter=1000, random_state=42)
    model.fit(train_embeddings, train_labels)

    # Save model so later startups can skip training
    with open(model_path, 'wb') as f:
        pickle.dump(model, f)
    print("Classifier trained and saved!")
    return model


def load_models():
    """Load or train the causality classifier.

    Sets the module-level ``embedder`` and ``classifier``. The classifier
    is read from ``CLASSIFIER_PATH`` when that file exists; otherwise it
    is trained from the dataset and written to that path.
    """
    global classifier, embedder

    # Load embedding model
    embedder = SentenceTransformer(EMBEDDING_MODEL_NAME)

    model_path = CLASSIFIER_PATH
    if os.path.exists(model_path):
        # NOTE(security): unpickling can execute arbitrary code. This is
        # only acceptable because the file is the cache written by
        # _train_classifier(); never point this at an untrusted file.
        with open(model_path, 'rb') as f:
            classifier = pickle.load(f)
    else:
        classifier = _train_classifier(model_path)


def detect_causality(text: str) -> dict:
    """Detect causality in Swedish text.

    Returns a dict with the keys ``"Non-causal"`` and ``"Causal"`` mapped
    to the classifier's predicted probabilities. Empty, whitespace-only,
    or ``None`` input yields 0.0 for both keys without calling the model.
    """
    if not text or not text.strip():
        return {"Causal": 0.0, "Non-causal": 0.0}

    # Generate embedding (encode expects a batch, hence the one-item list)
    embedding = embedder.encode([text])

    # Get prediction probabilities for the single input row.
    # NOTE(review): index 0 is treated as non-causal and index 1 as
    # causal; this assumes the dataset labels are 0/1 -- confirm.
    probs = classifier.predict_proba(embedding)[0]

    return {
        "Non-causal": float(probs[0]),
        "Causal": float(probs[1]),
    }


def analyze_text(text: str) -> tuple:
    """Analyze text and return results.

    Returns a ``(scores, message)`` pair: the score dict from
    ``detect_causality`` and a Markdown summary of the verdict. Empty,
    whitespace-only, or ``None`` input returns an empty dict and a prompt
    asking for text.
    """
    if not text or not text.strip():
        return {}, "Please enter some text to analyze."

    # Get causality scores
    scores = detect_causality(text)

    # Determine result; a tie is reported as non-causal
    is_causal = scores["Causal"] > scores["Non-causal"]
    confidence = max(scores.values())

    if is_causal:
        result = f"**Causal relation detected** (confidence: {confidence:.1%})\n\n"
        result += "This sentence appears to express a cause-effect relationship."
    else:
        result = f"**No causal relation detected** (confidence: {confidence:.1%})\n\n"
        result += "This sentence does not appear to express a cause-effect relationship."

    return scores, result


# Example sentences
EXAMPLES = [
    ["Den lägre produktiviteten kan bero på att kvinnor har kortare arbetslivserfarenhet än män."],
    ["Klimatförändringarna leder till ökade havsnivåer och extrema väderhändelser."],
    ["Sverige är ett land i Nordeuropa med ungefär 10 miljoner invånare."],
    ["Regeringen presenterade sin budget för nästa år."],
    ["Bristen på utbildning orsakar hög arbetslöshet bland unga."],
    ["Stockholm är Sveriges huvudstad och största stad."],
]

# Load models at startup
print("Loading models...")
load_models()
print("Models loaded!")

# Create Gradio interface
with gr.Blocks(title="Swedish Causality Detection", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # Swedish Causality Detection

    Detect causal relations in Swedish text using machine learning.

    **Author:** Birger Moëll, Uppsala NLP

    This tool classifies whether a Swedish sentence expresses a cause-effect relationship.
    The model is trained on the [Swedish Causality Binary Dataset](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary).
    """)

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Swedish Text",
                placeholder="Enter a Swedish sentence to analyze...",
                lines=3
            )
            analyze_btn = gr.Button("Analyze", variant="primary")

        with gr.Column(scale=1):
            label_output = gr.Label(label="Causality Score")

    result_output = gr.Markdown(label="Analysis Result")

    gr.Examples(
        examples=EXAMPLES,
        inputs=text_input,
        label="Example Sentences (click to try)"
    )

    # The button click and pressing Enter in the textbox run the same
    # analysis, so both events are wired with identical arguments.
    for register_event in (analyze_btn.click, text_input.submit):
        register_event(
            fn=analyze_text,
            inputs=text_input,
            outputs=[label_output, result_output]
        )

    gr.Markdown("""
    ---
    ## About

    This classifier uses sentence embeddings from `paraphrase-multilingual-MiniLM-L12-v2`
    and logistic regression trained on Swedish government reports annotated for causality.

    **Dataset:** [UppsalaNLP/swedish-causality-binary](https://huggingface.co/datasets/UppsalaNLP/swedish-causality-binary)

    **Citation:**
    ```
    Dürlich et al. (2022). Cause and Effect in Governmental Reports: Two Data Sets for Causality Detection in Swedish.
    ```

    **Links:**
    - [Uppsala NLP](https://huggingface.co/UppsalaNLP)
    - [GitHub Repository](https://github.com/UppsalaNLP/Swedish-Causality-Datasets)
    """)

if __name__ == "__main__":
    demo.launch()