SCANSKY
/

BERTopic_Tourism_8L

Text Classification

Model card Files Files and versions

xet

Community

SCANSKY committed on Mar 1, 2025

Commit

d1f3827

verified ·

1 Parent(s): df5b3c5

Update handler.py

Browse files

Files changed (1) hide show

handler.py +149 -0

handler.py CHANGED Viewed

	@@ -0,0 +1,149 @@

+from transformers import pipeline
+from sklearn.preprocessing import LabelEncoder
+import joblib
+import torch
+import os
+from bertopic import BERTopic
+from sentence_transformers import SentenceTransformer
# ---------------------------------------------------------------------------
# Module-level setup. Everything below runs once, when the module is imported,
# and creates the globals used by EndpointHandler: `label_encoder`,
# `sentiment_analyzer` and `topic_model`.
# ---------------------------------------------------------------------------

# Debugging aid: show the working directory and the files visible from it.
print("Current working directory:", os.getcwd())
print("Contents of the directory:", os.listdir())

# Locations of the serialized artifacts. Each can be overridden with an
# environment variable of the same name; the defaults are the values that
# were previously hard-coded, so behaviour is unchanged when nothing is set.
LABEL_ENCODER_PATH = os.environ.get(
    "LABEL_ENCODER_PATH", "/repository/label_encoder.pkl"
)
# NOTE(review): this default is a template placeholder, not a real location.
# Set BERTOPIC_MODEL_PATH (or replace the default) before deploying, otherwise
# BERTopic.load() below is expected to fail.
BERTOPIC_MODEL_PATH = os.environ.get(
    "BERTOPIC_MODEL_PATH", "/path/to/bertopic/model"
)

# Load the label encoder that maps the classifier's numeric class ids back to
# label names.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only point LABEL_ENCODER_PATH at a trusted artifact.
label_encoder = joblib.load(LABEL_ENCODER_PATH)
print("Label encoder loaded successfully.")

# Load the sentiment analysis model and tokenizer from Hugging Face.
model_name = "SCANSKY/distilbertTourism-multilingual-sentiment"
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model=model_name,
    tokenizer=model_name,
    device=0 if torch.cuda.is_available() else -1,  # GPU 0 if available, else CPU
)

# Load the BERTopic model together with the sentence-embedding model it uses.
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
topic_model = BERTopic.load(BERTOPIC_MODEL_PATH, embedding_model=embedding_model)
def get_average_sentiment(positive_count, negative_count, neutral_count):
    """Return the label of the sentiment class with the largest share.

    Args:
        positive_count: Number of items classified as positive.
        negative_count: Number of items classified as negative.
        neutral_count: Number of items classified as neutral.

    Returns:
        "positive", "negative" or "neutral". When the three counts sum to
        zero the result is "neutral". Ties are resolved in the order
        positive, then negative, then neutral.
    """
    total = positive_count + negative_count + neutral_count
    if total == 0:
        return "neutral"

    # Insertion order doubles as tie-break priority: max() keeps the first
    # key it sees among equal maxima.
    shares = {
        "positive": (positive_count / total) * 100,
        "negative": (negative_count / total) * 100,
        "neutral": (neutral_count / total) * 100,
    }
    return max(shares, key=shares.get)
class EndpointHandler:
    """Request handler: per-line sentiment analysis plus topic ranking.

    The models themselves (`sentiment_analyzer`, `label_encoder`,
    `topic_model`) are module-level globals; this class only turns a request
    payload into text, runs the analysis and shapes the response.
    """

    def __init__(self, model_dir=None):
        # Model and tokenizer are loaded globally, so there is nothing to
        # initialise here. The `model_dir` argument is required by Hugging
        # Face's inference toolkit and is otherwise unused.
        pass

    def preprocess(self, data):
        """Extract the request text from ``data["inputs"]`` as one string.

        Args:
            data: Request payload; must provide ``.get`` (a dict).

        Returns:
            The text to analyse. A missing key or ``None`` gives ``""``; a
            list/tuple is joined with newlines so that each item is analysed
            as its own line; any other non-string value is passed through
            ``str()``.
        """
        payload = data.get("inputs", "")
        if payload is None:
            return ""
        if isinstance(payload, (list, tuple)):
            return "\n".join(str(item) for item in payload if item is not None)
        if not isinstance(payload, str):
            return str(payload)
        return payload

    def inference(self, text):
        """Classify every non-blank line of ``text`` and rank its topics.

        Args:
            text: Input text; each non-blank line is analysed separately.

        Returns:
            ``{"error": message}`` when ``text`` has no non-blank line.
            Otherwise a dict with the keys ``total_lines_analyzed``,
            ``average_confidence``, ``average_sentiment``,
            ``sentiment_distribution``, ``line_results`` and
            ``topic_distribution``. Confidences and distribution shares are
            percentages (0-100).
        """
        # Keep only lines that still have content after trimming.
        lines = [stripped for stripped in (raw.strip() for raw in text.split('\n')) if stripped]
        if not lines:
            return {"error": "Please enter some text for sentiment analysis."}

        counts = {"positive": 0, "negative": 0, "neutral": 0}
        total_confidence = 0
        line_results = []  # One entry per analysed line, in input order.
        for line in lines:
            # truncation=True keeps a line longer than the model's maximum
            # sequence length from raising inside the pipeline.
            prediction = sentiment_analyzer(line, truncation=True)[0]
            # Takes the integer after the last "_" of the label
            # (presumably labels look like "LABEL_<n>" -- TODO confirm).
            class_id = int(prediction['label'].split('_')[-1])
            predicted_label = label_encoder.inverse_transform([class_id])[0]
            confidence = prediction['score'] * 100

            line_results.append({
                'text': line,
                'sentiment': predicted_label,
                'confidence': confidence,
            })

            # Anything that is neither positive nor negative counts as neutral.
            if predicted_label == 'positive':
                counts["positive"] += 1
            elif predicted_label == 'negative':
                counts["negative"] += 1
            else:
                counts["neutral"] += 1
            total_confidence += confidence

        line_count = len(lines)
        avg_sentiment = get_average_sentiment(
            counts["positive"], counts["negative"], counts["neutral"]
        )

        # Topic inference over all lines merged into a single document; the
        # first returned value holds one distribution per input document.
        merged_docs = "\n".join(lines)
        topic_distributions, _ = topic_model.approximate_distribution(
            merged_docs, window=16, batch_size=16
        )
        doc_topic_distribution = topic_distributions[0]

        # Ten largest contributions, highest first.
        ranked_topics = sorted(
            enumerate(doc_topic_distribution),
            key=lambda pair: pair[1],
            reverse=True,
        )[:10]

        return {
            "total_lines_analyzed": line_count,
            "average_confidence": total_confidence / line_count,
            "average_sentiment": avg_sentiment,
            "sentiment_distribution": {
                "positive": (counts["positive"] / line_count) * 100,
                "negative": (counts["negative"] / line_count) * 100,
                "neutral": (counts["neutral"] / line_count) * 100,
            },
            "line_results": line_results,
            "topic_distribution": {
                "ranked_topics": [
                    # Cast to built-in numbers so the response holds plain types.
                    {"topic_idx": int(topic_idx), "contribution": float(contribution)}
                    for topic_idx, contribution in ranked_topics
                ]
            },
        }

    def postprocess(self, output):
        """Reduce the ``inference`` result to the response body.

        Returns:
            ``[{"error": message}]`` for an error result; otherwise only the
            ``line_results`` list. The aggregate statistics and the topic
            ranking computed by ``inference`` are not part of the response.
        """
        if "error" in output:
            return [{"error": output["error"]}]
        return output["line_results"]

    def __call__(self, data):
        """Handle one request: preprocess, run inference, postprocess."""
        text = self.preprocess(data)
        output = self.inference(text)
        return self.postprocess(output)