SCANSKY
/

BERTopic_Tourism_8L

Text Classification

Model card Files Files and versions

xet

Community

SCANSKY committed on Mar 1, 2025

Commit

9179af7

verified ·

1 Parent(s): 584f0f6

Update handler.py

Browse files

Files changed (1) hide show

handler.py +85 -138

handler.py CHANGED Viewed

@@ -1,149 +1,96 @@
-from transformers import pipeline
-from sklearn.preprocessing import LabelEncoder
-import joblib
-import torch
-import os
 from bertopic import BERTopic
-from sentence_transformers import SentenceTransformer
-# Debugging: Print current directory and contents
-print("Current working directory:", os.getcwd())
-print("Contents of the directory:", os.listdir())
-# Load the label encoder
-label_encoder = joblib.load('/repository/label_encoder.pkl')  # Use absolute path
-print("Label encoder loaded successfully.")
-# Load the sentiment analysis model and tokenizer from Hugging Face
-model_name = "SCANSKY/BERTopic_Tourism_8L"
-sentiment_analyzer = pipeline(
-    'sentiment-analysis',
-    model=model_name,
-    tokenizer=model_name,
-    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
-)
-# Load BERTopic model
-embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
-topic_model = BERTopic.load("/path/to/bertopic/model", embedding_model=embedding_model)
-def get_average_sentiment(positive_count, negative_count, neutral_count):
-    total = positive_count + negative_count + neutral_count
-    if total == 0:
-        return "neutral"
-    positive_pct = (positive_count / total) * 100
-    negative_pct = (negative_count / total) * 100
-    neutral_pct = (neutral_count / total) * 100
-    max_sentiment = max(positive_pct, negative_pct, neutral_pct)
-    if max_sentiment == positive_pct:
-        return "positive"
-    elif max_sentiment == negative_pct:
-        return "negative"
-    else:
-        return "neutral"
-class EndpointHandler:
-    def __init__(self, model_dir=None):
-        # Model and tokenizer are loaded globally, so no need to reinitialize here
-        # The `model_dir` argument is required by Hugging Face's inference toolkit
-        pass
-    def preprocess(self, data):
-        # Extract the input text from the request
-        text = data.get("inputs", "")
-        return text
-    def inference(self, text):
-        if not text.strip():
-            return {"error": "Please enter some text for sentiment analysis."}
-        # Split text into lines
-        lines = [line.strip() for line in text.split('\n') if line.strip()]
-        if not lines:
-            return {"error": "Please enter valid text for sentiment analysis."}
-        # Analyze each line for sentiment
-        total_confidence = 0
-        positive_count = 0
-        negative_count = 0
-        neutral_count = 0
-        line_results = []  # Store results for each line
-        for line in lines:
-            result = sentiment_analyzer(line)
-            predicted_label_encoded = int(result[0]['label'].split('_')[-1])
-            predicted_label = label_encoder.inverse_transform([predicted_label_encoded])[0]
-            confidence = result[0]['score'] * 100
-            # Store line and its sentiment result
-            line_results.append({
-                'text': line,
-                'sentiment': predicted_label,
-                'confidence': confidence
-            })
-            if predicted_label == 'positive':
-                positive_count += 1
-            elif predicted_label == 'negative':
-                negative_count += 1
-            else:
-                neutral_count += 1
-            total_confidence += confidence
-        # Calculate averages
-        avg_confidence = total_confidence / len(lines)
-        positive_pct = (positive_count / len(lines)) * 100
-        negative_pct = (negative_count / len(lines)) * 100
-        neutral_pct = (neutral_count / len(lines)) * 100
-        # Get average sentiment
-        avg_sentiment = get_average_sentiment(positive_count, negative_count, neutral_count)
-        # Perform topic inference using BERTopic's approximate_distribution
-        merged_docs = "\n".join(lines)
-        appxtopics, appxprobabilities = topic_model.approximate_distribution(
-            merged_docs, window=16, batch_size=16  # Adjust window size for better alignment
-        )
-        doc_topic_distribution = appxtopics[0]
-        # Rank topics by their contribution in descending order
-        ranked_topics = sorted(enumerate(doc_topic_distribution), key=lambda x: x[1], reverse=True)[:10]
-        # Prepare the output
-        output = {
-            "total_lines_analyzed": len(lines),
-            "average_confidence": avg_confidence,
-            "average_sentiment": avg_sentiment,
-            "sentiment_distribution": {
-                "positive": positive_pct,
-                "negative": negative_pct,
-                "neutral": neutral_pct
-            },
-            "line_results": line_results,
-            "topic_distribution": {
-                "ranked_topics": [
-                    {"topic_idx": topic_idx, "contribution": contribution}
-                    for topic_idx, contribution in ranked_topics
-                ]
-            }
-        }
-        return output
-    def postprocess(self, output):
-        if "error" in output:
-            return [{"error": output["error"]}]
-        # Return only the line-level results as a list
-        return output["line_results"]
-    def __call__(self, data):
-        # Main method to handle the request
-        text = self.preprocess(data)
-        output = self.inference(text)
-        return self.postprocess(output)

+import json
 from bertopic import BERTopic
+class EndpointHandler:
+    def __init__(self, model_path="SCANSKY/BERTopic_Tourism_8L"):
+        """
+        Initialize the handler. Load the BERTopic model from Hugging Face.
+        """
+        self.topic_model = BERTopic.load(model_path)
+    def preprocess(self, data):
+        """
+        Preprocess the incoming request data.
+        - Extract text input from the request.
+        """
+        try:
+            # Directly work with the incoming data dictionary
+            text_input = data.get("inputs", "")
+            return text_input
+        except Exception as e:
+            raise ValueError(f"Error during preprocessing: {str(e)}")
+    def inference(self, text_input):
+        """
+        Perform inference using the BERTopic model.
+        - Combine all sentences into a single document and find shared topics.
+        """
+        try:
+            # Split text into sentences (assuming one sentence per line)
+            sentences = text_input.strip().split('\n')
+            # Combine all sentences into a single document
+            combined_document = " ".join(sentences)
+            # Perform topic inference on the combined document
+            topics, probabilities = self.topic_model.transform([combined_document])
+            # Perform approximate distribution to get detailed topic contributions
+            appxtopics, appxprobabilities = self.topic_model.approximate_distribution(
+                combined_document, window=16, batch_size=16
+            )
+            doc_topic_distribution = appxtopics[0]
+            # Rank topics by their contribution in descending order
+            ranked_topics = sorted(enumerate(doc_topic_distribution), key=lambda x: x[1], reverse=True)[:10]
+            # Prepare the results
+            results = []
+            for topic, prob in zip(topics, probabilities):
+                topic_info = self.topic_model.get_topic(topic)
+                topic_words = [word for word, _ in topic_info] if topic_info else []
+                # Get custom label for the topic
+                if hasattr(self.topic_model, "custom_labels_") and self.topic_model.custom_labels_ is not None:
+                    custom_label = self.topic_model.custom_labels_[topic + 1]
+                else:
+                    custom_label = f"Topic {topic}"  # Fallback label
+                # Get the contribution from approximate distribution
+                contribution = next((contribution for idx, contribution in ranked_topics if idx == topic), 0.0)
+                results.append({
+                    "topic": int(topic),
+                    "probability": float(prob),
+                    "top_words": topic_words[:5],  # Top 5 words
+                    "customLabel": custom_label,  # Add custom label
+                    "contribution": float(contribution)  # Add contribution from approximate distribution
+                })
+            return results
+        except Exception as e:
+            raise ValueError(f"Error during inference: {str(e)}")
+    def postprocess(self, results):
+        """
+        Postprocess the inference results into a JSON-serializable list.
+        """
+        return results  # Directly returning the list of results
+    def __call__(self, data):
+        """
+        Handle the incoming request.
+        """
+        try:
+            # Preprocess the data
+            text_input = self.preprocess(data)
+            # Perform inference
+            results = self.inference(text_input)
+            # Postprocess the results
+            response = self.postprocess(results)
+            return response
+        except Exception as e:
+            return [{"error": str(e)}]  # Returning error as a list with a dictionary