File size: 4,331 Bytes
250d4b8
10789df
250d4b8
 
24bacd6
250d4b8
e55b6a5
 
 
ff77f42
e55b6a5
10789df
e55b6a5
250d4b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e55b6a5
10789df
 
 
250d4b8
 
 
 
10789df
250d4b8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10789df
250d4b8
10789df
 
 
250d4b8
e55b6a5
250d4b8
 
 
10789df
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
from transformers import pipeline
from sklearn.preprocessing import LabelEncoder
import joblib
import torch
import os

# --- Module-level setup: runs once at import time (e.g. when the HF
# --- inference toolkit loads this handler file).

# Debugging: Print current directory and contents
# (helps diagnose path issues when the container layout differs from local dev)
print("Current working directory:", os.getcwd())
print("Contents of the directory:", os.listdir())

# Load the label encoder
# NOTE(review): presumably maps the model's LABEL_<n> indices back to the
# string classes 'positive'/'negative'/'neutral' — confirm against training code.
label_encoder = joblib.load('/repository/label_encoder.pkl')  # Use absolute path
print("Label encoder loaded successfully.")

# Load the model and tokenizer from Hugging Face
# (downloads from the Hub on first run; cached afterwards)
model_name = "SCANSKY/distilbertTourism-multilingual-sentiment"
sentiment_analyzer = pipeline(
    'sentiment-analysis',
    model=model_name,
    tokenizer=model_name,
    device=0 if torch.cuda.is_available() else -1  # Use GPU if available
)

def get_average_sentiment(positive_count, negative_count, neutral_count):
    """Return the dominant sentiment label among the three counts.

    Ties resolve in favour of "positive", then "negative", then "neutral"
    (the fixed comparison order below); an all-zero total yields "neutral".
    """
    total = positive_count + negative_count + neutral_count
    if not total:
        return "neutral"

    # Percentage share of each class; only their relative order matters here.
    shares = {
        "positive": positive_count / total * 100,
        "negative": negative_count / total * 100,
        "neutral": neutral_count / total * 100,
    }
    top = max(shares.values())
    for label in ("positive", "negative", "neutral"):
        if shares[label] == top:
            return label

class EndpointHandler:
    """Hugging Face inference-endpoint handler for line-by-line sentiment.

    Relies on the module-level ``sentiment_analyzer`` pipeline and
    ``label_encoder`` loaded at import time.
    """

    def __init__(self, model_dir=None):
        # The model/tokenizer live at module scope; nothing to set up here.
        # ``model_dir`` exists only because the HF inference toolkit passes it.
        pass

    def preprocess(self, data):
        """Extract the raw input text from the request payload."""
        return data.get("inputs", "")

    def inference(self, text):
        """Run sentiment analysis on each non-blank line of ``text``.

        Returns an ``{"error": ...}`` dict for empty input, otherwise a
        summary dict with per-line results and aggregate statistics.
        """
        if not text.strip():
            return {"error": "Please enter some text for sentiment analysis."}

        # Keep only non-blank lines, whitespace-trimmed.
        lines = [stripped for stripped in (raw.strip() for raw in text.split('\n')) if stripped]
        if not lines:
            return {"error": "Please enter valid text for sentiment analysis."}

        counts = {'positive': 0, 'negative': 0, 'neutral': 0}
        confidence_sum = 0
        line_results = []

        for line in lines:
            prediction = sentiment_analyzer(line)[0]
            # Pipeline labels look like "LABEL_<n>"; decode <n> back to the
            # original class string via the fitted label encoder.
            encoded_index = int(prediction['label'].split('_')[-1])
            sentiment = label_encoder.inverse_transform([encoded_index])[0]
            confidence = prediction['score'] * 100

            line_results.append({
                'text': line,
                'sentiment': sentiment,
                'confidence': confidence
            })

            # Anything that isn't positive/negative is tallied as neutral.
            bucket = sentiment if sentiment in ('positive', 'negative') else 'neutral'
            counts[bucket] += 1
            confidence_sum += confidence

        n = len(lines)
        return {
            "total_lines_analyzed": n,
            "average_confidence": confidence_sum / n,
            "average_sentiment": get_average_sentiment(
                counts['positive'], counts['negative'], counts['neutral']
            ),
            "sentiment_distribution": {
                "positive": (counts['positive'] / n) * 100,
                "negative": (counts['negative'] / n) * 100,
                "neutral": (counts['neutral'] / n) * 100
            },
            "line_results": line_results
        }

    def postprocess(self, output):
        """Reduce the inference summary to the response body (a list)."""
        if "error" in output:
            return [{"error": output["error"]}]
        # Only the per-line results are returned to the caller.
        return output["line_results"]

    def __call__(self, data):
        """Entry point: preprocess -> inference -> postprocess."""
        return self.postprocess(self.inference(self.preprocess(data)))