# Hugging Face Space: Naive Bayes sentiment analysis demo (Gradio + scikit-learn).
| import gradio as gr | |
| import pandas as pd | |
| import os | |
| import pickle | |
| import re | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.naive_bayes import MultinomialNB | |
| from sklearn.pipeline import make_pipeline | |
| from sklearn.model_selection import train_test_split | |
# ======================
# Configuration
# ======================

# Where the pickled scikit-learn pipeline is cached between runs.
MODEL_PATH = "sklearn_nb_model.pkl"

# CSV dataset used to train the model when no cached pickle exists.
DATA_PATH = "test.csv"

# Cap on training rows, to keep memory usage modest.
SAMPLE_SIZE = 50000
# ======================
# Helper function for text cleaning (basic)
# ======================
def clean_text(text):
    """Lowercase *text* and keep only ASCII letters separated by single spaces.

    Non-string inputs are stringified first. Every character that is not a
    lowercase letter or whitespace is dropped, runs of whitespace collapse
    to one space, and leading/trailing whitespace is stripped.
    """
    raw = text if isinstance(text, str) else str(text)
    lowered = raw.lower()
    letters_only = re.sub(r'[^a-z\s]', '', lowered)
    return re.sub(r'\s+', ' ', letters_only).strip()
# ======================
# 1. Train or Load Model
# ======================
if os.path.exists(MODEL_PATH):
    # NOTE(security): pickle.load executes arbitrary code embedded in the
    # file — only load model pickles this app produced itself.
    with open(MODEL_PATH, "rb") as f:
        model = pickle.load(f)
    print("Loaded existing scikit-learn model.")
else:
    # Fix: raise an explicit exception instead of `assert`, which is
    # silently stripped when Python runs with -O.
    if not os.path.exists(DATA_PATH):
        raise FileNotFoundError(f"{DATA_PATH} not found! Upload a CSV dataset.")

    # Load CSV instead of Parquet
    df = pd.read_csv(DATA_PATH, encoding='ISO-8859-1')

    # Sample to reduce memory usage
    if len(df) > SAMPLE_SIZE:
        df = df.sample(n=SAMPLE_SIZE, random_state=42)

    # Normalize column names so the required-column check below is
    # case- and whitespace-insensitive.
    df.columns = [c.lower().strip() for c in df.columns]

    # Check required columns
    if 'text' not in df.columns or 'sentiment' not in df.columns:
        raise ValueError("CSV file must contain 'text' and 'sentiment' columns after column normalization")

    # Clean only the 'text' column; missing values become empty strings.
    df['text'] = df['text'].fillna("").astype(str).apply(clean_text)
    # Labels stay plain strings — MultinomialNB accepts string class labels.
    df['sentiment'] = df['sentiment'].fillna("").astype(str)

    X = df['text']
    y = df['sentiment']

    # Train a simple pipeline: CountVectorizer + MultinomialNB.
    # min_df=2 drops tokens seen in fewer than 2 documents;
    # alpha=1.0 is standard Laplace smoothing.
    model = make_pipeline(CountVectorizer(min_df=2), MultinomialNB(alpha=1.0))
    model.fit(X, y)

    # Save the trained model so subsequent runs skip training.
    with open(MODEL_PATH, "wb") as f:
        pickle.dump(model, f)
    print(f"Model trained on {len(df)} samples and saved!")
# ======================
# 2. Sentiment Analysis Function
# ======================
def sentiment_analysis(text: str) -> dict:
    """Classify *text* with the module-level Naive Bayes `model`.

    Returns a dict containing:
      - "prediction": the predicted class label, or "neutral" for
        unusable input, or "error" if the model call fails;
      - "probabilities": mapping of class label -> probability rounded
        to 4 decimals (a fixed {"pos", "neg"} placeholder on the
        fallback paths);
      - one of "cleaned_input" (success), "message" (fallbacks), or
        "error_details" + "message" (failure).
    """
    # Guard: empty or too-short input cannot be classified meaningfully.
    if not text or len(text.strip()) < 2:
        return {
            "prediction": "neutral",
            "probabilities": {"pos": 0.5, "neg": 0.5},
            "message": "Input too short or empty for meaningful analysis. Defaulting to neutral."
        }

    cleaned_text = clean_text(text)
    # Guard: cleaning can erase everything (e.g. digits/punctuation only).
    # This check cannot raise, so it lives outside the try block.
    if not cleaned_text:
        return {
            "prediction": "neutral",
            "probabilities": {"pos": 0.5, "neg": 0.5},
            "message": "Input became empty after cleaning. Defaulting to neutral."
        }

    try:
        pred = model.predict([cleaned_text])[0]
        probs = model.predict_proba([cleaned_text])[0]
        # Pair each class label with its (rounded) probability.
        prob_dict = {
            cls: round(float(p), 4) for cls, p in zip(model.classes_, probs)
        }
        # Fix: removed the dead `sum(prob_dict.values()) == 0` branch —
        # predict_proba rows always sum to 1 so it could never trigger,
        # and its fallback used "pos"/"neg" keys that need not match
        # model.classes_.
        return {
            "prediction": pred,
            "probabilities": prob_dict,
            "cleaned_input": cleaned_text
        }
    except Exception as e:
        # UI boundary: report the failure to the caller instead of crashing.
        return {
            "prediction": "error",
            "probabilities": {"pos": 0.5, "neg": 0.5},
            "error_details": str(e),
            "message": "An error occurred during analysis."
        }
# ======================
# 3. Gradio Interface
# ======================

# User-facing blurb shown above the input box (supports inline HTML).
APP_DESCRIPTION = (
    "Sentiment analysis using a memory-efficient Naive Bayes classifier. <br> "
    "Enter text to get its predicted sentiment and probability distribution."
)

demo = gr.Interface(
    title="Naive Bayes Sentiment Analysis (scikit-learn)",
    description=APP_DESCRIPTION,
    fn=sentiment_analysis,
    inputs=gr.Textbox(placeholder="Enter text to analyze...", lines=3),
    outputs=gr.JSON(label="Analysis Results"),
)
# ======================
# 4. Launch
# ======================
if __name__ == "__main__":
    # Bind to all interfaces so the app is reachable from outside the
    # container (required for hosted Spaces / Docker deployments).
    demo.launch(server_name="0.0.0.0")