File size: 3,622 Bytes
bbd259b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import numpy as np
import joblib
import os
from sklearn.linear_model import LogisticRegression

# Destination for the trained classifier: ../models/final_classifier.pkl
# relative to this script's directory.
_THIS_DIR = os.path.dirname(__file__)
MODEL_PATH = os.path.join(_THIS_DIR, "..", "models", "final_classifier.pkl")

# Make sure the models/ directory exists before we try to save into it.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

print(">>> Generating Synthetic Logic-Aligned Training Data...")

# Features:
# 0: sim_pro_india
# 1: sim_anti_india
# 2: sim_pro_govt
# 3: sim_anti_govt
# 4: sim_neutral
# 5: neg
# 6: neu
# 7: pos
# 8: sarcasm
# 9: context_pol_crit (Anti-Govt)
# 10: context_nat_crit (Anti-India)
# 11: context_pol_praise (Pro-Govt)
# 12: context_nat_praise (Pro-India)

def generate_sample(label_idx):
    """Generate one synthetic 13-dim feature vector for class ``label_idx``.

    Feature layout (index: meaning):
        0-4   similarity scores: pro-India, anti-India, pro-govt, anti-govt, neutral
        5-7   sentiment scores: negative, neutral, positive
        8     sarcasm score
        9-12  LLM context scores: political criticism (anti-govt),
              national criticism (anti-India), political praise (pro-govt),
              national praise (pro-India)

    Args:
        label_idx: class id — 0: Pro-India, 1: Anti-India, 2: Pro-Govt,
            3: Anti-Govt, 4: Neutral.

    Returns:
        ``np.ndarray`` of shape (13,), all values in [0, 1].

    Raises:
        ValueError: if ``label_idx`` is not in 0..4 (previously an invalid
            label silently produced a pure-noise vector).
    """
    if label_idx not in (0, 1, 2, 3, 4):
        raise ValueError(f"label_idx must be in 0..4, got {label_idx!r}")

    # Base noise for all 13 features; class-specific entries are overwritten below.
    feats = np.random.uniform(0.0, 0.3, 13)

    # 0: Pro-India
    if label_idx == 0:
        feats[0] = np.random.uniform(0.6, 1.0)   # high pro-India similarity
        feats[7] = np.random.uniform(0.5, 1.0)   # high positive sentiment
        feats[5] = np.random.uniform(0.0, 0.2)   # low negative sentiment
        feats[8] = np.random.uniform(0.0, 0.2)   # sarcasm kept low (ignored)
        # LLM context
        feats[12] = np.random.uniform(0.7, 1.0)  # high national praise
        feats[9] = np.random.uniform(0.0, 0.2)   # low political criticism

    # 1: Anti-India
    elif label_idx == 1:
        feats[1] = np.random.uniform(0.6, 1.0)   # high anti-India similarity
        feats[5] = np.random.uniform(0.5, 1.0)   # high negative sentiment
        feats[7] = np.random.uniform(0.0, 0.2)   # low positive sentiment
        feats[8] = np.random.uniform(0.0, 0.2)   # sarcasm kept low (ignored)
        # LLM context
        feats[10] = np.random.uniform(0.7, 1.0)  # high national criticism
        feats[9] = np.random.uniform(0.0, 0.3)   # low/med political criticism

    # 2: Pro-Government
    elif label_idx == 2:
        feats[2] = np.random.uniform(0.6, 1.0)   # high pro-govt similarity
        feats[7] = np.random.uniform(0.5, 1.0)   # high positive sentiment
        feats[5] = np.random.uniform(0.0, 0.2)   # low negative sentiment
        feats[8] = np.random.uniform(0.0, 0.2)   # sarcasm kept low (ignored)
        # LLM context
        feats[11] = np.random.uniform(0.7, 1.0)  # high political praise
        feats[10] = np.random.uniform(0.0, 0.2)  # low national criticism

    # 3: Anti-Government
    elif label_idx == 3:
        feats[3] = np.random.uniform(0.6, 1.0)   # high anti-govt similarity
        feats[5] = np.random.uniform(0.5, 1.0)   # high negative sentiment
        feats[7] = np.random.uniform(0.0, 0.2)   # low positive sentiment
        feats[8] = np.random.uniform(0.0, 0.2)   # sarcasm kept low (ignored)
        # LLM context
        feats[9] = np.random.uniform(0.7, 1.0)   # high political criticism
        feats[10] = np.random.uniform(0.0, 0.4)  # low/med national criticism

    # 4: Neutral
    else:
        feats[4] = np.random.uniform(0.5, 1.0)   # high neutral similarity
        feats[6] = np.random.uniform(0.5, 1.0)   # high neutral sentiment
        feats[5] = np.random.uniform(0.0, 0.2)   # low negative
        feats[7] = np.random.uniform(0.0, 0.2)   # low positive
        feats[8] = np.random.uniform(0.0, 0.1)   # very low sarcasm
        # LLM context -> all low / balanced
        feats[9] = np.random.uniform(0.0, 0.3)
        feats[10] = np.random.uniform(0.0, 0.3)

    return feats

# ---- Build a balanced synthetic training set ------------------------------
X = []
y = []
SAMPLES_PER_CLASS = 500  # 5 classes * 500 = 2500 rows

for label in range(5):
    for _ in range(SAMPLES_PER_CLASS):
        X.append(generate_sample(label))
        y.append(label)

X = np.array(X)
y = np.array(y)

print(f"Training Logistic Regression on {len(X)} synthetic samples (13 features)...")
# FIX: the `multi_class` parameter is deprecated since scikit-learn 1.5 and
# removed in 1.7 (passing it raises TypeError on current versions). The lbfgs
# solver already uses the multinomial loss for multiclass targets by default,
# so dropping the argument preserves the trained model's behavior.
clf = LogisticRegression(max_iter=1000, solver='lbfgs')
clf.fit(X, y)

# Sanity check only: accuracy on the training set itself (expected near 1.0
# because the classes are generated to be linearly separable).
print(f"Accuracy on Training Set: {clf.score(X, y):.4f}")

print(f"Saving model to {MODEL_PATH}...")
joblib.dump(clf, MODEL_PATH)
print("DONE.")