Spaces:

Quivara
/

alisto-project

Running

App Files Files Community

alisto-project / alisto_project /backend /train_ensemble.py

Quivara

Fresh upload with LFS

bdb271a 2 days ago

raw

history blame contribute delete

2.39 kB

	import pandas as pd
	import pickle
	import os
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.svm import LinearSVC
	from sklearn.calibration import CalibratedClassifierCV
	from sklearn.pipeline import make_pipeline
	from sklearn.model_selection import train_test_split
	from sklearn.metrics import accuracy_score, classification_report

	# Filesystem locations, all anchored at the directory containing this script
	# so the script behaves the same regardless of the current working directory.
	BASE_DIR = os.path.abspath(os.path.dirname(__file__))
	MODEL_DIR = os.path.join(BASE_DIR, 'models')
	DATA_PATH = os.path.join(BASE_DIR, '../data/reddit_disaster_posts.csv')

	# --- 1. CUSTOM TAGLISH STOP WORDS ---
	# High-frequency Tagalog and English function words. Dropping them keeps the
	# classifier from latching onto common grammar words instead of content words.
	# This is kept as a plain list because it is passed directly as the
	# vectorizer's `stop_words` argument.
	TAGLISH_STOP_WORDS = [
	    # Tagalog particles, pronouns and fillers
	    'ang', 'mga', 'ng', 'sa', 'na', 'ko', 'mo', 'ba', 'ka', 'yung',
	    'ni', 'no', 'at', 'o', 'kay', 'to', 'po', 'pa', 'din', 'rin',
	    'naman', 'nyo', 'nila', 'namin', 'kasi', 'kame', 'kami', 'tayo',
	    'sana', 'lang', 'talaga', 'di', 'eh', 'oh', 'ah', 'yun', 'yan',
	    # English function words ('to' is already listed above, so it is not
	    # repeated here)
	    'the', 'is', 'a', 'an', 'and', 'or', 'of', 'in', 'on', 'for',
	]

	def train_tfidf():
	    """Train the TF-IDF + calibrated LinearSVC text classifier and pickle it.

	    Reads the CSV at ``DATA_PATH`` (the ``text`` and ``label`` columns are
	    used), holds out 20% of the rows for evaluation, fits the pipeline on the
	    remaining rows, prints the accuracy and a classification report, and
	    writes the fitted pipeline to ``MODEL_DIR/tfidf_ensemble.pkl``.

	    Returns:
	        None. If the CSV does not exist, an error message is printed and
	        nothing is trained or saved.
	    """
	    print(f"Loading data from: {DATA_PATH}")
	    if not os.path.exists(DATA_PATH):
	        print("Error: CSV not found.")
	        return

	    df = pd.read_csv(DATA_PATH)
	    # Fill missing text BEFORE casting to str: casting first typically turns
	    # NaN into the literal string 'nan', which fillna('') can no longer catch.
	    df['text'] = df['text'].fillna('').astype(str)
	    # Rows without a label cannot be used for supervised training.
	    df = df.dropna(subset=['label'])

	    X = df['text']
	    y = df['label'].astype(int)

	    print(f"Training TF-IDF + SVM Ensemble on {len(df)} posts...")

	    # Fixed random_state keeps the train/test split reproducible across runs.
	    X_train, X_test, y_train, y_test = train_test_split(
	        X, y, test_size=0.2, random_state=42
	    )

	    # --- 2. THE UPGRADED PIPELINE ---
	    # Word 1- to 3-gram TF-IDF features feed a class-balanced linear SVM;
	    # the SVM is wrapped in sigmoid calibration so the saved model exposes
	    # probability estimates rather than only hard decisions.
	    model = make_pipeline(
	        TfidfVectorizer(
	            stop_words=TAGLISH_STOP_WORDS,
	            max_features=5000,
	            ngram_range=(1, 3),
	            sublinear_tf=True,
	        ),
	        CalibratedClassifierCV(
	            LinearSVC(dual=False, class_weight='balanced'),
	            method='sigmoid',
	        ),
	    )

	    model.fit(X_train, y_train)

	    # Evaluate on the held-out 20%
	    y_pred = model.predict(X_test)
	    acc = accuracy_score(y_test, y_pred)
	    print(f"✅ TF-IDF Model Accuracy: {acc * 100:.2f}%")
	    print("Classification Report:")
	    print(classification_report(y_test, y_pred))

	    # Save. Create the output directory first so a fresh checkout without a
	    # models/ folder does not fail with FileNotFoundError.
	    os.makedirs(MODEL_DIR, exist_ok=True)
	    save_path = os.path.join(MODEL_DIR, 'tfidf_ensemble.pkl')
	    with open(save_path, 'wb') as f:
	        pickle.dump(model, f)

	    print(f"Model saved to: {save_path}")

	# Run training only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    train_tfidf()