# social-assistent / model-prep.py
# (Hugging Face upload metadata: odeyaaa, "Upload 10 files", commit cf49347 verified)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import os
import torch
import google.generativeai as genai
from faker import Faker
from datetime import datetime, timedelta
from sklearn.metrics.pairwise import cosine_similarity
import pickle
from dotenv import load_dotenv
# Load environment variables from the .env file
load_dotenv()
# Machine Learning Imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, f1_score
from sklearn.decomposition import PCA
from sentence_transformers import SentenceTransformer
# ---------------------------------------------------------
# 0. SETUP & CONFIGURATION
# ---------------------------------------------------------
# Silence noisy library warnings and show every DataFrame column when printing.
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)

# OPTIMIZATION: Check for Apple Silicon (MPS); fall back to CPU otherwise.
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"🚀 Optimization: Running on {device.upper()} device")

# BUGFIX: exist_ok=True replaces the racy exists()-then-makedirs() pattern
# (the directory could appear between the check and the creation).
os.makedirs('project_plots', exist_ok=True)
# ---------------------------------------------------------
# 1. DATA GENERATION (With 2025 Trends)
# ---------------------------------------------------------
def generate_enhanced_data(n_rows=10000):
    """Generate a synthetic dataset of TikTok-style videos with 2025 trends.

    Args:
        n_rows: number of synthetic rows to generate.

    Returns:
        (df, threshold): DataFrame sorted by upload date with added
        'is_viral_binary' (top-20% views) and 'log_views' columns, plus the
        80th-percentile view-count threshold used for the viral label.
    """
    print(f"\n[1/8] Generating {n_rows} rows of Real-World 2025 Data...")
    fake = Faker()
    trends = [
        'Delulu', 'Girl Dinner', 'Roman Empire', 'Silent Slay', 'Soft Life',
        'Grimace Shake', 'Wes Anderson Style', 'Beige Flag', 'Canon Event',
        'NPC Stream', 'Skibidi', 'Fanum Tax', 'Yapping', 'Glow Up', 'Fit Check'
    ]
    formats = [
        'POV: You realize...', 'GRWM for...', 'Day in the life:',
        'Storytime:', 'Trying the viral...', 'ASMR packing orders',
        'Rating my exes...', 'Turn the lights off challenge'
    ]
    categories = ['Gaming', 'Beauty', 'Comedy', 'Edutainment', 'Lifestyle', 'Food']
    data = []
    start_date = datetime(2024, 1, 1)
    for _ in range(n_rows):
        # BUGFIX: np.random.randint excludes the high bound, so the original
        # (0, 365)/(0, 23) could never produce Dec 31 (2024 is a leap year with
        # 366 days) or an upload at 23:00 — the hour matters for the
        # prime-time (hour >= 18) multiplier below.
        upload_time = start_date + timedelta(days=int(np.random.randint(0, 366)),
                                             hours=int(np.random.randint(0, 24)))
        trend = np.random.choice(trends)
        fmt = np.random.choice(formats)
        cat = np.random.choice(categories)
        description = f"{fmt} {trend} edition! {fake.sentence(nb_words=6)}"
        tags = ['#fyp', '#foryou', '#viral', f'#{trend.replace(" ", "").lower()}', f'#{cat.lower()}']
        if np.random.random() > 0.5:
            tags.append('#trending2025')
        full_text = f"{description} {' '.join(tags)}"
        # Meta Features
        duration = np.random.randint(5, 180)
        hour = upload_time.hour
        is_weekend = 1 if upload_time.weekday() >= 5 else 0
        # View Count Logic: heavy-tailed base popularity times heuristic boosts.
        base_virality = np.random.lognormal(mean=9.5, sigma=1.8)
        multiplier = 1.0
        if is_weekend:
            multiplier *= 1.2
        if duration < 15:
            multiplier *= 1.4   # short-form bonus
        if "Delulu" in full_text or "POV" in full_text:
            multiplier *= 1.6   # hot trend/format bonus
        if hour >= 18:
            multiplier *= 1.1   # evening prime-time bonus
        views = int(base_virality * multiplier)
        data.append({
            'upload_date': upload_time,
            'description': full_text,
            'category': cat,
            'video_duration_sec': duration,
            'hour_of_day': hour,
            'is_weekend': is_weekend,
            'hashtag_count': len(tags),
            'views': views
        })
    df = pd.DataFrame(data)
    # Chronological order matters: the downstream train/test split is time-based.
    df = df.sort_values('upload_date').reset_index(drop=True)
    # Top 20% of view counts are labelled viral; log1p tames the heavy tail.
    threshold = df['views'].quantile(0.80)
    df['is_viral_binary'] = (df['views'] > threshold).astype(int)
    df['log_views'] = np.log1p(df['views'])
    return df, threshold
# ---------------------------------------------------------
# 2. EDA & PREPROCESSING
# ---------------------------------------------------------
def process_data_pipeline(df):
    """Save a quick EDA plot, vectorise text with TF-IDF and split 80/20 by time.

    Returns (X_train, X_test, y_train, y_test, y_bin_test, fitted_tfidf).
    """
    print("\n[2/8] Processing Data Pipeline...")
    # Quick EDA artefact: log-views distribution for positive-duration rows.
    positive_duration = df[df['video_duration_sec'] > 0].copy()
    plt.figure(figsize=(6, 4))
    sns.histplot(positive_duration['log_views'], color='teal')
    plt.title('Log Views Distribution')
    plt.savefig('project_plots/eda_distribution.png')
    plt.close()
    # Text features (TF-IDF) concatenated with the numeric meta features.
    tfidf = TfidfVectorizer(max_features=2000, stop_words='english')
    text_matrix = tfidf.fit_transform(df['description']).toarray()
    meta_cols = ['video_duration_sec', 'hour_of_day', 'is_weekend', 'hashtag_count']
    features = np.hstack((text_matrix, df[meta_cols].values))
    log_views = df['log_views'].values
    viral_flags = df['is_viral_binary'].values
    # Chronological split: df arrives sorted by upload_date, so the last 20%
    # acts as the "future" hold-out set.
    cutoff = int(len(df) * 0.80)
    return (features[:cutoff], features[cutoff:],
            log_views[:cutoff], log_views[cutoff:],
            viral_flags[cutoff:], tfidf)
# ---------------------------------------------------------
# 3. TRAINING
# ---------------------------------------------------------
def train_best_model(X_train, y_train, X_test, y_test):
    """Fit an XGBoost regressor on log-views and report hold-out RMSE."""
    print("\n[3/8] Training Model (XGBoost)...")
    regressor = XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=6, n_jobs=-1)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))
    print(f" - Model RMSE: {rmse:.3f}")
    return regressor
# ---------------------------------------------------------
# 4. EMBEDDINGS GENERATION (For Search)
# ---------------------------------------------------------
def create_search_index(df):
    """Embed all descriptions and persist the DataFrame as a parquet knowledge base.

    Returns the augmented DataFrame (new 'embedding' column) and the encoder.
    """
    print("\n[4/8] Creating Vector Search Index...")
    # Embed the ENTIRE history so searches can match any past video.
    encoder = SentenceTransformer('all-MiniLM-L6-v2', device=device)
    vectors = encoder.encode(df['description'].tolist(), convert_to_numpy=True, show_progress_bar=True)
    df['embedding'] = list(vectors)
    # Persist as the app's knowledge base.
    save_path = 'tiktok_knowledge_base.parquet'
    df.to_parquet(save_path)
    print(f" - Knowledge Base saved to {save_path}")
    return df, encoder
# ---------------------------------------------------------
# 5. RETRIEVAL & IMPROVEMENT ENGINE (The Magic Step)
# ---------------------------------------------------------
def _predict_views(text, model, vectorizer):
    """Predict an absolute view count for a description.

    Assumes default meta features for the prediction: 15s video, 6 PM upload,
    weekday; hashtag count is taken from the text itself. The regressor
    predicts log1p(views), so the result is mapped back with expm1.
    """
    text_vec = vectorizer.transform([text]).toarray()
    meta_vec = np.array([[15, 18, 0, text.count('#')]])
    feat_vec = np.hstack((text_vec, meta_vec))
    return int(np.expm1(model.predict(feat_vec)[0]))


def _find_similar_hits(user_input, knowledge_df, st_model, k=3):
    """Return the k descriptions most similar to user_input among the
    high-performing (top-25%-by-views) videos in the knowledge base."""
    high_performance_df = knowledge_df[knowledge_df['views'] > knowledge_df['views'].quantile(0.75)].copy()
    user_embedding = st_model.encode([user_input], convert_to_numpy=True)
    # Stack the per-row embedding arrays into a single matrix for similarity.
    target_embeddings = np.stack(high_performance_df['embedding'].values)
    similarities = cosine_similarity(user_embedding, target_embeddings)
    top_indices = similarities[0].argsort()[-k:][::-1]
    return high_performance_df.iloc[top_indices]['description'].tolist()


def optimize_content_with_gemini(user_input, model, vectorizer, knowledge_df, st_model):
    """
    1. Scores original idea.
    2. Finds top 3 similar VIRAL videos.
    3. Asks Gemini to rewrite the idea.
    4. Re-scores the new idea.

    Args:
        user_input: draft video description typed by the user.
        model: trained regressor predicting log1p(views).
        vectorizer: fitted TfidfVectorizer matching the model's text features.
        knowledge_df: DataFrame with 'description', 'views', 'embedding' columns.
        st_model: SentenceTransformer used for similarity search.
    """
    print("\n" + "="*50)
    print("πŸš€ VIRAL OPTIMIZATION ENGINE")
    print("="*50)

    # --- STEP 1: INITIAL SCORE ---
    initial_views = _predict_views(user_input, model, vectorizer)
    print(f"\nπŸ“ ORIGINAL IDEA: {user_input}")
    print(f"πŸ“Š Predicted Views: {initial_views:,}")

    # --- STEP 2: VECTOR SEARCH (Find similar successful videos) ---
    print("\nπŸ” Searching for similar viral hits in Parquet file...")
    top_3_videos = _find_similar_hits(user_input, knowledge_df, st_model, k=3)
    print(" -> Found 3 similar viral videos to learn from:")
    for i, vid in enumerate(top_3_videos, 1):
        print(f" {i}. {vid[:80]}...")

    # --- STEP 3: GEMINI OPTIMIZATION ---
    api_key = os.getenv("GEMINI_API_KEY")
    if not api_key:
        print("\n⚠️ SKIPPING AI REWRITE: No 'GEMINI_API_KEY' found in environment variables.")
        print(" (Set it via 'export GEMINI_API_KEY=your_key' in terminal)")
        return
    print("\nπŸ€– Sending context to Gemini LLM for optimization...")
    genai.configure(api_key=api_key)
    llm = genai.GenerativeModel('gemini-2.5-flash-lite')
    prompt = f"""
You are a TikTok Virality Expert.
My Draft Description: "{user_input}"
Here are 3 successful, viral videos that are similar to my topic:
1. {top_3_videos[0]}
2. {top_3_videos[1]}
3. {top_3_videos[2]}
Task: Rewrite my draft description to make it go viral.
Use the slang, hashtag style, and structure of the successful examples provided.
Keep it under 20 words plus hashtags. Return ONLY the new description.
"""
    try:
        response = llm.generate_content(prompt)
        improved_idea = response.text.strip()
        print(f"\n✨ IMPROVED IDEA (By Gemini): {improved_idea}")

        # --- STEP 4: RE-EVALUATION ---
        new_views = _predict_views(improved_idea, model, vectorizer)
        print(f"πŸ“Š New Predicted Views: {new_views:,}")
        # BUGFIX: guard against ZeroDivisionError when the original idea
        # scored 0 predicted views (expm1 of a near-zero prediction).
        improvement = ((new_views - initial_views) / max(initial_views, 1)) * 100
        if improvement > 0:
            print(f"πŸš€ POTENTIAL UPLIFT: +{improvement:.1f}%")
        else:
            print(f"😐 No significant uplift predicted (Model is strict!).")
    except Exception as e:
        # Best-effort: network/quota/safety-filter errors are reported, not raised.
        print(f"❌ Error calling Gemini API: {e}")
# ---------------------------------------------------------
# MAIN EXECUTION
# ---------------------------------------------------------
if __name__ == "__main__":
    # 1. Pipeline: synthesize the dataset, then build TF-IDF + meta features
    #    with a chronological 80/20 split.
    df, _ = generate_enhanced_data(10000)
    X_train, X_test, y_train, y_test, _, tfidf = process_data_pipeline(df)
    # 2. Train Prediction Model (XGBoost regressor on log-views).
    best_model = train_best_model(X_train, y_train, X_test, y_test)
    # 3. Create Knowledge Base (sentence embeddings, saved to parquet).
    knowledge_df, st_model = create_search_index(df)
    # 4. Save Artifacts for App: model as JSON, TF-IDF vectorizer pickled.
    print("\n[5/8] Saving Model Artifacts for Production...")
    best_model.save_model("viral_model.json")
    print(" - Model saved to 'viral_model.json'")
    with open("tfidf_vectorizer.pkl", "wb") as f:
        pickle.dump(tfidf, f)
    print(" - Vectorizer saved to 'tfidf_vectorizer.pkl'")
    # 5. User Interaction Loop: score each idea, retrieve similar viral hits,
    #    and ask Gemini for a rewrite, until the user enters 'q'.
    while True:
        print("\n" + "-"*30)
        user_input = input("Enter your video idea (or 'q' to quit): ")
        if user_input.lower() == 'q':
            break
        optimize_content_with_gemini(
            user_input=user_input,
            model=best_model,
            vectorizer=tfidf,
            knowledge_df=knowledge_df,
            st_model=st_model
        )