Spaces:
Sleeping
Sleeping
#!/usr/bin/env python3
"""
AIRBNB PRICING & GUEST SATISFACTION OPTIMIZER
AI for Big Data Management - Group Project
============================================
This script performs the full analysis pipeline:
1. Data generation (synthetic listings, reviews and bookings that mirror
   the Inside Airbnb structure)
2. Qualitative analysis (VADER sentiment)
3. Quantitative analysis (Random Forest classification + ARIMA forecasting)
4. Visualization outputs
"""
| import pandas as pd | |
| import numpy as np | |
| import matplotlib | |
| matplotlib.use('Agg') | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.metrics import classification_report, confusion_matrix | |
| from sklearn.preprocessing import LabelEncoder | |
| from statsmodels.tsa.arima.model import ARIMA | |
| import warnings | |
import os

# Suppress every warning so the console report stays readable.
# NOTE(review): this is a blanket filter - it also hides anything the
# modelling libraries used later might warn about; narrow it if that matters.
warnings.filterwarnings('ignore')

# Seed NumPy's global RNG so the synthetic datasets are reproducible.
np.random.seed(42)

# Folder that receives every CSV, chart and report written by this script.
# Defaults to the original "/content/outputs" (presumably a Colab path - TODO
# confirm); set the AIRBNB_OUTPUT_DIR environment variable to write elsewhere.
# `or` also covers the variable being set to an empty string.
OUTPUT_DIR = os.environ.get("AIRBNB_OUTPUT_DIR") or "/content/outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)
print("=" * 60)
print("PHASE 1: DATA GENERATION (Real-world structure + Synthetic)")
print("=" * 60)

# ─── Generate realistic listings data (mirrors Inside Airbnb structure) ───
n_listings = 500
neighbourhoods = ['Le Marais', 'Montmartre', 'Latin Quarter', 'Bastille',
                  'Belleville', 'Oberkampf', 'Saint-Germain', 'Pigalle',
                  'Batignolles', 'Menilmontant', 'Republique', 'Nation']
room_types = ['Entire home/apt', 'Private room', 'Shared room']
room_weights = [0.55, 0.38, 0.07]

# Draw each listing's neighbourhood once and reuse it for both the
# `neighbourhood` column and the listing name. Previously the name picked its
# own random neighbourhood, so a listing stored under 'Bastille' could be
# titled "... in Montmartre".
# NOTE: this changes the order of random draws, so seeded values differ from
# runs of the earlier version.
listing_neighbourhoods = np.random.choice(neighbourhoods, n_listings)
listing_kinds = np.random.choice(['Studio', 'Apt', 'Loft', 'Room', 'Flat'], n_listings)

listings = pd.DataFrame({
    'listing_id': range(1, n_listings + 1),
    'name': [
        f"Charming {kind} in {hood}"
        for kind, hood in zip(listing_kinds, listing_neighbourhoods)
    ],
    'neighbourhood': listing_neighbourhoods,
    'room_type': np.random.choice(room_types, n_listings, p=room_weights),
    'accommodates': np.random.choice([1, 2, 3, 4, 5, 6], n_listings,
                                     p=[0.1, 0.3, 0.25, 0.2, 0.1, 0.05]),
    'bedrooms': np.random.choice([0, 1, 2, 3], n_listings, p=[0.15, 0.5, 0.25, 0.1]),
    'minimum_nights': np.random.choice([1, 2, 3, 5, 7, 30], n_listings,
                                       p=[0.3, 0.25, 0.2, 0.1, 0.1, 0.05]),
    'number_of_reviews': np.random.poisson(40, n_listings),
    'reviews_per_month': np.round(np.random.exponential(2.5, n_listings), 2),
    'host_is_superhost': np.random.choice([0, 1], n_listings, p=[0.7, 0.3]),
    'instant_bookable': np.random.choice([0, 1], n_listings, p=[0.4, 0.6]),
})

# Price = base price of the room type x 1.3 premium for selected
# neighbourhoods x a uniform random factor in [0.6, 1.6), computed for all
# rows at once instead of one Python call per row.
base_prices = {'Entire home/apt': 120, 'Private room': 55, 'Shared room': 25}
premium_neighbourhoods = ['Le Marais', 'Saint-Germain', 'Latin Quarter', 'Montmartre']
location_factor = np.where(listings['neighbourhood'].isin(premium_neighbourhoods), 1.3, 1.0)
listings['price'] = (
    listings['room_type'].map(base_prices)
    * location_factor
    * np.random.uniform(0.6, 1.6, n_listings)
).round(2)

# Review scores: superhosts are drawn from N(4.7, 0.2), everyone else from
# N(4.3, 0.4); the result is clipped to the 3.0-5.0 range.
superhost_scores = np.random.normal(4.7, 0.2, n_listings)
regular_scores = np.random.normal(4.3, 0.4, n_listings)
listings['review_scores_rating'] = np.clip(
    np.where(listings['host_is_superhost'] == 1, superhost_scores, regular_scores),
    3.0, 5.0
).round(2)

print(f"Generated {len(listings)} listings across {len(neighbourhoods)} neighbourhoods")
print(f"Room type distribution:\n{listings['room_type'].value_counts().to_string()}")
# ─── Generate realistic reviews ───
review_templates_positive = [
    "Amazing location, very clean and the host was super responsive!",
    "Perfect apartment for our stay. Walking distance to everything.",
    "Loved the cozy atmosphere. Would definitely come back!",
    "Great value for money. The neighborhood is lovely and quiet.",
    "Exceeded expectations! Beautiful decor and comfortable bed.",
    "Host was incredibly helpful with restaurant recommendations.",
    "Spotless apartment with a wonderful view. Highly recommend!",
    "Best Airbnb experience we've had. Smooth check-in process.",
    "Charming place in a fantastic location. Five stars!",
    "Everything was perfect from start to finish. Thank you!",
    "The apartment was exactly as described, very well maintained.",
    "Wonderful stay, the kitchen was fully equipped and very handy.",
]
review_templates_neutral = [
    "Decent place, a bit noisy at night but overall okay.",
    "Good location but the apartment was smaller than expected.",
    "It was fine for the price. Nothing special but clean enough.",
    "Average stay. Check-in was smooth but wifi was slow.",
    "The place served its purpose. Wouldn't say it was amazing though.",
    "Okay for a short stay. The bathroom could use some updating.",
]
review_templates_negative = [
    "Disappointed. The photos were misleading and it was dirty.",
    "Terrible experience. Host was unresponsive and place was filthy.",
    "Would not recommend. Noisy neighbors and broken appliances.",
    "Not worth the price at all. Bed was uncomfortable.",
    "Very poorly maintained. Found bugs in the kitchen area.",
    "Host cancelled last minute. Terrible communication throughout.",
]

n_reviews = 5000
review_listing_ids = np.random.choice(listings['listing_id'], n_reviews)

# Build the id -> rating lookup once. The loop used to run a boolean-mask scan
# over the whole listings frame for every single review.
rating_by_listing = dict(zip(listings['listing_id'], listings['review_scores_rating']))
templates_by_category = {
    'positive': review_templates_positive,
    'neutral': review_templates_neutral,
    'negative': review_templates_negative,
}
first_review_day = pd.Timestamp('2023-01-01')

# Bias reviews based on listing rating: the higher the rating, the more likely
# a positive template is picked.
# The three random draws per review (category, text, day offset) stay in their
# original order so the seeded output does not change.
reviews_list = []
for lid in review_listing_ids:
    rating = rating_by_listing[lid]
    if rating >= 4.5:
        probs = [0.75, 0.2, 0.05]
    elif rating >= 4.0:
        probs = [0.5, 0.35, 0.15]
    else:
        probs = [0.25, 0.35, 0.4]
    category = np.random.choice(['positive', 'neutral', 'negative'], p=probs)
    text = np.random.choice(templates_by_category[category])
    reviews_list.append({
        'listing_id': lid,
        # Random day within the 730 days starting 2023-01-01.
        'date': first_review_day + pd.Timedelta(days=int(np.random.uniform(0, 730))),
        'comments': text
    })

reviews = pd.DataFrame(reviews_list)
print(f"Generated {len(reviews)} reviews")
# ─── Generate synthetic bookings ───
n_bookings = 3000
guest_types = ['Solo', 'Couple', 'Family', 'Business']

# Random columns are drawn one after another, in the same order in which they
# appear in the frame, so the seeded RNG stream is consumed identically.
booked_listing_ids = np.random.choice(listings['listing_id'], n_bookings)
stay_lengths = np.random.choice(
    [1, 2, 3, 4, 5, 7, 14], n_bookings, p=[0.15, 0.2, 0.25, 0.15, 0.1, 0.1, 0.05]
)
booking_guest_types = np.random.choice(guest_types, n_bookings, p=[0.2, 0.35, 0.25, 0.2])
cancellation_flags = np.random.choice([0, 1], n_bookings, p=[0.85, 0.15])
raw_satisfaction = np.random.normal(7.5, 1.5, n_bookings)

bookings = pd.DataFrame({
    'booking_id': range(1, n_bookings + 1),
    'listing_id': booked_listing_ids,
    # One booking every four hours starting 2023-01-01.
    'booking_date': pd.date_range('2023-01-01', periods=n_bookings, freq='4h'),
    'length_of_stay': stay_lengths,
    'guest_type': booking_guest_types,
    'cancellation': cancellation_flags,
})
# Satisfaction is normally distributed noise limited to the 1-10 scale.
bookings['satisfaction_score'] = np.clip(raw_satisfaction, 1, 10).round(1)
print(f"Generated {len(bookings)} synthetic bookings")

# ─── Save raw datasets ───
raw_exports = (
    (listings, "listings_clean.csv"),
    (reviews, "reviews_clean.csv"),
    (bookings, "bookings_synthetic.csv"),
)
for frame, filename in raw_exports:
    frame.to_csv(f"{OUTPUT_DIR}/{filename}", index=False)
print("Datasets saved.\n")
# ==============================================================
print("=" * 60)
print("PHASE 2: QUALITATIVE ANALYSIS — VADER Sentiment")
print("=" * 60)

analyzer = SentimentIntensityAnalyzer()

# Score every review text and keep VADER's 'compound' value.
reviews['sentiment_compound'] = reviews['comments'].apply(
    lambda text: analyzer.polarity_scores(str(text))['compound']
)

# Bucket the compound score: >= 0.05 is Positive, <= -0.05 is Negative,
# anything in between is Neutral.
compound = reviews['sentiment_compound']
reviews['sentiment_label'] = np.select(
    [compound >= 0.05, compound <= -0.05],
    ['Positive', 'Negative'],
    default='Neutral',
)

print("\nSentiment Distribution:")
print(reviews['sentiment_label'].value_counts().to_string())

# Aggregate sentiment per listing: mean score, number of reviews, and the
# share of reviews labelled Positive / Negative.
listing_sentiment = reviews.groupby('listing_id').agg(
    avg_sentiment=('sentiment_compound', 'mean'),
    review_count=('sentiment_compound', 'count'),
    pct_positive=('sentiment_label', lambda labels: (labels == 'Positive').mean()),
    pct_negative=('sentiment_label', lambda labels: (labels == 'Negative').mean()),
).reset_index()

# Merge sentiment into listings. The left join keeps listings that received no
# review; only their avg_sentiment is filled (with 0), the other aggregate
# columns stay NaN for such listings.
listings = listings.merge(listing_sentiment, on='listing_id', how='left')
listings['avg_sentiment'] = listings['avg_sentiment'].fillna(0)
# ─── CHART 1: Sentiment by Neighbourhood ───
# Horizontal bars of the mean listing sentiment per neighbourhood, lowest
# first, with a dashed line at the mean of those neighbourhood means.
fig, ax = plt.subplots(figsize=(12, 6))
neighbourhood_sentiment = (
    listings.groupby('neighbourhood')['avg_sentiment']
    .mean()
    .sort_values(ascending=True)
)

# Traffic-light colouring: red below 0.2, orange below 0.4, green otherwise.
colors = []
for score in neighbourhood_sentiment:
    if score < 0.2:
        colors.append('#e74c3c')
    elif score < 0.4:
        colors.append('#f39c12')
    else:
        colors.append('#27ae60')

neighbourhood_sentiment.plot.barh(ax=ax, color=colors, edgecolor='white', linewidth=0.5)
ax.set_xlabel('Average Sentiment Score', fontsize=12)
ax.set_ylabel('')
ax.set_title('Average Guest Sentiment by Neighbourhood', fontsize=14, fontweight='bold')

city_average = neighbourhood_sentiment.mean()
ax.axvline(x=city_average, color='#2c3e50', linestyle='--', alpha=0.7, label='City Average')
ax.legend()

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/chart1_sentiment_by_neighbourhood.png", dpi=150, bbox_inches='tight')
plt.close()
print("Chart 1 saved: Sentiment by Neighbourhood")
# ─── CHART 2: Price vs Sentiment Scatter ───
# One point per listing: nightly price against average review sentiment,
# coloured by the listing's review score.
fig, ax = plt.subplots(figsize=(10, 6))
scatter = ax.scatter(
    listings['price'], listings['avg_sentiment'],
    c=listings['review_scores_rating'], cmap='RdYlGn',
    alpha=0.6, s=40, edgecolors='gray', linewidth=0.3,
)
# Attach the colour bar to this figure/axes explicitly instead of relying on
# pyplot's "current axes" state.
fig.colorbar(scatter, ax=ax, label='Review Score Rating')
ax.set_xlabel('Price (€/night)', fontsize=12)  # euro sign restored from mis-encoded text
ax.set_ylabel('Average Sentiment Score', fontsize=12)
ax.set_title('Price vs. Guest Sentiment (colored by rating)', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/chart2_price_vs_sentiment.png", dpi=150, bbox_inches='tight')
plt.close()
print("Chart 2 saved: Price vs Sentiment")
# ─── CHART 3: Sentiment Distribution ───
# Bar chart of how many reviews fall into each sentiment label.
fig, ax = plt.subplots(figsize=(10, 5))
sentiment_counts = reviews['sentiment_label'].value_counts()

# value_counts() orders labels by frequency, so colours must be looked up by
# label. A fixed positional list would paint whichever label happens to be
# second-most frequent orange, even if that label is 'Negative'.
sentiment_palette = {'Positive': '#27ae60', 'Neutral': '#f39c12', 'Negative': '#e74c3c'}
bar_colors = [sentiment_palette[label] for label in sentiment_counts.index]

sentiment_counts.plot(kind='bar', ax=ax, color=bar_colors, edgecolor='white', linewidth=1.5)
ax.set_ylabel('Number of Reviews', fontsize=12)
ax.set_title('Overall Review Sentiment Distribution', fontsize=14, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

# Annotate each bar with its count and share of all reviews.
total_reviews = len(reviews)
for position, count in enumerate(sentiment_counts):
    ax.text(position, count + 30, f'{count} ({count/total_reviews*100:.1f}%)',
            ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/chart3_sentiment_distribution.png", dpi=150, bbox_inches='tight')
plt.close()
print("Chart 3 saved: Sentiment Distribution")
# ─── CHART 4: Superhost vs Non-Superhost Sentiment ───
# Mean listing sentiment for regular hosts (flag 0) and superhosts (flag 1).
fig, ax = plt.subplots(figsize=(8, 5))

# Labels and colours are looked up by flag value. Assigning a two-item index
# positionally would raise if only one host class were present in the data.
host_labels = {0: 'Regular Host', 1: 'Superhost'}
host_colors = {'Regular Host': '#3498db', 'Superhost': '#e67e22'}
superhost_data = (
    listings.groupby('host_is_superhost')['avg_sentiment']
    .mean()
    .rename(index=host_labels)
)
bar_colors = [host_colors.get(label, '#3498db') for label in superhost_data.index]

superhost_data.plot(kind='bar', ax=ax, color=bar_colors, edgecolor='white', linewidth=1.5)
ax.set_ylabel('Average Sentiment Score', fontsize=12)
ax.set_title('Superhost vs Regular Host: Guest Sentiment', fontsize=14, fontweight='bold')
ax.set_xticklabels(ax.get_xticklabels(), rotation=0)

# Print the exact mean just above each bar.
for position, mean_sentiment in enumerate(superhost_data):
    ax.text(position, mean_sentiment + 0.005, f'{mean_sentiment:.3f}',
            ha='center', fontweight='bold')

plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/chart4_superhost_sentiment.png", dpi=150, bbox_inches='tight')
plt.close()
print("Chart 4 saved: Superhost vs Regular Sentiment\n")
# ==============================================================
print("=" * 60)
print("PHASE 3A: QUANTITATIVE ANALYSIS — Random Forest Classification")
print("=" * 60)

# Create target variable: a listing is a HighPerformer when BOTH its review
# score and its reviews-per-month are at or above the respective median.
median_rating = listings['review_scores_rating'].median()
median_reviews = listings['reviews_per_month'].median()
listings['HighPerformer'] = (
    (listings['review_scores_rating'] >= median_rating)
    & (listings['reviews_per_month'] >= median_reviews)
).astype(int)

is_high = listings['HighPerformer']
print("\nTarget variable distribution:")
print(f" High Performers: {is_high.sum()} ({is_high.mean()*100:.1f}%)")
print(f" Low Performers: {(1-is_high).sum()} ({(1-is_high).mean()*100:.1f}%)")

# Prepare features. Each categorical column gets its own encoder so both
# fitted label mappings remain available afterwards (a single shared encoder
# is overwritten by the second fit).
room_type_encoder = LabelEncoder()
neighbourhood_encoder = LabelEncoder()
listings['room_type_encoded'] = room_type_encoder.fit_transform(listings['room_type'])
listings['neighbourhood_encoded'] = neighbourhood_encoder.fit_transform(listings['neighbourhood'])
# Backward-compatible alias: `le` used to end up fitted on the neighbourhoods.
le = neighbourhood_encoder

feature_cols = ['price', 'accommodates', 'bedrooms', 'minimum_nights',
                'number_of_reviews', 'host_is_superhost', 'instant_bookable',
                'avg_sentiment', 'room_type_encoded', 'neighbourhood_encoded']
X = listings[feature_cols].fillna(0)
y = listings['HighPerformer']

# Requiring two ">= median" conditions makes high performers the minority
# class, so the split is stratified to keep the class ratio equal in the
# training and test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

rf = RandomForestClassifier(n_estimators=100, random_state=42, max_depth=10)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

print("\nClassification Report:")
report = classification_report(y_test, y_pred, target_names=['Low Performer', 'High Performer'])
print(report)

# Save classification report to file (explicit encoding so the result does
# not depend on the platform default).
with open(f"{OUTPUT_DIR}/classification_report.txt", 'w', encoding='utf-8') as report_file:
    report_file.write("RANDOM FOREST CLASSIFICATION REPORT\n")
    report_file.write("=" * 50 + "\n")
    report_file.write(f"Training set: {len(X_train)} listings\n")
    report_file.write(f"Test set: {len(X_test)} listings\n\n")
    report_file.write(report)
# ─── CHART 5: Feature Importance ───
# Pair each feature name with the importance reported by the fitted forest
# and sort ascending so the most important feature ends up as the top bar.
importances = pd.Series(
    rf.feature_importances_, index=feature_cols
).sort_values(ascending=True)

fig, ax = plt.subplots(figsize=(10, 6))
importances.plot.barh(ax=ax, color='#3498db', edgecolor='white', linewidth=0.5)
ax.set_xlabel('Feature Importance', fontsize=12)
ax.set_title(
    'Random Forest: Feature Importance for Listing Performance',
    fontsize=14,
    fontweight='bold',
)
plt.tight_layout()
plt.savefig(
    f"{OUTPUT_DIR}/chart5_feature_importance.png",
    dpi=150,
    bbox_inches='tight',
)
plt.close()
print("Chart 5 saved: Feature Importance")
# ─── CHART 6: Confusion Matrix ───
# Heatmap of actual (rows) versus predicted (columns) classes on the test set.
class_names = ['Low Performer', 'High Performer']
cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(7, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    ax=ax,
    xticklabels=class_names,
    yticklabels=class_names,
)
ax.set_xlabel('Predicted', fontsize=12)
ax.set_ylabel('Actual', fontsize=12)
ax.set_title('Confusion Matrix', fontsize=14, fontweight='bold')
plt.tight_layout()
plt.savefig(
    f"{OUTPUT_DIR}/chart6_confusion_matrix.png",
    dpi=150,
    bbox_inches='tight',
)
plt.close()
print("Chart 6 saved: Confusion Matrix\n")
# ==============================================================
print("=" * 60)
print("PHASE 3B: QUANTITATIVE ANALYSIS — ARIMA Forecasting")
print("=" * 60)

# Give every listing a random creation date within 730 days of 2023-01-01.
# NOTE(review): the forecasting below never reads this column; the series are
# simulated from each neighbourhood's mean price instead.
listings['month_created'] = pd.to_datetime('2023-01-01') + pd.to_timedelta(
    np.random.randint(0, 730, len(listings)), unit='D'
)

# Generate monthly time series per neighbourhood (simulate 24 months) and
# forecast the following 6 months.
months = pd.date_range('2023-01-01', periods=24, freq='MS')
forecast_index = pd.date_range(months[-1] + pd.DateOffset(months=1), periods=6, freq='MS')
top_neighbourhoods = listings['neighbourhood'].value_counts().head(5).index.tolist()

# squeeze=False always returns a 2-D array of axes, so indexing also works
# when there is only a single neighbourhood to plot.
fig, axes = plt.subplots(len(top_neighbourhoods), 1,
                         figsize=(12, 4 * len(top_neighbourhoods)), squeeze=False)
axes = axes.ravel()

forecast_results = {}
for idx, neighbourhood in enumerate(top_neighbourhoods):
    # Simulated history: mean listing price + linear trend up to +15%
    # + two sine cycles of +/-10% + Gaussian noise (sd = 3% of the mean).
    base = listings[listings['neighbourhood'] == neighbourhood]['price'].mean()
    trend = np.linspace(0, base * 0.15, 24)
    seasonality = base * 0.1 * np.sin(np.linspace(0, 4 * np.pi, 24))
    noise = np.random.normal(0, base * 0.03, 24)
    series = pd.Series(base + trend + seasonality + noise, index=months)

    # Fit ARIMA(1,1,1). Only the model calls sit inside the try, so a plotting
    # error is no longer reported as an ARIMA failure.
    try:
        fitted = ARIMA(series, order=(1, 1, 1)).fit()
        prediction = fitted.get_forecast(steps=6)
        forecast_values = np.asarray(prediction.predicted_mean)
        # The model's own 95% interval replaces the former hand-made +/-10%
        # band that was labelled as a confidence band.
        interval = np.asarray(prediction.conf_int(alpha=0.05))
    except Exception as e:
        # Best effort: report the failure and leave this panel empty.
        print(f" {neighbourhood}: ARIMA failed - {e}")
        continue

    lower_bound, upper_bound = interval[:, 0], interval[:, 1]
    forecast_results[neighbourhood] = {
        'historical': series,
        'forecast': pd.Series(forecast_values, index=forecast_index),
        'base_price': base
    }

    # Plot
    ax = axes[idx]
    ax.plot(series.index, series.values, 'b-o', markersize=3,
            label='Historical', linewidth=1.5)
    ax.plot(forecast_index, forecast_values, 'r--o', markersize=3,
            label='Forecast (6 months)', linewidth=1.5)
    ax.fill_between(forecast_index, lower_bound, upper_bound,
                    alpha=0.2, color='red', label='95% confidence interval')
    ax.set_title(f'{neighbourhood} — Average Price Forecast', fontsize=12, fontweight='bold')
    ax.set_ylabel('Price (€)')
    ax.legend(loc='upper left', fontsize=8)
    ax.grid(True, alpha=0.3)
    print(f" {neighbourhood}: Current avg €{base:.0f} → Forecasted €{forecast_values[-1]:.0f} (6mo)")

plt.suptitle('ARIMA(1,1,1) Price Forecasting by Neighbourhood',
             fontsize=14, fontweight='bold', y=1.01)
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/chart7_arima_forecasts.png", dpi=150, bbox_inches='tight')
plt.close()
print("Chart 7 saved: ARIMA Forecasts\n")
# ─── CHART 8: Price distribution by room type ───
# One box per room type. The explicit positions [1, 2, 3] passed before are
# the default slots for three boxes, so they are dropped together with the
# unused `room_type_order` list.
fig, ax = plt.subplots(figsize=(10, 6))
listings.boxplot(column='price', by='room_type', ax=ax)
ax.set_title('Price Distribution by Room Type', fontsize=14, fontweight='bold')
ax.set_xlabel('Room Type', fontsize=12)
ax.set_ylabel('Price (€/night)', fontsize=12)  # euro sign restored from mis-encoded text
# Blank out the figure-level title so only the axes title set above remains.
plt.suptitle('')
plt.tight_layout()
plt.savefig(f"{OUTPUT_DIR}/chart8_price_by_room_type.png", dpi=150, bbox_inches='tight')
plt.close()
print("Chart 8 saved: Price by Room Type")
# ─── CHART 9: Booking patterns (synthetic data analysis) ───
# Two panels side by side: share of bookings per guest type (pie) and mean
# satisfaction per guest type (horizontal bars, lowest first).
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
pie_ax, bar_ax = axes

# Guest type distribution
guest_counts = bookings['guest_type'].value_counts()
pie_ax.pie(
    guest_counts,
    labels=guest_counts.index,
    autopct='%1.1f%%',
    colors=['#3498db', '#e67e22', '#27ae60', '#9b59b6'],
    startangle=90,
)
pie_ax.set_title('Booking Distribution by Guest Type', fontsize=12, fontweight='bold')

# Satisfaction by guest type
mean_satisfaction = (
    bookings.groupby('guest_type')['satisfaction_score'].mean().sort_values()
)
mean_satisfaction.plot.barh(ax=bar_ax, color='#3498db', edgecolor='white')
bar_ax.set_xlabel('Average Satisfaction Score (1-10)')
bar_ax.set_title('Satisfaction Score by Guest Type', fontsize=12, fontweight='bold')

plt.tight_layout()
plt.savefig(
    f"{OUTPUT_DIR}/chart9_booking_patterns.png",
    dpi=150,
    bbox_inches='tight',
)
plt.close()
print("Chart 9 saved: Booking Patterns\n")
# ==============================================================
print("=" * 60)
print("KEY FINDINGS SUMMARY")
print("=" * 60)

# Top features: `importances` is sorted ascending, so the last three entries
# are the strongest; they are printed strongest-first.
top_features = importances.tail(3).index.tolist()
print("\n1. Top 3 predictive features for listing performance:")
for rank, feature in enumerate(reversed(top_features), start=1):
    print(f" {rank}. {feature} (importance: {importances[feature]:.3f})")

# Best/worst neighbourhoods by mean listing sentiment.
best_hood = neighbourhood_sentiment.idxmax()
worst_hood = neighbourhood_sentiment.idxmin()
print("\n2. Neighbourhood insights:")
print(f" Highest sentiment: {best_hood} ({neighbourhood_sentiment.max():.3f})")
print(f" Lowest sentiment: {worst_hood} ({neighbourhood_sentiment.min():.3f})")

# Superhost effect
sh_sent = listings[listings['host_is_superhost'] == 1]['avg_sentiment'].mean()
nsh_sent = listings[listings['host_is_superhost'] == 0]['avg_sentiment'].mean()
print("\n3. Superhost effect:")
print(f" Superhost avg sentiment: {sh_sent:.3f}")
print(f" Regular host avg sentiment: {nsh_sent:.3f}")
# The '+' format flag prints the real sign; a hard-coded "+" rendered a
# negative difference as "+-0.012".
print(f" Difference: {sh_sent - nsh_sent:+.3f} for superhosts")

# Sentiment breakdown
pos_pct = (reviews['sentiment_label'] == 'Positive').mean() * 100
neg_pct = (reviews['sentiment_label'] == 'Negative').mean() * 100
print("\n4. Review sentiment breakdown:")
print(f" Positive: {pos_pct:.1f}%")
print(f" Negative: {neg_pct:.1f}%")

# Forecast: compare the last simulated month with the last forecast month.
print("\n5. Price forecast highlights (next 6 months):")
for hood, data in forecast_results.items():
    last_hist = data['historical'].iloc[-1]
    last_fore = data['forecast'].iloc[-1]
    change = ((last_fore - last_hist) / last_hist) * 100
    print(f" {hood}: €{last_hist:.0f} → €{last_fore:.0f} ({change:+.1f}%)")

# Save master dataset
listings.to_csv(f"{OUTPUT_DIR}/master_listings_analyzed.csv", index=False)
reviews.to_csv(f"{OUTPUT_DIR}/reviews_with_sentiment.csv", index=False)
print(f"\nAll outputs saved to {OUTPUT_DIR}/")
print("=" * 60)
print("ANALYSIS COMPLETE")
print("=" * 60)