import os
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import logging
import random
from faker import Faker
from google_play_scraper import app as play_app
import concurrent.futures

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=[logging.StreamHandler()]
)
logger = logging.getLogger(__name__)

fake = Faker()

DATA_DIR = os.getenv("DATA_DIR", "data")
os.makedirs(DATA_DIR, exist_ok=True)


class PremiumDataEngine:
    def __init__(self):
        self.verticals = {
            "fintech": self.generate_fintech_data,
            "ai_talent": self.generate_ai_talent_data,
            "esg": self.generate_esg_data,
            "regulatory": self.generate_regulatory_data,
            "supply_chain": self.generate_supply_chain_data
        }
        # State tracking for continuity
        self.fintech_state = {}

    def generate_date_range(self, days_back=365):
        """Generate a list of dates for backfill."""
        end_date = datetime.now()
        start_date = end_date - timedelta(days=days_back)
        return pd.date_range(start=start_date, end=end_date).tolist()

    # --- 1. FINTECH GROWTH INTELLIGENCE ---
    def generate_fintech_data(self, date_obj):
        """
        Product 1: Fintech Growth Intelligence
        Columns: company, date, download_velocity, review_sentiment, hiring_spike,
                 feature_lead_score, adoption_velocity, churn_risk, funding_signal,
                 cac_proxy, premium_insight, alpha_window_days, smart_money_score,
                 # NEW ML FEATURES
                 download_acceleration, review_sentiment_trend, engineer_hiring_spike,
                 executive_departure_score, recruiting_intensity, burn_rate_proxy,
                 competitor_funding_gap, investor_engagement_score, api_traffic_growth,
                 feature_release_velocity, tech_stack_modernization
        """
        companies = {
            "Revolut": "com.revolut.revolut",
            "Chime": "com.chime.mobile",
            "N26": "de.number26.android",
            "Monzo": "co.uk.getmondo",
            "SoFi": "com.sofi.mobile"
        }
        data = []
        for name, pkg in companies.items():
            # Initialize state if needed
            if name not in self.fintech_state:
                self.fintech_state[name] = {
                    "signal_phase": 0,  # 0 = Quiet, >0 = Active Signal
                    "base_velocity": 75,
                    "sentiment_trend": 4.2,
                    "prev_downloads": 75
                }
            state = self.fintech_state[name]

            # 1. Determine Signal State (The "Smart Money" Logic)
            hiring_spike = "No"
            if state["signal_phase"] > 0:
                state["signal_phase"] -= 1
                if state["signal_phase"] == 12:  # Re-flag the spike two days into the window
                    hiring_spike = "Yes"
            else:
                if random.random() < 0.02:  # ~2% daily chance a new signal window opens
                    state["signal_phase"] = 14
                    hiring_spike = "Yes"

            # 2. Calculate Metrics
            growth_factor = 1.02
            days_passed = (date_obj - datetime(2025, 1, 1)).days
            exponential_boost = state["base_velocity"] * (growth_factor ** max(0, days_passed / 30))

            if state["signal_phase"] > 0:
                signal_maturity = (14 - state["signal_phase"]) / 14
                velocity_boost = 50 * signal_maturity
                smart_money_score = int(85 + (10 * (1 - signal_maturity)) + random.uniform(-2, 2))
                insight = f"Accumulation detected: {state['signal_phase']} days remaining in Alpha Window"
            else:
                velocity_boost = 0
                smart_money_score = int(random.normalvariate(50, 10))
                insight = "Stable accumulation - no institutional anomalies"

            download_velocity = int(np.random.normal(exponential_boost + velocity_boost, 10))

            # Calculate Acceleration
            download_acceleration = download_velocity - state["prev_downloads"]
            state["prev_downloads"] = download_velocity

            # Sentiment drift
            state["sentiment_trend"] += random.uniform(-0.05, 0.05)
            state["sentiment_trend"] = max(3.5, min(4.9, state["sentiment_trend"]))
            review_sentiment = round(state["sentiment_trend"], 1)
            review_sentiment_trend = random.uniform(-0.1, 0.1)  # Slope

            feature_lead = random.randint(60, 95)
            adoption_velocity = int((download_velocity * 0.6) + (feature_lead * 0.4))
            churn_risk = max(1, min(10, int((5.0 - review_sentiment) * 10)))
            funding_signal = "Strong" if hiring_spike == "Yes" else "Moderate" if adoption_velocity > 100 else "Weak"
            cac_proxy = random.randint(35, 85)  # Changed to int for ML
            alpha_window_days = state["signal_phase"] if state["signal_phase"] > 0 else 0

            # NEW ML FEATURES
            engineer_hiring_spike = 1 if hiring_spike == "Yes" else 0
            executive_departure_score = random.randint(0, 100)
            recruiting_intensity = random.uniform(0.5, 5.0)
            burn_rate_proxy = random.uniform(1.0, 10.0)  # $M/month
            competitor_funding_gap = random.randint(0, 365)
            investor_engagement_score = random.randint(0, 100)
            api_traffic_growth = random.uniform(-10, 50)
            feature_release_velocity = random.randint(1, 10)
            tech_stack_modernization = random.choice([0, 1])

            data.append({
                "company": name,
                "date": date_obj.strftime("%Y-%m-%d"),
                "download_velocity": download_velocity,
                "review_sentiment": review_sentiment,
                "hiring_spike": hiring_spike,
                "feature_lead_score": feature_lead,
                "adoption_velocity": adoption_velocity,
                "churn_risk": churn_risk,
                "funding_signal": funding_signal,
                "cac_proxy": cac_proxy,
                "premium_insight": insight,
                "alpha_window_days": alpha_window_days,
                "smart_money_score": smart_money_score,
                # ML Features
                "download_acceleration": download_acceleration,
                "review_sentiment_trend": review_sentiment_trend,
                "engineer_hiring_spike": engineer_hiring_spike,
                "executive_departure_score": executive_departure_score,
                "recruiting_intensity": recruiting_intensity,
                "burn_rate_proxy": burn_rate_proxy,
                "competitor_funding_gap": competitor_funding_gap,
                "investor_engagement_score": investor_engagement_score,
                "api_traffic_growth": api_traffic_growth,
                "feature_release_velocity": feature_release_velocity,
                "tech_stack_modernization": tech_stack_modernization
            })
        return data
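
    # Illustrative sketch, not used by the pipeline: the alpha-window state machine
    # above can be traced without its random noise. On the trigger day signal_phase
    # is 14; on each following day it decrements, velocity_boost grows toward 50 and
    # smart_money_score decays from roughly 95 toward 85 until the window closes.
    @staticmethod
    def _trace_alpha_window(window_length=14):
        """Return (phase, velocity_boost, smart_money_score) per day, noise-free."""
        trace = []
        for phase in range(window_length, 0, -1):
            signal_maturity = (window_length - phase) / window_length
            trace.append((phase, 50 * signal_maturity, 85 + 10 * (1 - signal_maturity)))
        return trace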

    # --- 2. AI TALENT & CAPITAL PREDICTION ---
    def generate_ai_talent_data(self, date_obj):
        """
        Product 2: AI Talent & Capital Prediction
        Columns: company, date, github_stars_7d, arxiv_papers, citations, patents_filed,
                 investor_engagement, funding_probability, technical_momentum, talent_score, premium_insight,
                 innovation_delay_days, benchmark_inflation_pct, flight_status,
                 # ML FEATURES
                 performance_leap_magnitude, commercialization_timeline
        """
        companies = ["OpenAI", "Anthropic", "StabilityAI", "Cohere", "Hugging Face"]
        data = []
        for co in companies:
            # Exponential Interest Curve
            days_passed = (date_obj - datetime(2025, 1, 1)).days
            interest_compound = 1.015 ** max(0, days_passed / 7)  # Weekly compounding
            base_stars = 200
            github_stars = f"+{int(np.random.exponential(base_stars * interest_compound))}"

            arxiv = np.random.poisson(max(0, 2 * (1 + days_passed / 365)))  # Linear growth for papers; guard keeps lam non-negative on early backfill dates
            citations = int(np.random.exponential(50))
            patents = np.random.poisson(0.5)
            investor_engagement = random.choice(["High", "Medium", "Low"])

            # Proprietary Metrics
            tech_momentum = min(100, int((arxiv * 10) + (citations * 0.5) + (int(github_stars.replace('+', '')) / 10)))
            talent_score = random.randint(60, 99)
            funding_prob = f"{min(99, int(tech_momentum * 0.8 + talent_score * 0.1))}%"

            # New Profit Metrics
            innovation_delay_days = random.choice([0, 0, 0, 30, 60, 90, 180])
            benchmark_inflation_pct = random.randint(0, 50)
            flight_status = "On Time" if innovation_delay_days == 0 else "Delayed"
            if tech_momentum > 90:
                flight_status = "Accelerating"

            if "High" in investor_engagement and tech_momentum > 80:
                insight = "Strong Series D candidate - investor engagement at all-time high"
            elif tech_momentum < 40:
                insight = "Momentum slowing - may seek acquisition vs. next round"
            else:
                insight = "Steady technical output, organic growth phase"

            # ML Features
            performance_leap_magnitude = random.uniform(10.0, 50.0)  # % improvement
            commercialization_timeline = random.randint(3, 18)  # months

            data.append({
                "company": co,
                "date": date_obj.strftime("%Y-%m-%d"),
                "github_stars_7d": github_stars,
                "arxiv_papers": arxiv,
                "citations": citations,
                "patents_filed": patents,
                "investor_engagement": investor_engagement,
                "funding_probability": funding_prob,
                "technical_momentum": tech_momentum,
                "talent_score": talent_score,
                "premium_insight": insight,
                "innovation_delay_days": innovation_delay_days,
                "benchmark_inflation_pct": benchmark_inflation_pct,
                "flight_status": flight_status,
                # ML Features
                "performance_leap_magnitude": performance_leap_magnitude,
                "commercialization_timeline": commercialization_timeline
            })
        return data
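
    # Numeric sketch of the interest curve above (illustrative arithmetic only):
    # with days_passed = 70, interest_compound = 1.015 ** 10 ≈ 1.16, so the mean of
    # the exponential draw behind github_stars_7d rises from 200 to roughly 232;
    # after a full year (days_passed = 365) it is about 200 * 1.015 ** 52 ≈ 434.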

    # --- 3. ESG IMPACT & GREENWASHING DETECTOR ---
    def generate_esg_data(self, date_obj):
        """
        Product 3: ESG Impact & Greenwashing Detector
        Columns: company, date, esg_claims, verifiable_actions, greenwashing_index,
                 regulatory_risk, stakeholder_score, impact_verified, premium_insight,
                 claims_psi, reality_psi, greenwashing_gap_pct,
                 # ML FEATURES
                 audit_gap_size, supplier_esg_score, employee_whistleblower_count,
                 carbon_credit_validity_score
        """
        companies = ["Tesla", "ExxonMobil", "Unilever", "BlackRock", "Patagonia"]
        data = []
        for co in companies:
            claims = random.randint(10, 50)
            verified = int(claims * random.uniform(0.2, 0.9))

            # Proprietary Metrics
            greenwashing_index = int((1 - (verified / claims)) * 100)
            reg_risk = "High" if greenwashing_index > 60 else "Medium" if greenwashing_index > 30 else "Low"
            stakeholder_score = random.randint(40, 95)
            impact_verified = f"{int((verified / claims) * 100)}%"

            # New Profit Metrics
            claims_psi = 100
            reality_psi = int((verified / claims) * 100)
            greenwashing_gap_pct = claims_psi - reality_psi

            if greenwashing_index > 70:
                insight = f"High greenwashing risk - {100 - int((verified / claims) * 100)}% of claims lack verification"
            elif stakeholder_score > 85:
                insight = "Strong stakeholder alignment driving brand equity"
            else:
                insight = "Strong on operations but weak on supply chain transparency"

            # ML Features
            audit_gap_size = claims - verified
            supplier_esg_score = random.randint(0, 100)
            employee_whistleblower_count = random.randint(0, 5)
            carbon_credit_validity_score = random.randint(0, 100)

            data.append({
                "company": co,
                "date": date_obj.strftime("%Y-%m-%d"),
                "esg_claims": claims,
                "verifiable_actions": verified,
                "greenwashing_index": greenwashing_index,
                "regulatory_risk": reg_risk,
                "stakeholder_score": stakeholder_score,
                "impact_verified": impact_verified,
                "premium_insight": insight,
                "claims_psi": claims_psi,
                "reality_psi": reality_psi,
                "greenwashing_gap_pct": greenwashing_gap_pct,
                # ML Features
                "audit_gap_size": audit_gap_size,
                "supplier_esg_score": supplier_esg_score,
                "employee_whistleblower_count": employee_whistleblower_count,
                "carbon_credit_validity_score": carbon_credit_validity_score
            })
        return data
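
    # Worked example of the ESG arithmetic above (numbers are illustrative): with
    # claims = 40 and verified = 24, greenwashing_index = (1 - 24/40) * 100 = 40
    # ("Medium" regulatory_risk), impact_verified = "60%", reality_psi = 60,
    # greenwashing_gap_pct = 100 - 60 = 40, and audit_gap_size = 40 - 24 = 16.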

    # --- 4. REGULATORY COMPLIANCE PREDICTION ---
    def generate_regulatory_data(self, date_obj):
        """
        Product 4: Regulatory Compliance Prediction
        Columns: company, date, enforcement_probability, compliance_gap, fines_estimate,
                 remediation_cost, whistleblower_risk, regulatory_foresight, premium_insight,
                 enforcement_probability_pct, fine_impact_usd,
                 # ML FEATURES
                 action_timeline_days
        """
        companies = ["Meta", "Coinbase", "Amazon", "Pfizer", "Goldman Sachs"]
        data = []
        for co in companies:
            enf_prob = random.randint(10, 90)
            gap = "Large" if enf_prob > 70 else "Medium" if enf_prob > 40 else "Small"
            fines = f"${random.randint(10, 5000)}M"
            remediation = f"${random.randint(5, 1000)}M"
            whistleblower = "High" if enf_prob > 60 else "Low"
            foresight = random.randint(20, 90)

            # New Profit Metrics
            enforcement_probability_pct = enf_prob
            fine_impact_usd = random.randint(10, 5000) * 1000000

            if enf_prob > 75:
                insight = "High risk of antitrust action - compliance gaps significant"
            elif foresight > 80:
                insight = "Proactive compliance strategy mitigating sector risks"
            else:
                insight = "Moderate risk - improving compliance but scrutiny remains"

            # ML Features
            action_timeline_days = random.randint(30, 180)

            data.append({
                "company": co,
                "date": date_obj.strftime("%Y-%m-%d"),
                "enforcement_probability": f"{enf_prob}%",
                "compliance_gap": gap,
                "fines_estimate": fines,
                "remediation_cost": remediation,
                "whistleblower_risk": whistleblower,
                "regulatory_foresight": foresight,
                "premium_insight": insight,
                "enforcement_probability_pct": enforcement_probability_pct,
                "fine_impact_usd": fine_impact_usd,
                # ML Features
                "action_timeline_days": action_timeline_days
            })
        return data

    # --- 5. SUPPLY CHAIN RESILIENCE ---
    def generate_supply_chain_data(self, date_obj):
        """
        Product 5: Supply Chain Resilience
        Columns: company, date, disruption_risk, recovery_days, single_point_failure,
                 cost_inflation, resilience_score, premium_insight,
                 disruption_probability, days_to_impact,
                 # ML FEATURES
                 impact_revenue_pct
        """
        companies = ["Apple", "Ford", "Nike", "Toyota", "Samsung"]
        data = []
        for co in companies:
            risk = random.randint(10, 80)
            recovery = int(risk * 0.6)
            failure_pt = "High" if risk > 60 else "Medium" if risk > 30 else "Low"
            inflation = f"{round(random.uniform(1.0, 15.0), 1)}%"
            resilience = 100 - risk

            # New Profit Metrics
            disruption_probability = risk
            days_to_impact = random.randint(5, 60)

            if risk > 60:
                insight = "High battery/chip supply risk - alternative suppliers needed urgently"
            elif resilience > 75:
                insight = "Strong supplier diversification but regional dependency remains"
            else:
                insight = "Stable supply chain with moderate inflationary pressure"

            # ML Features
            impact_revenue_pct = random.uniform(0.5, 5.0)

            data.append({
                "company": co,
                "date": date_obj.strftime("%Y-%m-%d"),
                "disruption_risk": risk,
                "recovery_days": recovery,
                "single_point_failure": failure_pt,
                "cost_inflation": inflation,
                "resilience_score": resilience,
                "premium_insight": insight,
                "disruption_probability": disruption_probability,
                "days_to_impact": days_to_impact,
                # ML Features
                "impact_revenue_pct": impact_revenue_pct
            })
        return data
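
    # Quick numeric check of the relationships above (illustrative values): a company
    # drawn with risk = 70 gets recovery_days = int(70 * 0.6) = 42, resilience_score =
    # 100 - 70 = 30, single_point_failure = "High", and the chip/battery supply insight.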

    def run_pipeline(self):
        """Run the full data pipeline (Backfill + Update)."""
        logger.info("Starting Premium Data Engine Pipeline...")

        # Define file paths
        files = {
            "fintech": "fintech_growth_digest.csv",
            "ai_talent": "ai_talent_heatmap.csv",
            "esg": "esg_sentiment_tracker.csv",
            "regulatory": "regulatory_risk_index.csv",
            "supply_chain": "supply_chain_risk.csv"
        }

        total_added_bytes = 0
        details = {}

        for key, generator in self.verticals.items():
            base_filename = files[key].replace('.csv', '')

            # 1. Generate or Load Full Dataset
            full_df = pd.DataFrame()

            # Check if we have existing data to append to.
            # We'll look for the Yearly file as the "master".
            yearly_path = os.path.join(DATA_DIR, f"{base_filename}_2025_yearly.csv")

            if not os.path.exists(yearly_path):
                logger.info(f"Backfilling {key} (365 days)...")
                dates = self.generate_date_range(365)
                all_data = []
                for d in dates:
                    all_data.extend(generator(d))
                full_df = pd.DataFrame(all_data)
            else:
                logger.info(f"Updating {key} (Daily)...")
                # Load existing
                full_df = pd.read_csv(yearly_path)

                # Generate today's data
                today = datetime.now()
                today_str = today.strftime("%Y-%m-%d")

                # Check if today exists
                if today_str not in full_df['date'].values:
                    today_data = generator(today)
                    new_row = pd.DataFrame(today_data)
                    full_df = pd.concat([full_df, new_row], ignore_index=True)

            # 2. Save Split Files
            # Ensure 'date' is datetime
            full_df['date'] = pd.to_datetime(full_df['date'])

            # Save Yearly (2025)
            df_2025 = full_df[full_df['date'].dt.year == 2025]
            if not df_2025.empty:
                df_2025.to_csv(yearly_path, index=False)
                details[f"{base_filename}_2025_yearly.csv"] = os.path.getsize(yearly_path)

            # Save Quarterlies
            for q in [1, 2, 3, 4]:
                df_q = df_2025[df_2025['date'].dt.quarter == q]
                if not df_q.empty:
                    q_path = os.path.join(DATA_DIR, f"{base_filename}_2025_q{q}.csv")
                    df_q.to_csv(q_path, index=False)
                    details[f"{base_filename}_2025_q{q}.csv"] = os.path.getsize(q_path)

            # Save "Latest" for Preview API (Legacy support)
            # We'll just overwrite the original filename so the API doesn't break immediately
            legacy_path = os.path.join(DATA_DIR, files[key])
            full_df.to_csv(legacy_path, index=False)

        return self.finalize_status()
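
    # Expected on-disk layout per vertical after run_pipeline(), using the fintech
    # vertical as an example and assuming DATA_DIR="data":
    #   data/fintech_growth_digest_2025_yearly.csv          <- full 2025 "master" file
    #   data/fintech_growth_digest_2025_q1.csv ... _q4.csv  <- quarterly splits
    #   data/fintech_growth_digest.csv                      <- legacy file kept for the preview API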

    def finalize_status(self):
        # Calculate total size of data folder
        total_size = sum(
            os.path.getsize(os.path.join(DATA_DIR, f))
            for f in os.listdir(DATA_DIR) if f.endswith('.csv')
        )

        # Save Status
        import json
        status = {
            "last_update": datetime.now().strftime("%Y-%m-%d %H:%M:%S UTC"),
            "total_data_size_bytes": total_size,
            "status": "Premium Data Pipeline Active"
        }
        with open(os.path.join(DATA_DIR, "status.json"), "w") as f:
            json.dump(status, f)
        return status
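

# Illustrative shape of status.json written by finalize_status() (values are
# placeholders only); update_dataset() later adds "total_added_bytes" and "details":
#   {"last_update": "2025-06-01 00:00:00 UTC",
#    "total_data_size_bytes": 1234567,
#    "status": "Premium Data Pipeline Active"}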
def update_dataset():
    engine = PremiumDataEngine()

    # Measure sizes before
    before_sizes = {}
    for f in os.listdir(DATA_DIR):
        if f.endswith(".csv"):
            before_sizes[f] = os.path.getsize(os.path.join(DATA_DIR, f))

    engine.run_pipeline()

    # Measure sizes after
    total_added = 0
    details = {}
    for f in os.listdir(DATA_DIR):
        if f.endswith(".csv"):
            new = os.path.getsize(os.path.join(DATA_DIR, f))
            old = before_sizes.get(f, 0)
            diff = new - old
            if diff > 0:
                total_added += diff
                details[f] = diff

    # Update status with delta
    import json
    status_path = os.path.join(DATA_DIR, "status.json")
    if os.path.exists(status_path):
        with open(status_path, 'r') as f:
            st = json.load(f)
        st['total_added_bytes'] = total_added
        st['details'] = details
        with open(status_path, 'w') as f:
            json.dump(st, f)

    return total_added
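

# Minimal consumer sketch (assumption: run_pipeline() has already written the fintech
# yearly CSV). Shows how downstream code might surface rows that are currently inside
# an active alpha window. Not called anywhere in this module.
def _example_active_alpha_windows(path=None):
    path = path or os.path.join(DATA_DIR, "fintech_growth_digest_2025_yearly.csv")
    df = pd.read_csv(path, parse_dates=["date"])
    active = df[df["alpha_window_days"] > 0]
    return active[["company", "date", "smart_money_score", "premium_insight"]]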


if __name__ == "__main__":
    update_dataset()