# 5abaya / main.py
# briskwave's picture
# Upload 2 files
# 85873eb verified
import asyncio
import pandas as pd
import os
from src.scraping.scraper import AmazonScraper
from src.analysis.data_processor import DataProcessor
from src.analysis.nlp_models import NLPModels
from src.analysis.scoring_engine import ScoringEngine
from src.dashboard.app import demo # Import the Gradio app
async def run_analysis(url):
    """Scrape a product listing, analyse it, persist the results, then launch the dashboard.

    The pipeline runs these stages in order:

    1. Make sure the ``data`` directory exists.
    2. Scrape ``url`` with ``AmazonScraper.scrape_product_listing``; if the
       scraper returns a falsy result, print a notice and return early.
    3. Load ``data/product_data.json`` through ``DataProcessor``, clean it and
       save it as ``data/processed_product_data.csv``.
       (Presumably the JSON file is what the scraper wrote into ``output_dir``
       -- confirm against ``AmazonScraper``.)
    4. Add ``sentiment_score`` and ``keyphrases`` columns computed from each
       row's ``title``.
    5. Pass a copy of the frame to ``ScoringEngine.calculate_virality_score``
       and save the result as ``data/final_product_insights.csv``.
    6. Call ``demo.launch()`` to start the Gradio app.

    Args:
        url: Listing URL handed to the scraper unchanged.

    Returns:
        None.
    """
    # The scraper and the processor both work inside this directory.
    os.makedirs("data", exist_ok=True)

    # --- Stage 1: scraping -------------------------------------------------
    print("Starting scraping...")
    scraped = await AmazonScraper().scrape_product_listing(url, output_dir="data")
    if not scraped:
        print("No products scraped. Exiting.")
        return

    # --- Stage 2: cleaning -------------------------------------------------
    print("Processing data...")
    processor = DataProcessor()
    raw_frame = processor.load_raw_data("data/product_data.json")
    frame = processor.clean_and_structure(raw_frame)
    processor.save_processed_data(frame, "data/processed_product_data.csv")

    # --- Stage 3: NLP features and scoring ---------------------------------
    print("Applying NLP analysis and scoring...")
    nlp = NLPModels()
    engine = ScoringEngine(nlp)

    def title_sentiment(title):
        # Only the "score" entry of the sentiment result is kept.
        return nlp.get_sentiment(title)["score"]

    def title_keyphrases(title):
        return nlp.extract_keyphrases(title)

    # Simplified: only the product title is analysed, one row at a time.
    frame["sentiment_score"] = frame["title"].apply(title_sentiment)
    frame["keyphrases"] = frame["title"].apply(title_keyphrases)

    # The engine receives a copy, so `frame` itself is not handed over.
    scored = engine.calculate_virality_score(frame.copy())

    processor.save_processed_data(scored, "data/final_product_insights.csv")
    print("Analysis complete. Data saved to data/final_product_insights.csv")

    # --- Stage 4: dashboard ------------------------------------------------
    print("Launching dashboard...")
    # The Gradio app is expected to read the CSV files written above
    # (data/processed_product_data.csv or data/final_product_insights.csv);
    # here it is simply launched as-is.
    demo.launch()
if __name__ == "__main__":
    # Sample Amazon search-results URL used when the script is run directly.
    # A real application should take this from user input instead.
    listing_url = "https://www.amazon.com/s?bbn=12035955011&i=fashion-novelty&oq=Solid%20colors%3A%20100%25%2BCotton%3B%20Heather%20Grey%3A%2090%25%2BCotton%2C%2010%25%2BPolyester%3B%20All%20Other%20Heathers%3A%2050%25%2BCotton%2C%2050%25%2BPolyester%20Lightweight%2C%20Classic%20fit%2C%20Double-needle%20sleeve%20and%20bottom%20hem%20Machine%20wash%20cold%20with%20like%20colors%2C%20dry%20low%20heat%20-long%20-premium%20-sweatshirt%20-v-neck%20-tank%2010%20x%208%20x%201%20inches%3B%204.8%20Ounces&qid=1699392328&ref=glow_cls&refresh=1&rh=p_6%3AATVPDKIKX0DER&s=date-desc-rank"
    # Drive the async pipeline to completion on a fresh event loop.
    asyncio.run(run_analysis(listing_url))