import os
import tempfile

import pandas as pd
import streamlit as st
from scrapy.crawler import CrawlerRunner
from scrapy.utils.log import configure_logging
from twisted.internet import defer, reactor

# FIXME(review): `IGDBSpider` is a project-local spider class, not part of the
# `scrapy` package itself — this import will fail as written. Point it at the
# project's spiders module, e.g. `from <project>.spiders import IGDBSpider`.
from scrapy import IGDBSpider


# Cache the result so Streamlit's script re-runs do not try to start the
# Twisted reactor again: a reactor can only be run once per process, and a
# second `reactor.run()` raises ReactorNotRestartable. Within a session the
# cached temp-file path is returned instead of re-crawling.
@st.cache_data
def run_scrapy_spider():
    """Run the IGDB spider once and export its items to a temporary CSV.

    Returns:
        str: Path of the temporary CSV file the spider's feed was written to.
    """
    # Route Scrapy's logging through standard logging instead of letting it
    # install its own root handlers.
    configure_logging()

    # delete=False so the file outlives the `with` block; only the path is
    # needed — Scrapy's feed exporter reopens it for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".csv") as temp_file:
        temp_file_path = temp_file.name

    # CrawlerRunner (unlike CrawlerProcess) does not manage signals itself,
    # which matters because Streamlit executes this script off the main
    # thread. `FEEDS` replaces the deprecated FEED_FORMAT/FEED_URI settings
    # (Scrapy >= 2.1).
    runner = CrawlerRunner(settings={
        'FEEDS': {temp_file_path: {'format': 'csv'}},
    })

    @defer.inlineCallbacks
    def crawl():
        # Stop the reactor when the crawl finishes so reactor.run() returns.
        yield runner.crawl(IGDBSpider)
        reactor.stop()

    # Schedule the crawl, then block until it completes.
    # installSignalHandlers=False is required: signal handlers can only be
    # installed from the main thread, and Streamlit scripts run in a worker
    # thread — the default reactor.run() would raise ValueError here.
    reactor.callWhenRunning(crawl)
    reactor.run(installSignalHandlers=False)

    return temp_file_path  # Return the temporary file path


def load_data(file_path):
    """Load the scraped CSV into a DataFrame.

    Args:
        file_path: Path to the CSV file produced by the spider's feed export.

    Returns:
        pandas.DataFrame with the scraped rows, or None when the file is
        missing, empty, or contains no parseable data (e.g. the spider
        yielded no items).
    """
    if not (os.path.exists(file_path) and os.path.getsize(file_path) > 0):
        return None
    try:
        return pd.read_csv(file_path)
    except pd.errors.EmptyDataError:
        # A crawl with zero items can leave a file with no header row.
        return None


# --- Streamlit page layout (re-executed top-to-bottom on every interaction) ---
st.title("B2B Game Marketplace - Recently Released Games Scraping")
st.write("""
This application scrapes recently released games from IGDB and converts the data into a CSV dataset for the B2B game marketplace.
""")

if st.button('Run Scraping'):
    with st.spinner('Scraping recently released games...'):
        file_path = run_scrapy_spider()
    st.success('Scraping completed!')

    # Display scraped game data
    data = load_data(file_path)
    if data is not None and not data.empty:
        st.write("### Scraped Game Data", data.head())

        # Convert the full dataset (not just the preview head) for download.
        csv = data.to_csv(index=False)
        st.download_button(
            label="Download Game Data as CSV",
            data=csv,
            file_name='recent_games.csv',
            mime='text/csv',
        )
    else:
        st.info('No data available. Please run the scraping again.')
else:
    st.info('Please click the button to start scraping.')