"""Streamlit page that loads a Parquet file from the Hugging Face Hub into Spark."""

import requests
import streamlit as st
from huggingface_hub import HfApi, hf_hub_download
from pyspark.sql import SparkSession

# Initialize a Spark session
spark = SparkSession.builder.appName("HuggingFaceSpark").getOrCreate()


# Function to read Parquet files from Hugging Face
def read_parquet_from_hf(repo_id, filename, repo_type=None):
    """Download one file from the Hugging Face Hub and read it as Parquet.

    Args:
        repo_id: Hub repository identifier, e.g. ``"user/name"``.
        filename: Path of the Parquet file inside the repository.
        repo_type: Forwarded to ``hf_hub_download``. ``None`` keeps that
            function's own default; pass ``"dataset"`` for dataset repos.

    Returns:
        The Spark DataFrame produced by ``spark.read.parquet`` on the
        downloaded file path.
    """
    file_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
    )
    return spark.read.parquet(file_path)


def _format_schema(df):
    """Return the DataFrame's top-level fields as one line of text per field."""
    return "\n".join(
        f"{field.name}: {field.dataType.simpleString()} "
        f"(nullable = {str(field.nullable).lower()})"
        for field in df.schema.fields
    )


# Example use case: Load a dataset from Hugging Face and display it
def load_and_display_dataset():
    """Render the page: ask for a repo ID and file name, then show the data.

    On a button click with both inputs filled in, the file is loaded and its
    schema and first 10 rows are displayed. Any failure while loading or
    rendering is reported on the page via ``st.error``.
    """
    st.title("Hugging Face Spark Integration")

    repo_id = st.text_input(
        "Enter Hugging Face Dataset Repo ID (e.g., 'james-burton/wine_reviews'):"
    )
    filename = st.text_input("Enter Dataset File Name (e.g., 'train/0.parquet'):")

    if not st.button("Load Dataset"):
        return

    if not (repo_id and filename):
        st.error("Please enter both the repo ID and filename.")
        return

    # UI boundary: surface any failure to the user instead of crashing the page.
    try:
        # The form asks for a dataset repo, so the Hub must be told to look
        # in the dataset namespace rather than its default repo type.
        df = read_parquet_from_hf(repo_id, filename, repo_type="dataset")

        st.write("Dataset Schema:")
        # printSchema() prints to stdout and returns None, so build the text
        # from the schema object instead.
        st.text(_format_schema(df))

        st.write("First 10 rows of the dataset:")
        # show() also returns None; collect a bounded sample to the driver
        # and hand Streamlit a pandas DataFrame it can render.
        st.dataframe(df.limit(10).toPandas())
    except Exception as e:
        st.error(f"Error loading dataset: {e}")


if __name__ == "__main__":
    load_and_display_dataset()