# sparksession / app.py
# jaothan's picture
# Create app.py
# abb054a verified
import streamlit as st
from pyspark.sql import SparkSession
from huggingface_hub import HfApi, hf_hub_download
import requests
# Build the module-wide Spark session used by the helpers below.
# getOrCreate() hands back an already-running session when one exists
# instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("HuggingFaceSpark")
    .getOrCreate()
)
# Function to read Parquet files from Hugging Face
def read_parquet_from_hf(repo_id, filename, repo_type="dataset"):
    """Download a Parquet file from the Hugging Face Hub and load it with Spark.

    Args:
        repo_id: Hub repository identifier, e.g. ``"user/name"``.
        filename: Path of the Parquet file inside the repository.
        repo_type: Kind of Hub repository to download from. Defaults to
            ``"dataset"``: without an explicit type ``hf_hub_download``
            resolves ``repo_id`` as a model repository, so dataset repos
            cannot be found. Pass ``"model"`` (or ``None``) to get the
            library's default lookup.

    Returns:
        The Spark DataFrame returned by ``spark.read.parquet`` for the
        downloaded file.
    """
    # hf_hub_download returns the local path of the downloaded file.
    local_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type=repo_type,
    )
    return spark.read.parquet(local_path)
# Example use case: Load a dataset from Hugging Face and display it
def load_and_display_dataset():
    """Render the Streamlit page that loads a Parquet file and previews it.

    Shows two text inputs (repo ID and file name) and a "Load Dataset"
    button. When the button is pressed with both inputs filled in, the file
    is loaded through ``read_parquet_from_hf`` and the page displays its
    column names/types followed by the first 10 rows. Any failure while
    loading or rendering is reported on the page with ``st.error`` rather
    than raised.
    """
    st.title("Hugging Face Spark Integration")
    repo_id = st.text_input("Enter Hugging Face Dataset Repo ID (e.g., 'james-burton/wine_reviews'):")
    filename = st.text_input("Enter Dataset File Name (e.g., 'train/0.parquet'):")

    if not st.button("Load Dataset"):
        return

    # Pasted values often carry stray whitespace, which would otherwise be
    # sent verbatim as part of the repo ID / file name.
    repo_id = (repo_id or "").strip()
    filename = (filename or "").strip()
    if not (repo_id and filename):
        st.error("Please enter both the repo ID and filename.")
        return

    try:
        df = read_parquet_from_hf(repo_id, filename)

        st.write("Dataset Schema:")
        # DataFrame.printSchema() prints to stdout and returns None, so
        # passing its result to st.write would only display "None". Build the
        # schema text from df.dtypes (a list of (column, type) pairs) instead.
        st.text("\n".join(f"{name}: {dtype}" for name, dtype in df.dtypes))

        st.write("First 10 rows of the dataset:")
        # DataFrame.show() also returns None; collect a bounded preview to the
        # driver as a pandas DataFrame, which Streamlit can render as a table.
        st.dataframe(df.limit(10).toPandas())
    except Exception as e:
        # UI boundary: surface any download/Spark failure to the user.
        st.error(f"Error loading dataset: {e}")
# Script entry point: build the page only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    load_and_display_dataset()